In [None]:
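# Uncomment when running on Google Colab: the data-loading cell reads its CSVs from /content/drive.
# from google.colab import drive
# drive.mount("/content/drive")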

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
# Number of contiguous row slices evaluateModel cuts the training frame into.
NUM_FOLDS = 10
# preprocessing treats a column with exactly this many distinct values as
# categorical and leaves it unscaled; every other feature is standardised.
CATEGORICAL_FEATURE_COUNT = 2

In [None]:
# Read the training split first, then the testing split. `.iloc[:, 1:]` discards
# the first CSV column (presumably an index written when the file was saved -- TODO confirm).
multiclass_training_df, multiclass_testing_df = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        '/content/drive/My Drive/B Tech Project/DataSet 2019/multiclass_training_data.csv',
        '/content/drive/My Drive/B Tech Project/DataSet 2019/multiclass_testing_data.csv',
    )
)

In [None]:
def preprocessing(training_data, testing_data, CATEGORICAL_FEATURE_COUNT=CATEGORICAL_FEATURE_COUNT):
    """Drop constant columns and standardise the continuous features of both splits.

    Parameters
    ----------
    training_data, testing_data : pandas.DataFrame
        Frames sharing the same columns, including a 'Label' column.
    CATEGORICAL_FEATURE_COUNT : int
        A column whose number of distinct training values equals this is
        treated as categorical and is not scaled.

    Returns
    -------
    (training_data, testing_data) : tuple of pandas.DataFrame
        New frames with the constant columns removed and the continuous
        columns scaled by a StandardScaler fitted on the training split only.

    Raises
    ------
    KeyError
        If a column that is constant in the training split is missing from
        the testing split.
    """
    # A column with a single distinct value in the training split carries no signal.
    constant_columns = [
        column_name
        for column_name in training_data.columns
        if training_data[column_name].nunique() == 1
    ]

    # Non-inplace drops: the frames passed in keep their columns, and a KeyError
    # on the testing split can no longer leave the training split half-modified.
    training_data = training_data.drop(columns=constant_columns)
    testing_data = testing_data.drop(columns=constant_columns)

    # Every remaining feature (not the target) that is not categorical is continuous.
    continuous = [
        column_name
        for column_name in training_data.columns
        if column_name != 'Label'
        and training_data[column_name].nunique() != CATEGORICAL_FEATURE_COUNT
    ]

    # StandardScaler cannot be fitted on zero columns; skip scaling in that case.
    if continuous:
        # Fit on the training split only so no testing statistics leak into the scaling.
        scaler = StandardScaler()
        scaler.fit(training_data[continuous])
        training_data[continuous] = scaler.transform(training_data[continuous])
        testing_data[continuous] = scaler.transform(testing_data[continuous])

    return training_data, testing_data

In [None]:
# Merge original classes 3 and 4 into class 3 and move class 5 down to 4,
# leaving five contiguous labels (0-4).
# NOTE: this cell is not idempotent -- running it a second time maps the new
# label 4 to 3 again. Reload the CSVs before re-running it.
mapping = {0:0, 1:1, 2:2, 3:3, 4:3, 5:4}
for labelled_df in (multiclass_testing_df, multiclass_training_df):
    labelled_df["Label"] = labelled_df["Label"].apply(lambda raw_label: mapping[raw_label])

In [None]:
# Remove constant columns and standardise the continuous features of both splits.
multiclass_training_df, multiclass_testing_df = preprocessing(
    training_data=multiclass_training_df,
    testing_data=multiclass_testing_df,
)

In [None]:
def evaluateModel(model, train, test=None,flag=False):
    """Fit `model` on NUM_FOLDS leave-one-slice-out subsets of `train` and average the scores.

    The rows of `train` are cut into NUM_FOLDS contiguous slices (no shuffling).
    For each slice the model is refitted on the remaining rows and scored either
    on that slice (`flag=False`) or on the separate `test` frame (`flag=True`).

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
    train : pandas.DataFrame
        Features plus a 'Label' column.
    test : pandas.DataFrame, optional
        Hold-out frame with the same columns; required when `flag` is True.
    flag : bool
        True -> score every fold's model on `test`; False -> score on the
        held-out slice of `train`.

    Returns
    -------
    confusionMatrix : list of list of float
        Confusion matrix averaged over the folds and rounded to 2 decimals,
        one row per class label (sorted), rows = true class.
    cumulative_macro : dict
        Fold-averaged "macro avg" entry of the classification report.
    cumulative_benign : dict
        Fold-averaged classification-report entry for class "0".

    Raises
    ------
    ValueError
        If `flag` is True but no `test` frame was supplied.
    """
    if flag and test is None:
        raise ValueError("evaluateModel: `test` must be provided when flag=True")

    X, y = train.drop(columns=['Label']), train['Label']
    n_rows = len(X)

    if flag:
        # The hold-out split is identical for every fold, so build it once.
        X_holdout, y_holdout = test.drop(columns=['Label']), test['Label']
        labels = np.unique(np.concatenate([y.to_numpy(), y_holdout.to_numpy()]))
    else:
        labels = np.unique(y.to_numpy())

    # A fixed label set keeps every fold's confusion matrix the same shape, so the
    # per-fold matrices can be summed cell by cell even when a fold lacks a class.
    matrix_sum = np.zeros((len(labels), len(labels)), dtype=float)
    scores_macro = []
    scores_benign = []

    for j in range(NUM_FOLDS):
        start = j * n_rows // NUM_FOLDS
        stop = (j + 1) * n_rows // NUM_FOLDS

        # Train on everything outside the j-th contiguous slice.
        X_train = pd.concat([X[:start], X[stop:]])
        y_train = pd.concat([y[:start], y[stop:]])
        if flag:
            X_test, y_test = X_holdout, y_holdout
        else:
            X_test, y_test = X[start:stop], y[start:stop]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        matrix_sum += confusion_matrix(y_test, y_pred, labels=labels)
        # Build the report once per fold and read both entries from it.
        report = classification_report(y_test, y_pred, labels=labels, output_dict=True)
        scores_macro.append(report["macro avg"])
        scores_benign.append(report["0"])

    metrics = ("precision", "recall", "f1-score", "support")
    cumulative_benign = {
        metric: sum(fold[metric] for fold in scores_benign) / NUM_FOLDS
        for metric in metrics
    }
    cumulative_macro = {
        metric: sum(fold[metric] for fold in scores_macro) / NUM_FOLDS
        for metric in metrics
    }

    # Plain Python floats, one inner list per true class.
    confusionMatrix = np.round(matrix_sum / NUM_FOLDS, 2).tolist()

    return confusionMatrix, cumulative_macro, cumulative_benign

In [None]:
# Gaussian Naive Bayes: refit on each training fold, scored on the hold-out test frame.
mymodel = GaussianNB()
cf_matrix, macro_score, benign_score = evaluateModel(
    model=mymodel,
    train=multiclass_training_df,
    test=multiclass_testing_df,
    flag=True,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Fold-averaged macro scores as a one-column table, then the averaged confusion matrix row by row.
macro_table = pd.DataFrame(macro_score.values(), index=macro_score.keys(), columns=["Naive Bayes"])
print(macro_table)
print("\nConfusion Matrix")
print("\n".join(map(str, cf_matrix)))

             Naive Bayes
precision       0.851789
recall          0.788767
f1-score        0.731538
support    159949.000000

Confusion Matrix
[8780.1, 0.0, 3.9, 0.0, 4.0]
[6.0, 18884.6, 160.4, 0.0, 1.0]
[49.7, 2345.7, 55235.6, 0.0, 0.0]
[57.3, 0.0, 36352.9, 0.8, 4.0]
[180.0, 0.0, 8.0, 0.0, 37875.0]


In [None]:
# Random forest: refit on each training fold, scored on the hold-out test frame.
# The forest is stochastic, so a fixed seed keeps the reported scores reproducible on re-run.
mymodel=RandomForestClassifier(random_state=42)
cf_matrix, macro_score, benign_score=evaluateModel(model=mymodel, train=multiclass_training_df, test=multiclass_testing_df, flag=True)

In [None]:
# Fold-averaged macro scores as a one-column table, then the averaged confusion matrix row by row.
macro_table = pd.DataFrame(macro_score.values(), index=macro_score.keys(), columns=["Random Forest"])
print(macro_table)
print("\nConfusion Matrix")
print("\n".join(map(str, cf_matrix)))

           Random Forest
precision       0.985806
recall          0.992913
f1-score        0.989210
support    159949.000000

Confusion Matrix
[8780.0, 0.0, 0.0, 3.8, 4.2]
[2.0, 18844.0, 205.0, 1.0, 0.0]
[7.0, 1236.0, 56385.0, 3.0, 0.0]
[2.5, 0.0, 17.2, 36388.0, 7.3]
[29.4, 0.0, 16.0, 2.0, 38015.6]


In [None]:
# Linear SVC (primal formulation, tolerance 0.001): refit on each training fold,
# scored on the hold-out test frame.
mymodel = LinearSVC(dual=False, tol=0.001)
cf_matrix, macro_score, benign_score = evaluateModel(
    model=mymodel,
    train=multiclass_training_df,
    test=multiclass_testing_df,
    flag=True,
)

In [None]:
# Fold-averaged macro scores as a one-column table, then the averaged confusion matrix row by row.
macro_table = pd.DataFrame(macro_score.values(), index=macro_score.keys(), columns=["Linear SVC"])
print(macro_table)
print("\nConfusion Matrix")
print("\n".join(map(str, cf_matrix)))

              Linear SVC
precision       0.981501
recall          0.990411
f1-score        0.985681
support    159949.000000

Confusion Matrix
[8761.3, 0.0, 0.0, 14.3, 12.4]
[0.0, 18818.8, 230.2, 2.0, 1.0]
[2.0, 1657.4, 55963.6, 8.0, 0.0]
[6.5, 0.6, 74.6, 36326.9, 6.4]
[32.0, 0.0, 18.0, 0.0, 38013.0]
