In [None]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, mutual_info_score
from sklearn.feature_selection import SelectFromModel

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Notebook-wide settings
RANDOM_STATE = 42  # seed handed to relevance_report for the train/test split
CATEGORICAL_FEATURE_COUNT = 2  # columns with at most this many distinct values count as categorical
CORELATION_FILTER_NUM_ITERATIONS = 1  # passes of the correlation-filter loop

# Correlation cut-offs, one per filter type
SPEARMAN_THRESHOLD = 0.9
PEARSON_THRESHOLD = 0.9
CHI_THRESHOLD = 0.9
TEST_SPLIT_PROPORTION = 0.8  # forwarded as test_size when scoring feature relevance

NUM_FOLDS = 10  # NOTE(review): not referenced in the cells visible here

In [None]:
# Load the preprocessed multi-class train/test CSVs, discarding each file's first column
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        "./Improved Datasets/multiclass_training_data.csv",
        "./Improved Datasets/multiclass_testing_data.csv",
    )
)

In [None]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,protocol_HOPOPT,protocol_TCP,protocol_UDP,Label
0,1,2,0,832,0,416,416,416.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,2
1,50,2,0,1234,0,617,617,617.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,2
2,1,2,0,494,0,247,247,247.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,3
3,1,2,0,12,0,6,6,6.0,0.0,0,...,0.0,0.0,0,0,0,1,0,1,0,4
4,1,2,0,12,0,6,6,6.0,0.0,0,...,0.0,0.0,0,0,0,1,0,1,0,4


In [None]:
# Drop every training column that holds a single distinct value
constant_columns = [name for name in train.columns if train[name].nunique() == 1]
train.drop(columns=constant_columns, inplace=True)

In [None]:
# Bucket every non-label column by cardinality: at most
# CATEGORICAL_FEATURE_COUNT distinct values -> categorical, otherwise continuous
categorical = []
continuous = []
for column_name in train.columns:
    if column_name == "Label":
        continue
    is_categorical = train[column_name].nunique() <= CATEGORICAL_FEATURE_COUNT
    (categorical if is_categorical else continuous).append(column_name)

In [None]:
# Standardise the continuous columns; the scaler is fitted on the training
# frame only and then applied to both frames
scaler = StandardScaler().fit(train[continuous])
for frame in (train, test):
    frame[continuous] = scaler.transform(frame[continuous])

In [None]:
def ContinuousVsContinuous(df, features, method, threshold=0.9):
    '''
    Greedy correlation filter for continuous features.

    Walks `features` in list order. For every feature still present, each
    other remaining feature whose absolute correlation with it is strictly
    greater than `threshold` is removed. The outcome depends only on the
    order of `features`; no random numbers are drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `features`.
    features : list
        Candidate column names. The list is modified in place.
    method : str
        Correlation method forwarded to `DataFrame.corr`
        (the notebook passes "spearman" and "pearson").
    threshold : float, default 0.9
        Absolute-correlation cut-off.

    Returns
    -------
    list
        The same `features` list object, reduced to the surviving names.
    '''
    if not features:
        return features

    # Correlating only the candidate columns gives the same pairwise values as
    # correlating the whole frame and slicing, but skips unrelated columns.
    corrData = df[features].corr(method=method).abs()

    i = 0
    while i < len(features):
        # Pin the reference feature by name so removals cannot change what
        # the comparisons below refer to.
        anchor = features[i]
        scores = corrData[anchor]
        redundant = [name for name in features
                     if name != anchor and scores[name] > threshold]
        for name in redundant:
            features.remove(name)
        # Re-locate the anchor in case anything ahead of it was removed.
        i = features.index(anchor) + 1
    return features

In [None]:
def getChiValues(df, features1, features2):
    '''
    Build a square table of chi-square statistics between the columns in
    `features1`.

    For each feature except the last, `chi2` is run with that feature as the
    target against itself and every later feature; each statistic is written
    to both (row, column) and (column, row), so the table is symmetric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in `features1`.
    features1 : list
        Column names; they become both the rows and the columns of the result.
    features2 : list
        Accepted for signature compatibility; never read by this function.

    Returns
    -------
    pandas.DataFrame
        Table of chi-square statistics indexed by `features1` on both axes.
        An empty `features1` yields an empty frame.
    '''
    corrData = pd.DataFrame(dtype=np.float32, columns=features1)

    # The general path below needs at least two features (its final fix-up
    # reads the second-to-last diagonal cell), so handle 0 and 1 separately.
    if len(features1) == 0:
        return corrData
    if len(features1) == 1:
        only_feature = features1[0]
        corrData.loc[only_feature, only_feature] = chi2(df[[only_feature]], df[only_feature])[0][0]
        return corrData

    for index, selected_feature in enumerate(features1[:-1]):
        # chi2 returns (statistics, p-values); only the statistics are kept.
        data = chi2(df[features1[index:]], df[selected_feature])[0]

        for index2, feature in enumerate(features1[index:]):
            corrData.loc[selected_feature, feature] = data[index2]
            corrData.loc[feature, selected_feature] = data[index2]

    # The loop never computes the last feature's statistic against itself;
    # that cell is filled with the previous feature's diagonal value.
    # NOTE(review): this is a stand-in, not the last feature's own statistic.
    corrData.iloc[-1, -1] = corrData.iloc[-2, -2]
    return corrData

def CategoricalVsCategorical(df, features, threshold):
    '''
    Greedy chi-square filter for categorical features.

    The chi-square table from `getChiValues` is min-max scaled and then
    walked in list order: for every feature still present, each other
    remaining feature whose scaled statistic in that feature's column is
    strictly greater than `threshold` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in `features`.
    features : list
        Candidate column names. The list is modified in place.
    threshold : float
        Cut-off applied to the scaled statistics.

    Returns
    -------
    list
        The same `features` list object, reduced to the surviving names.
    '''
    # With fewer than two features there is nothing to compare, so skip the
    # chi-square work entirely.
    if len(features) < 2:
        return features

    chi_values = getChiValues(df, features, features)

    # MinMaxScaler rescales every column independently, so the scaled table
    # is in general no longer symmetric.
    scaled = MinMaxScaler().fit_transform(chi_values)
    corrData = pd.DataFrame(scaled, index=features, columns=features)

    i = 0
    while i < len(features):
        # Pin the reference feature by name. Because the scaled table is not
        # symmetric, a feature earlier in the list can be removed here, which
        # shifts the list; reading features[i] again would then point at a
        # different feature.
        anchor = features[i]
        scores = corrData[anchor]
        redundant = [name for name in features
                     if name != anchor and scores[name] > threshold]
        for name in redundant:
            features.remove(name)
        # Continue with the feature that follows the anchor's current position.
        i = features.index(anchor) + 1

    return features

In [None]:
# Correlation-based filtering: Spearman and Pearson for the continuous
# features, chi-square for the categorical ones
subset_of_features = []
for iteration in range(CORELATION_FILTER_NUM_ITERATIONS):
    spearman = ContinuousVsContinuous(train, continuous.copy(), method="spearman", threshold=SPEARMAN_THRESHOLD)
    pearson = ContinuousVsContinuous(train, continuous.copy(), method="pearson", threshold=PEARSON_THRESHOLD)
    # Union of both survivor lists. dict.fromkeys de-duplicates while keeping
    # first-seen order, so the resulting column order is the same on every
    # kernel restart (iteration order of a set of strings is not).
    subset_of_features = list(dict.fromkeys(subset_of_features + spearman + pearson))

chi = CategoricalVsCategorical(train, categorical.copy(), threshold=CHI_THRESHOLD)
subset_of_features += ["Label"]
subset_of_features += chi

In [None]:
# Number of columns kept by the filter (the count includes "Label")
print(len(subset_of_features))

45


In [None]:
def relevance_scoring(features, labels, random_state, test_size):
    '''
    Score each feature by random-forest importance, rescaled to [0, 1].

    A 100-tree RandomForestClassifier is fitted on the training portion of a
    train/test split and its `feature_importances_` are min-max normalised.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns (no label column).
    labels : array-like
        Class labels aligned with `features`.
    random_state : int
        Seed used for both the split and the forest, so repeated calls return
        the same scores.
    test_size : float or int
        Forwarded to `train_test_split`; that share of rows is held out and
        not used for fitting.

    Returns
    -------
    dict
        Maps each column name of `features` to its normalised importance.
    '''
    # Only the training side of the split is needed to fit the forest.
    X_train, _, y_train, _ = train_test_split(features, labels, test_size=test_size, random_state=random_state)
    myModel = RandomForestClassifier(n_estimators=100, random_state=random_state)
    myModel.fit(X_train, y_train)

    importances = myModel.feature_importances_
    # Compute the range once instead of once per feature.
    lowest = importances.min()
    highest = importances.max()
    normalised = (importances - lowest) / (highest - lowest)
    return dict(zip(features.columns, normalised))

def mutual_info_scoring(features, labels):
    '''
    Score every column of `features` with `mutual_info_score` against
    `labels`, then min-max normalise the scores across columns.

    Returns a dict mapping column name to normalised score; an empty
    `features` frame yields an empty dict.
    '''
    raw_scores = {name: mutual_info_score(features[name], labels) for name in features.columns}
    if not raw_scores:
        return {}

    lowest = min(raw_scores.values())
    spread = max(raw_scores.values()) - lowest
    return {name: (value - lowest) / spread for name, value in raw_scores.items()}

def relevance_report(features, labels, random_state, test_size):
    '''
    Combine both relevance criteria into one table.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns (no label column).
    labels : array-like
        Class labels aligned with `features`.
    random_state, test_size
        Forwarded to `relevance_scoring`.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns "Relevance Score"
        (from `relevance_scoring`) and "MI Score" (from `mutual_info_scoring`).
    '''
    # Score the data that was passed in rather than a notebook-level frame.
    relevance = pd.Series(relevance_scoring(features, labels, random_state, test_size), name="Relevance Score")
    mutual_info = pd.Series(mutual_info_scoring(features, labels), name="MI Score")
    return pd.concat([relevance, mutual_info], axis=1)

In [None]:
# Keep a copy of the unfiltered frame, then narrow train to the filtered columns
backup = train.copy()
train = train.reindex(columns=subset_of_features)
# Build the per-feature relevance table (forest importance and MI score)
report = relevance_report(
    train.drop(columns=["Label"]),
    train["Label"],
    RANDOM_STATE,
    TEST_SPLIT_PROPORTION,
)

In [None]:
def evaluateModels(train, test, model_of_choice):
    '''
    Fit `model_of_choice` on `train` and evaluate it on `test`.

    Both frames must contain a "Label" column; every other column is used as
    a feature. The estimator passed in is fitted in place.

    Returns
    -------
    tuple
        (confusion matrix, classification-report entry for class "0",
        classification-report "macro avg" entry).
        NOTE(review): class "0" is presumably the benign class - confirm
        against the label encoding.
    '''
    myModel = model_of_choice
    myModel.fit(train.drop(columns=["Label"]), train["Label"])
    predictions = myModel.predict(test.drop(columns=["Label"]))

    # One report supplies both the per-class and the macro-average metrics.
    metrics = classification_report(test["Label"], predictions, output_dict=True)
    scores_macro_avg = metrics["macro avg"]
    scores_benign = metrics["0"]
    scores = confusion_matrix(test["Label"], predictions)

    return scores, scores_benign, scores_macro_avg

In [None]:
def generated_scores(train, test, ranked_features, model_of_choice, model_name):
    '''
    Evaluate one model on the top 100%, 75%, 50% and 25% of the ranked features.

    `ranked_features` is a Series whose index holds feature names, best first.
    For each subset the confusion matrix is printed; afterwards a table of the
    macro-average metrics (all rows but the last) is printed with one column
    per subset. Nothing is returned.
    '''
    total = len(ranked_features)
    # (number of top-ranked features kept, printed heading, summary column label)
    plans = [
        (total, "Only Correlation Based Filter", "100%"),
        (3 * total // 4, "Best 75% Features", "75%"),
        (total // 2, "Best 50% Features", "50%"),
        (total // 4, "Best 25% Features", "25%"),
    ]

    final_scores = []
    for keep, heading, label in plans:
        selected_columns = list(ranked_features[:keep].index) + ["Label"]
        matrix, _, macro_avg = evaluateModels(
            pd.DataFrame(train, columns=selected_columns),
            pd.DataFrame(test, columns=selected_columns),
            model_of_choice,
        )
        print(f"{model_name}: {heading}\n")
        print(matrix)
        print("\n")
        final_scores.append(pd.DataFrame(macro_avg.values(), index=macro_avg.keys(), columns=[label]))

    final_scores = pd.concat(final_scores, axis=1)
    print(f"{model_name}: Scores\n")
    print(final_scores.iloc[:-1, :])

In [None]:
# Rank the features by their MI score, highest first
ranked_features = report["MI Score"].sort_values(ascending=False)

In [None]:
# Gaussian Naive Bayes on the top 100/75/50/25% of the currently ranked features
generated_scores(train, test, ranked_features, GaussianNB(), "Naive Bayes")

Naive Bayes: Only Correlation Based Filter

[[ 8780     0     4     0     4]
 [    9 18944    98     0     1]
 [   55  3208 54368     0     0]
 [   67     0 36346     1     1]
 [  224     0     8     0 37831]]


Naive Bayes: Best 75% Features

[[ 8747     0     3     0    38]
 [   17 18929   105     0     1]
 [   58  2708 54865     0     0]
 [   76     0 36336     2     1]
 [   73     0     8     0 37982]]


Naive Bayes: Best 50% Features

[[ 8690     0    52     1    45]
 [    5 18923   123     0     1]
 [   46  2495 55090     0     0]
 [   50     0 36364     0     1]
 [   63     0    18     0 37982]]


Naive Bayes: Best 25% Features

[[ 5900     0  2550     1   337]
 [   32 18877   142     0     1]
 [   50  2044 55537     0     0]
 [   51     0 36364     0     0]
 [   46     0    18     0 37999]]


Naive Bayes: Scores

               100%       75%       50%       25%
precision  0.882954  0.889928  0.692972  0.690208
recall     0.786147  0.787762  0.787172  0.724834
f1-score   0.7257

In [None]:
# Random Forest on the top 100/75/50/25% of the currently ranked features;
# the forest is seeded so a rerun reproduces the same metrics
generated_scores(train, test, ranked_features, RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE), "Random Forest")

Random Forest: Only Correlation Based Filter

[[ 8788     0     0     0     0]
 [    0 18955    96     1     0]
 [    2   822 56802     4     1]
 [    3     2     9 36399     2]
 [    2     0     0     3 38058]]


Random Forest: Best 75% Features

[[ 8788     0     0     0     0]
 [    0 18953    99     0     0]
 [    2   826 56799     3     1]
 [    3     1     9 36399     3]
 [    2     0     0     3 38058]]


Random Forest: Best 50% Features

[[ 8786     0     0     2     0]
 [    0 18951   101     0     0]
 [    3   821 56803     3     1]
 [    3     1    18 36391     2]
 [    4     0     2     2 38055]]


Random Forest: Best 25% Features

[[ 8787     0     0     1     0]
 [    1 18955    96     0     0]
 [    3   822 56802     3     1]
 [    3     0    18 36392     2]
 [    5     0     2     3 38053]]


Random Forest: Scores

               100%       75%       50%       25%
precision  0.991080  0.991045  0.990979  0.990953
recall     0.995991  0.995959  0.995847  0.995903
f1-scor

In [None]:
# Linear SVC on the top 100/75/50/25% of the currently ranked features
generated_scores(train, test, ranked_features, LinearSVC(dual=False), "Linear SVC")

Linear SVC: Only Correlation Based Filter

[[ 8776     0     0     7     5]
 [    0 18819   230     2     1]
 [    2  1655 55966     8     0]
 [   13     8    64 36329     1]
 [   35     8     9     0 38011]]


Linear SVC: Best 75% Features

[[ 8736     0     0     7    45]
 [    0 18819   230     2     1]
 [    2  1653 55968     8     0]
 [   13     8    67 36326     1]
 [   34     6     8     0 38015]]


Linear SVC: Best 50% Features

[[ 8670     0     4    54    60]
 [    0 18821   228     2     1]
 [    2  1660 55961     8     0]
 [    7     0    87 36320     1]
 [   31     0    18     0 38014]]


Linear SVC: Best 25% Features

[[ 5729     5    22  2573   459]
 [    3 18618   428     2     1]
 [    4  1622 55998     7     0]
 [   18     0   200 36196     1]
 [   31     0    18     0 38014]]


Linear SVC: Scores

               100%       75%       50%       25%
precision  0.981349  0.981185  0.980997  0.963920
recall     0.990757  0.989858  0.988315  0.918699
f1-score   0.985772  0

In [None]:
# Rank the features by their random-forest relevance score, highest first
ranked_features = report["Relevance Score"].sort_values(ascending=False)

In [None]:
# Gaussian Naive Bayes on the top 100/75/50/25% of the currently ranked features
generated_scores(train, test, ranked_features, GaussianNB(), "Naive Bayes")

Naive Bayes: Only Correlation Based Filter

[[ 8780     0     4     0     4]
 [    9 18944    98     0     1]
 [   55  3208 54368     0     0]
 [   67     0 36346     1     1]
 [  224     0     8     0 37831]]


Naive Bayes: Best 75% Features

[[ 8747     0     3     0    38]
 [   17 18929   105     0     1]
 [   56  2708 54867     0     0]
 [   76     0 36336     2     1]
 [   70     0     8     0 37985]]


Naive Bayes: Best 50% Features

[[ 8744     0     6     0    38]
 [   16 18911   124     0     1]
 [   58  2517 55056     0     0]
 [   67     0 36346     1     1]
 [   72     0     8     0 37983]]


Naive Bayes: Best 25% Features

[[ 8744     0     6     0    38]
 [    7 18877   167     0     1]
 [   11  1882 55738     0     0]
 [   23     0 34222  2170     0]
 [   89     0     8     0 37966]]


Naive Bayes: Scores

               100%       75%       50%       25%
precision  0.882954  0.890039  0.891829  0.902401
recall     0.786147  0.787785  0.788167  0.802001
f1-score   0.7257

In [None]:
# Random Forest on the top 100/75/50/25% of the currently ranked features;
# the forest is seeded so a rerun reproduces the same metrics
generated_scores(train, test, ranked_features, RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE), "Random Forest")

Random Forest: Only Correlation Based Filter

[[ 8788     0     0     0     0]
 [    0 18974    78     0     0]
 [    3   846 56778     3     1]
 [    3     2     9 36399     2]
 [    3     0     0     3 38057]]


Random Forest: Best 75% Features

[[ 8788     0     0     0     0]
 [    0 18977    75     0     0]
 [    3   847 56778     2     1]
 [    3     2     9 36399     2]
 [    2     0     0     4 38057]]


Random Forest: Best 50% Features

[[ 8788     0     0     0     0]
 [    0 18974    78     0     0]
 [    2   917 56708     3     1]
 [    3     1     9 36400     2]
 [    3     0     0     7 38053]]


Random Forest: Best 25% Features

[[ 8788     0     0     0     0]
 [    0 18978    74     0     0]
 [    1   909 56715     4     2]
 [    4     1     8 36399     3]
 [    1     0     0     2 38060]]


Random Forest: Scores

               100%       75%       50%       25%
precision  0.990885  0.990909  0.990211  0.990364
recall     0.996102  0.996133  0.995843  0.995941
f1-scor

In [None]:
# Linear SVC on the top 100/75/50/25% of the currently ranked features
generated_scores(train, test, ranked_features, LinearSVC(dual=False), "Linear SVC")

Linear SVC: Only Correlation Based Filter

[[ 8776     0     0     7     5]
 [    0 18819   230     2     1]
 [    2  1655 55966     8     0]
 [   12     9    66 36327     1]
 [   36     8     9     0 38010]]


Linear SVC: Best 75% Features

[[ 8737     0     0     6    45]
 [    0 18819   230     2     1]
 [    2  1653 55968     8     0]
 [   13     8    66 36327     1]
 [   34     6     8     0 38015]]


Linear SVC: Best 50% Features

[[ 8744     0     0     6    38]
 [    0 18876   173     2     1]
 [    4  1711 55910     6     0]
 [   13     8    61 36332     1]
 [   32     6     9     0 38016]]


Linear SVC: Best 25% Features

[[ 8743     1     0     6    38]
 [    4 18891   154     2     1]
 [   11  1722 55892     6     0]
 [   22     0    69 36323     1]
 [   38     0     8     2 38015]]


Linear SVC: Scores

               100%       75%       50%       25%
precision  0.981333  0.981195  0.980986  0.980461
recall     0.990741  0.989887  0.990476  0.990493
f1-score   0.985755  0