In [64]:
# Colab-only setup: mount Google Drive so the CSV files read in later cells are
# reachable under /content/drive. This cell requires the google.colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [66]:
# Defining some Globals
# Seed passed to train_test_split so the stratified split is repeatable.
RANDOM_STATE = 42
# A column with exactly this many distinct values is treated as categorical
# when the features are partitioned into categorical / continuous lists.
CATEGORICAL_FEATURE_COUNT=2
# Number of times the Spearman/Pearson correlation filters are run.
# NOTE(review): the identifier is spelled "CORELATION"; it is kept because other cells reference it.
CORELATION_FILTER_NUM_ITERATIONS=1

# Absolute-correlation / scaled chi-square cut-offs: a feature whose association
# with an already-kept feature exceeds the threshold is dropped.
SPEARMAN_THRESHOLD=0.9
PEARSON_THRESHOLD=0.9
CHI_THRESHOLD=0.9
# Passed as `test_size` to train_test_split, i.e. 60% of the rows become the test set.
TEST_SPLIT_PROPORTION=0.6

# NOTE(review): not referenced by any cell visible in this notebook - presumably
# intended for cross-validation; confirm before removing.
NUM_FOLDS=10

In [67]:
# Getting the Dataset
import pandas as pd
# `squeeze=True` was deprecated in pandas 1.4 and removed in pandas 2.0. Squeezing the
# parsed frame afterwards is the documented replacement and yields the same object:
# a Series when the file has a single data column, otherwise the unchanged DataFrame.
feat_info = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/Feat_info.csv", index_col="Feature Name").squeeze("columns")
df = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/DDoS_2018.csv")
# Re-ordering the columns to match the order of the "Feature Name" index of Feat_info.csv.
# Only the index is used, so this works whether feat_info is a Series or a DataFrame.
df = pd.DataFrame(df, columns=feat_info.index)

In [68]:
# Categorical columns need to be handled as well.
# They include: Protocol, Dst Port and Label.

# Translate the protocol numbers handled by the original notebook into names before
# one-hot encoding. Building a new Series (instead of writing strings into the integer
# column through .loc) avoids relying on implicit dtype upcasting; any other protocol
# number is passed through unchanged, exactly as before.
protocol_names = {17: "UDP", 6: "TCP", 0: "HOPOPT"}
protocol = pd.get_dummies(
    df["Protocol"].map(lambda number: protocol_names.get(number, number)),
    prefix="protocol",
)
df = pd.concat([df.drop(columns=["Protocol"]), protocol], axis=1)

# The original cell called .astype("category") on the three indicator columns and
# discarded the result, so it never changed the frame. Those no-op calls are removed;
# the indicator columns are picked up as categorical later through their
# distinct-value count.

# For Dst Port the traffic is benign for all non-80 values, so the column is dropped
# to keep the model from simply keying on the port.
df = df.drop(columns=["Dst Port"])

# Encode the class labels as integers. Unknown labels raise KeyError (as the original
# per-row lookup did) instead of silently turning into NaN.
mapping = {"Benign": 0, "DDOS attack-HOIC": 1, "DDOS attack-LOIC-UDP": 2}
unknown_labels = set(df["Label"].unique()) - set(mapping)
if unknown_labels:
    raise KeyError(f"Unmapped labels in 'Label' column: {sorted(map(str, unknown_labels))}")
df["Label"] = df["Label"].map(mapping)

In [69]:
# The Timestamp column is not used as a model input, so discard it.
df = df.drop(columns=["Timestamp"])

In [70]:
# Stratified split of the dataset.
# Feature reduction is carried out on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Label"]),
    df["Label"],
    test_size=TEST_SPLIT_PROPORTION,
    stratify=df["Label"],
    random_state=RANDOM_STATE,
)

# From here on:
#   df   -> training dataset (features + Label, index renumbered from 0)
#   test -> testing dataset  (features + Label, index renumbered from 0)
df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [71]:
# Baseline: Gaussian Naive Bayes trained on every column of the training frame,
# scored on the test frame. Only the macro-averaged metrics are kept.
myModel = GaussianNB()
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))
s_macro_avg_complete_NB = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_NB)

{'precision': 0.9999644120087142, 'recall': 0.9999502403705236, 'f1-score': 0.999957325271238, 'support': 309825}


In [72]:
# Baseline: random forest trained on every column of the training frame.
# The forest is seeded with RANDOM_STATE so a Restart & Run All reproduces the same
# scores; the original unseeded estimator produced slightly different numbers per run.
myModel = RandomForestClassifier(random_state=RANDOM_STATE)
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))
# Compute the report once and reuse it for both the stored value and the printout.
s_macro_avg_complete_RF = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_RF)

{'precision': 0.9999966756590091, 'recall': 0.9999938413026875, 'f1-score': 0.9999952584441124, 'support': 309825}


In [73]:
# Baseline: linear SVM (primal formulation, dual=False) trained on every column of
# the training frame. Only the macro-averaged metrics are kept.
myModel = LinearSVC(dual=False)
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))
s_macro_avg_complete_SVC = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_SVC)

{'precision': 0.9996392255420911, 'recall': 0.9996080446019743, 'f1-score': 0.9996236306268526, 'support': 309825}


In [74]:
# Drop every training column that holds a single distinct value: it carries no
# information for the models. Only the training frame `df` is modified here.
constant_columns = [name for name in df.columns if df[name].nunique() == 1]
df.drop(columns=constant_columns, inplace=True)

In [75]:
# Partition the feature columns by their number of distinct values:
# exactly CATEGORICAL_FEATURE_COUNT distinct values -> categorical, anything else -> continuous.
# `count` records the distinct-value count per feature; "Label" is skipped.
categorical, continuous = [], []
count = pd.DataFrame(dtype=np.int64, columns=["Number of Values"])
for column_name in df.columns:
    if column_name == "Label":
        continue
    distinct_values = df[column_name].nunique()
    count.loc[column_name] = distinct_values
    if distinct_values == CATEGORICAL_FEATURE_COUNT:
        categorical.append(column_name)
    else:
        continuous.append(column_name)

In [76]:
# Standardise the continuous features. The scaler is fitted on the training frame
# only and the same fitted transform is then applied to both train and test.
scaler = StandardScaler()
scaler.fit(df[continuous])
for frame in (df, test):
    frame[continuous] = scaler.transform(frame[continuous])

In [77]:
def ContinuousVsContinuous(df, features, method, threshold=0.9):
    '''
    Correlation-based filter for continuous features.

    Walks `features` in order and keeps a feature only if its absolute correlation
    with every feature kept so far does not exceed `threshold`. NaN correlations
    (e.g. from a constant column) never exceed the threshold, so such features are kept.

    Parameters
    ----------
    df : pandas.DataFrame holding at least the columns named in `features`.
    features : list of column names to filter; it is NOT modified.
    method : correlation method forwarded to `DataFrame.corr`
        (e.g. "pearson" or "spearman").
    threshold : absolute correlation above which the later feature is dropped.

    Returns
    -------
    list : the surviving feature names, in their original relative order.
    '''
    # Work on a copy so the caller's list is never mutated.
    candidates = list(features)
    if not candidates:
        return candidates

    # Correlate only the requested columns: this is cheaper than correlating the whole
    # frame and does not fail when `df` also carries non-numeric columns. The original
    # unseeded shuffles were dropped - the greedy result does not depend on them, and
    # they only consumed global random state.
    corrData = df[candidates].corr(method=method).abs()

    kept = []
    for name in candidates:
        # Drop `name` when it is too strongly correlated with any already-kept feature.
        if kept and (corrData.loc[name, kept] > threshold).any():
            continue
        kept.append(name)
    return kept

In [78]:
def getChiValues(df, features1, features2):
    '''
    Build a square matrix of pairwise chi-square statistics for `features1`.

    For each feature used as the chi-square target, the statistic of that feature
    and of every feature after it in `features1` is computed, and each value is
    written to both (target, feature) and (feature, target) so the matrix is
    symmetric.

    Parameters
    ----------
    df : pandas.DataFrame holding the columns in `features1`
        (sklearn's `chi2` requires non-negative feature values).
    features1 : list of column names; defines both rows and columns of the result.
    features2 : unused; kept only so existing callers keep working.

    Returns
    -------
    pandas.DataFrame (float64) indexed and labelled by `features1`.
    Empty when `features1` is empty.
    '''
    names = list(features1)
    # Pre-allocate the full square frame instead of growing it one cell at a time.
    corrData = pd.DataFrame(np.nan, index=names, columns=names, dtype=np.float64)

    # Iterate over EVERY feature, including the last one. The original loop stopped one
    # short and then copied the previous feature's self-statistic into the last diagonal
    # cell, which stored a wrong value and raised IndexError for fewer than two features.
    for index, selected_feature in enumerate(names):
        remaining = names[index:]
        data = chi2(df[remaining], df[selected_feature])[0]
        for feature, statistic in zip(remaining, data):
            corrData.loc[selected_feature, feature] = statistic
            corrData.loc[feature, selected_feature] = statistic

    return corrData

def CategoricalVsCategorical(df, features, threshold):
    '''
    Correlation-based filter for categorical features using chi-square statistics.

    The pairwise chi-square matrix from `getChiValues` is min-max scaled column by
    column. The features are then visited in order; for the feature being visited,
    every other remaining feature whose scaled value in that feature's column
    exceeds `threshold` is removed.

    Parameters
    ----------
    df : pandas.DataFrame holding the columns in `features`.
    features : list of column names to filter; it is NOT modified.
    threshold : scaled chi-square value above which the other feature is dropped.

    Returns
    -------
    list : the surviving feature names, in their original relative order.
    '''
    # Work on a copy so the caller's list is never mutated.
    remaining = list(features)
    # With fewer than two features there is nothing to compare.
    if len(remaining) < 2:
        return remaining

    chi_values = getChiValues(df, remaining, remaining)
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(chi_values),
        index=remaining,
        columns=remaining,
    )

    position = 0
    while position < len(remaining):
        # Pin the feature under inspection by NAME. Column-wise scaling makes the matrix
        # asymmetric, so a feature located before `position` can be removed here; the
        # original code re-read features[i] after such a removal, which let the current
        # feature delete itself and caused the following feature to be skipped.
        current = remaining[position]
        column = scaled[current]
        redundant = [
            other for other in remaining
            if other != current and column[other] > threshold
        ]
        for other in redundant:
            remaining.remove(other)
        # Continue right after the current feature, wherever it now sits in the list.
        position = remaining.index(current) + 1

    return remaining

In [79]:
# Correlation-based filtering with Spearman's coefficient, Pearson's coefficient and the
# chi-square test. A continuous feature is retained when it survives EITHER the Spearman
# or the Pearson filter (union of the two survivor lists).
subset_of_features = []
for _iteration in range(CORELATION_FILTER_NUM_ITERATIONS):
    spearman = ContinuousVsContinuous(df, continuous.copy(), method="spearman", threshold=SPEARMAN_THRESHOLD)
    pearson = ContinuousVsContinuous(df, continuous.copy(), method="pearson", threshold=PEARSON_THRESHOLD)
    # De-duplicate while preserving first-seen order. The original list(set(...)) produced
    # a hash-dependent order, so the column order of the filtered frame changed between
    # kernel sessions.
    subset_of_features = list(dict.fromkeys(subset_of_features + spearman + pearson))

# Categorical features are filtered separately; "Label" is always carried along.
chi = CategoricalVsCategorical(df, categorical.copy(), threshold=CHI_THRESHOLD)
subset_of_features += ["Label"]
subset_of_features += chi

In [80]:
# Correlation-Based Filtering Completed
# Training frame restricted to (and ordered by) the features that survived the filters.
# NOTE(review): filteredDataSet is not referenced by any later cell visible here.
filteredDataSet = pd.DataFrame(df, columns=subset_of_features)
# Pre-computed feature ranking loaded from disk; later cells select features by
# thresholding its "Information Gain" column.
# NOTE(review): presumably produced from the filtered features by a mutual-information
# step that is not part of this notebook - confirm the file matches the current filter output.
ranked_features=pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/mutual_info_score.csv", index_col="Feature")

In [81]:
# Class weights to handle imbalance: each class is weighted by the share of training
# samples that belong to the OTHER classes, so rarer classes receive larger weights.
label_counts = y_train.value_counts()
total_samples = sum(label_counts)
class_weights = {}
for label in label_counts.index:
    class_weights[label] = sum(label_counts.drop(label)) / total_samples

# Per-sample weight, looked up from the class of each training label.
sample_weights = y_train.apply(class_weights.__getitem__)

In [82]:
def evaluateModels(train, test, model_of_choice):
    '''
    Fit a classifier on `train` and score it on `test`.

    Parameters
    ----------
    train, test : pandas.DataFrame with the feature columns plus a "Label" column.
    model_of_choice : estimator exposing `fit(X, y)` and `predict(X)`;
        it is fitted in place.

    Returns
    -------
    tuple (scores, scores_benign, scores_macro_avg)
        scores           : confusion matrix of true vs. predicted test labels.
        scores_benign    : classification-report entry for class "0"
                           (raises KeyError if label 0 is absent from the report).
        scores_macro_avg : macro-averaged classification-report entry.
    '''
    myModel = model_of_choice
    myModel.fit(train.drop(columns=["Label"]), train["Label"])
    predictions = myModel.predict(test.drop(columns=["Label"]))

    # Build the report once and read both entries from it; the original computed the
    # identical report twice and also initialised `scores` with a list that was never used.
    report = classification_report(test["Label"], predictions, output_dict=True)
    scores_macro_avg = report["macro avg"]
    scores_benign = report["0"]
    scores = confusion_matrix(test["Label"], predictions)

    return scores, scores_benign, scores_macro_avg

In [102]:
# GaussianNB on every ranked feature (Information Gain >= 0), i.e. no
# information-gain cut beyond what the ranking file already contains.
gain_mask = ranked_features["Information Gain"] >= 0
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_all, s_benign_all, s_macro_avg_all = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    GaussianNB(),
)
print(scores_all)

[[107806    442      0]
 [    43 200496      0]
 [     0      0   1038]]


In [103]:
# GaussianNB on the features with Information Gain >= 0.25
# (called "best 75%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.25
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_75, s_benign_75, s_macro_avg_75 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    GaussianNB(),
)
print(scores_75)

[[108233     15      0]
 [   184 200355      0]
 [     0      0   1038]]


In [104]:
# GaussianNB on the features with Information Gain >= 0.5
# (called "best 50%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.5
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_50, s_benign_50, s_macro_avg_50 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    GaussianNB(),
)
print(scores_50)

[[108234     14      0]
 [   183 200356      0]
 [     0      0   1038]]


In [105]:
# GaussianNB on the features with Information Gain >= 0.75
# (called "best 25%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.75
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_25, s_benign_25, s_macro_avg_25 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    GaussianNB(),
)
print(scores_25)

[[107626    147    475]
 [   186 200353      0]
 [     0      0   1038]]


In [106]:
# Side-by-side macro-average metrics for GaussianNB: the all-columns baseline followed
# by each Information Gain threshold. The s_macro_avg_* names are reused by every model
# section, so this cell must run right after the GaussianNB evaluations above.
macro_avg_by_setting = {
    "Initial": s_macro_avg_complete_NB,
    ">=0": s_macro_avg_all,
    ">=0.25": s_macro_avg_75,
    ">=0.5": s_macro_avg_50,
    ">=0.75": s_macro_avg_25,
}
final_scores_NB = pd.concat(
    [pd.Series(report, name=setting) for setting, report in macro_avg_by_setting.items()],
    axis=1,
)
# The last row ("support") is omitted from the printout.
print(final_scores_NB.iloc[:-1, :])

            Initial       >=0    >=0.25     >=0.5    >=0.75
precision  0.999964  0.999134  0.999409  0.999414  0.894532
recall     0.999950  0.998567  0.999648  0.999653  0.997775
f1-score   0.999957  0.998849  0.999528  0.999533  0.936409


In [107]:
# Random forest on every ranked feature (Information Gain >= 0).
# The estimator is seeded with RANDOM_STATE so the scores are reproducible across runs;
# the original unseeded forest gave slightly different results each time.
selected_columns = list(ranked_features[ranked_features["Information Gain"] >= 0].index) + ["Label"]
scores_all, s_benign_all, s_macro_avg_all = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    RandomForestClassifier(random_state=RANDOM_STATE),
)
print(scores_all)

[[108246      2      0]
 [     0 200539      0]
 [     0      0   1038]]


In [108]:
# Random forest on the features with Information Gain >= 0.25 (labelled "best 75%").
# The estimator is seeded with RANDOM_STATE so the scores are reproducible across runs;
# the original unseeded forest gave slightly different results each time.
selected_columns = list(ranked_features[ranked_features["Information Gain"] >= 0.25].index) + ["Label"]
scores_75, s_benign_75, s_macro_avg_75 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    RandomForestClassifier(random_state=RANDOM_STATE),
)
print(scores_75)

[[108246      2      0]
 [     0 200539      0]
 [     0      0   1038]]


In [109]:
# Random forest on the features with Information Gain >= 0.5 (labelled "best 50%").
# The estimator is seeded with RANDOM_STATE so the scores are reproducible across runs;
# the original unseeded forest gave slightly different results each time.
selected_columns = list(ranked_features[ranked_features["Information Gain"] >= 0.5].index) + ["Label"]
scores_50, s_benign_50, s_macro_avg_50 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    RandomForestClassifier(random_state=RANDOM_STATE),
)
print(scores_50)

[[108246      2      0]
 [     0 200539      0]
 [     0      0   1038]]


In [110]:
# Random forest on the features with Information Gain >= 0.75 (labelled "best 25%").
# The estimator is seeded with RANDOM_STATE so the scores are reproducible across runs;
# the original unseeded forest gave slightly different results each time.
selected_columns = list(ranked_features[ranked_features["Information Gain"] >= 0.75].index) + ["Label"]
scores_25, s_benign_25, s_macro_avg_25 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    RandomForestClassifier(random_state=RANDOM_STATE),
)
print(scores_25)

[[108244      4      0]
 [     0 200539      0]
 [     0      0   1038]]


In [111]:
# Side-by-side macro-average metrics for the random forest: the all-columns baseline
# followed by each Information Gain threshold. The s_macro_avg_* names are reused by
# every model section, so this cell must run right after the random-forest evaluations above.
macro_avg_by_setting = {
    "Initial": s_macro_avg_complete_RF,
    ">=0": s_macro_avg_all,
    ">=0.25": s_macro_avg_75,
    ">=0.5": s_macro_avg_50,
    ">=0.75": s_macro_avg_25,
}
final_scores_RF = pd.concat(
    [pd.Series(report, name=setting) for setting, report in macro_avg_by_setting.items()],
    axis=1,
)
# The last row ("support") is omitted from the printout.
print(final_scores_RF.iloc[:-1, :])

            Initial       >=0    >=0.25     >=0.5    >=0.75
precision  0.999997  0.999997  0.999997  0.999997  0.999993
recall     0.999994  0.999994  0.999994  0.999994  0.999988
f1-score   0.999995  0.999995  0.999995  0.999995  0.999991


In [112]:
# LinearSVC (dual=False) on every ranked feature (Information Gain >= 0), i.e. no
# information-gain cut beyond what the ranking file already contains.
gain_mask = ranked_features["Information Gain"] >= 0
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_all, s_benign_all, s_macro_avg_all = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    LinearSVC(dual=False),
)
print(scores_all)

[[108223     21      4]
 [     0 200539      0]
 [     0      0   1038]]


In [113]:
# LinearSVC (dual=False) on the features with Information Gain >= 0.25
# (called "best 75%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.25
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_75, s_benign_75, s_macro_avg_75 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    LinearSVC(dual=False),
)
print(scores_75)



[[108215     33      0]
 [     0 200539      0]
 [     0      4   1034]]


In [114]:
# LinearSVC (dual=False) on the features with Information Gain >= 0.5
# (called "best 50%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.5
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_50, s_benign_50, s_macro_avg_50 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    LinearSVC(dual=False),
)
print(scores_50)



[[108214     34      0]
 [     0 200539      0]
 [     0      4   1034]]


In [115]:
# LinearSVC (dual=False) on the features with Information Gain >= 0.75
# (called "best 25%" in this notebook - presumably the gain is normalised to [0, 1]; confirm).
gain_mask = ranked_features["Information Gain"] >= 0.75
selected_columns = ranked_features.loc[gain_mask].index.tolist() + ["Label"]
scores_25, s_benign_25, s_macro_avg_25 = evaluateModels(
    pd.DataFrame(df, columns=selected_columns),
    pd.DataFrame(test, columns=selected_columns),
    LinearSVC(dual=False),
)
print(scores_25)



[[107710    537      1]
 [    15 200524      0]
 [     0      5   1033]]


In [116]:
# Side-by-side macro-average metrics for LinearSVC: the all-columns baseline followed
# by each Information Gain threshold. The s_macro_avg_* names are reused by every model
# section, so this cell must run right after the LinearSVC evaluations above.
macro_avg_by_setting = {
    "Initial": s_macro_avg_complete_SVC,
    ">=0": s_macro_avg_all,
    ">=0.25": s_macro_avg_75,
    ">=0.5": s_macro_avg_50,
    ">=0.75": s_macro_avg_25,
}
final_scores_SVC = pd.concat(
    [pd.Series(report, name=setting) for setting, report in macro_avg_by_setting.items()],
    axis=1,
)
# The last row ("support") is omitted from the printout.
print(final_scores_SVC.iloc[:-1, :])

            Initial       >=0    >=0.25     >=0.5    >=0.75
precision  0.999639  0.998686  0.999939  0.999937  0.998733
recall     0.999608  0.999923  0.998614  0.998611  0.996713
f1-score   0.999624  0.999303  0.999275  0.999273  0.997719
