In [15]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [16]:
# Seed handed to train_test_split (random_state) so the splits are repeatable.
RANDOM_STATE = 42
# Fraction of rows held out for testing (train_test_split's test_size).
TEST_SPLIT_PROPORTION = 0.2

In [21]:
def _encode_labels(labels, mapping):
    """Translate a Series of label names into their integer codes.

    Parameters
    ----------
    labels : pd.Series
        Column holding the raw label values.
    mapping : dict
        Raw label value -> integer code.

    Returns
    -------
    pd.Series
        ``labels`` with every value replaced by its code.

    Raises
    ------
    KeyError
        If ``labels`` contains a value (NaN included) that ``mapping`` does not
        cover. ``Series.map`` alone would silently turn those into NaN.
    """
    unknown = set(labels.unique()) - set(mapping)
    if unknown:
        raise KeyError(f"Labels missing from mapping: {sorted(map(str, unknown))}")
    return labels.map(mapping)


# Getting the Dataset
# feat_info = pd.read_csv("Feat_info.csv", index_col="Feature Name", squeeze=True)
df = pd.read_csv('./Original/DDoS 2019 - 80F- Final Dataset.csv')
# Re-ordering the Columns to that in Feat_info.csv
# df = pd.DataFrame(df, columns=feat_info.index)

# Categorical columns need to be handled as well.
# They include: Protocol, Dst Port and Label.

# Protocol: translate the known protocol numbers to names, then one-hot encode.
# The translation is done on a derived Series (codes without a name pass
# through unchanged) instead of writing strings into the numeric column.
protocol_names = {17: "UDP", 6: "TCP", 0: "HOPOPT"}
protocol_labels = df["Protocol"].map(lambda code: protocol_names.get(code, code))
# dtype is pinned so the indicator columns are stored as 0/1 on every pandas
# version (pandas >= 2.0 would otherwise emit booleans).
protocol = pd.get_dummies(protocol_labels, prefix="protocol", dtype="uint8")

# For Dst Port it is benign for all non-80 values (author's observation about
# this dataset), so both port columns are dropped to keep the model from being
# biased on the basis of the port. Protocol is replaced by its indicator columns.
df = pd.concat(
    [df.drop(columns=["Protocol", "Source Port", "Destination Port"]), protocol],
    axis=1,
)

# Removing Timestamp
# df.drop(columns=["Timestamp"], inplace=True)

# Multiclass classification
# Portmap & NetBIOS are assumed to be of the same class due to the similarity in their features
mapping = {"BENIGN": 0, "LDAP": 1, "MSSQL": 2, "NetBIOS": 3, "Portmap": 3, "Syn": 4}
df["Label"] = _encode_labels(df["Label"], mapping)

# Binary classification
mapping = {"BENIGN": 0, "Attack": 1}
df["Label2"] = _encode_labels(df["Label2"], mapping)

# Stratification of DataSet
# Feature reduction will be done on the training dataset only, so the data is
# split here first.


def _write_split(features, target, path):
    """Join features with their target and write the result to ``path``.

    Parameters
    ----------
    features : pd.DataFrame
        Feature columns of one split.
    target : pd.Series
        Target values for the same rows; stored under the column name "Label"
        whatever the Series was originally called.
    path : str
        Destination CSV file.

    Returns
    -------
    pd.DataFrame
        The frame that was written (features followed by "Label", index reset).
    """
    combined = pd.concat([features, target.rename("Label")], axis=1)
    combined = combined.reset_index(drop=True)
    # The row index is deliberately written as the first CSV column: the cells
    # that reload these files discard that first column.
    combined.to_csv(path)
    return combined


# to_csv does not create missing directories, so make sure the target exists.
Path('./Improved Datasets').mkdir(parents=True, exist_ok=True)

# Both tasks use the same feature columns; only the target column differs.
feature_frame = df.drop(columns=["Label", "Label2"])

# Multiclass task: split stratified on "Label".
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    df["Label"],
    test_size=TEST_SPLIT_PROPORTION,
    stratify=df["Label"],
    random_state=RANDOM_STATE,
)
# df1 -> Training Dataset, test -> Testing Dataset
df1 = _write_split(X_train, y_train, './Improved Datasets/multiclass_training_data.csv')
test = _write_split(X_test, y_test, './Improved Datasets/multiclass_testing_data.csv')

# Binary task: split stratified on "Label2"; the target is saved as "Label".
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    df["Label2"],
    test_size=TEST_SPLIT_PROPORTION,
    stratify=df["Label2"],
    random_state=RANDOM_STATE,
)
# df1 -> Training Dataset, test -> Testing Dataset
df1 = _write_split(X_train, y_train, './Improved Datasets/binary_training_data.csv')
test = _write_split(X_test, y_test, './Improved Datasets/binary_testing_data.csv')

In [22]:
# Getting the Preprocessed Dataset: binary-Class
# Column 0 of each file is sliced off; it holds the row index that the
# preprocessing cell's to_csv call writes.
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        "./Improved Datasets/binary_training_data.csv",
        "./Improved Datasets/binary_testing_data.csv",
    )
)

In [23]:
# Preview the first rows of the training frame to sanity-check the columns.
train.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,protocol_HOPOPT,protocol_TCP,protocol_UDP,Label
0,117,2,2,12,12,6,6,6.0,0.0,6,...,0.0,0.0,0,0,0,1,0,1,0,1
1,2,2,0,1106,0,553,553,553.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,1
2,2,2,0,916,0,458,458,458.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,1
3,1,2,0,1606,0,803,803,803.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,1
4,46,2,0,458,0,229,229,229.0,0.0,0,...,0.0,0.0,0,0,0,1,0,0,1,1


In [24]:
# Gaussian Naive Bayes on the currently loaded train/test frames, using every
# column except "Label" as a feature.
myModel = GaussianNB()
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# The report is built once; the same macro-average dict is stored and printed.
s_macro_avg_complete_NB = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_NB)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.6134803525763088, 'recall': 0.5743085191833537, 'f1-score': 0.5884799269815512, 'support': 159949}


[[  1544   7244]
 [  4093 147068]]


In [25]:
# Random forest on the currently loaded train/test frames, using every column
# except "Label" as a feature.
# Seeded with RANDOM_STATE so the forest, and therefore the scores below, are
# the same on every run.
myModel = RandomForestClassifier(random_state=RANDOM_STATE)
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# Kept under its own name so the Naive Bayes result in
# s_macro_avg_complete_NB is not overwritten. The report is built once and the
# same macro-average dict is stored and printed.
s_macro_avg_complete_RF = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_RF)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.9996588583124858, 'recall': 0.9999801536110504, 'f1-score': 0.9998194475561418, 'support': 159949}


[[  8788      0]
 [     6 151155]]


In [26]:
# Linear SVM on the currently loaded train/test frames, using every column
# except "Label" as a feature. dual=False makes LinearSVC solve the primal
# problem.
myModel = LinearSVC(dual=False)
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# Kept under its own name so the Naive Bayes result in
# s_macro_avg_complete_NB is not overwritten. The report is built once and the
# same macro-average dict is stored and printed.
s_macro_avg_complete_SVC = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_SVC)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.9840059446321556, 'recall': 0.8008132206170455, 'f1-score': 0.8686320021736705, 'support': 159949}


[[  5290   3498]
 [    50 151111]]


In [27]:
# Getting the Preprocessed Dataset: Multi-Class
# Column 0 of each file is sliced off; it holds the row index that the
# preprocessing cell's to_csv call writes.
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        "./Improved Datasets/multiclass_training_data.csv",
        "./Improved Datasets/multiclass_testing_data.csv",
    )
)

In [28]:
# Gaussian Naive Bayes on the currently loaded train/test frames, using every
# column except "Label" as a feature.
myModel = GaussianNB()
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# The report is built once; the same macro-average dict is stored and printed.
s_macro_avg_complete_NB = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_NB)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.43155935689020647, 'recall': 0.2508692926869329, 'f1-score': 0.1323564737703199, 'support': 159949}


[[ 1272  6264   586   125   541]
 [    0 18999    51     2     0]
 [    0 56300  1279    21    31]
 [   43 36133   237     0     2]
 [  826 33609    47   148  3433]]


In [29]:
# Random forest on the currently loaded train/test frames, using every column
# except "Label" as a feature.
# Seeded with RANDOM_STATE so the forest, and therefore the scores below, are
# the same on every run.
myModel = RandomForestClassifier(random_state=RANDOM_STATE)
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# Kept under its own name so the Naive Bayes result in
# s_macro_avg_complete_NB is not overwritten. The report is built once and the
# same macro-average dict is stored and printed.
s_macro_avg_complete_RF = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_RF)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.9910697443730502, 'recall': 0.9963688348613436, 'f1-score': 0.9936322613955326, 'support': 159949}


[[ 8788     0     0     0     0]
 [    2 18994    56     0     0]
 [    3   832 56793     1     2]
 [    3     2     8 36399     3]
 [    3     0     0     2 38058]]


In [31]:
# Linear SVM on the currently loaded train/test frames, using every column
# except "Label" as a feature. dual=False makes LinearSVC solve the primal
# problem.
myModel = LinearSVC(dual=False)
myModel.fit(train.drop(columns=["Label"]), train["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))

# Kept under its own name so the Naive Bayes result in
# s_macro_avg_complete_NB is not overwritten. The report is built once and the
# same macro-average dict is stored and printed.
s_macro_avg_complete_SVC = classification_report(test["Label"], pred, output_dict=True)["macro avg"]
print(s_macro_avg_complete_SVC)
print("\n")
print(confusion_matrix(test["Label"], pred))

{'precision': 0.7367324482947011, 'recall': 0.704988899661491, 'f1-score': 0.6779074262504391, 'support': 159949}


[[ 5340     0    46   390  3012]
 [    2 17958  1089     2     1]
 [   34  1154 56424     4    15]
 [    2     0 36211   153    49]
 [  305     0    19     1 37738]]
