In [209]:
# Make Google Drive visible inside the Colab runtime; the CSV paths used
# further down live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [210]:
# Imports
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [211]:
# Notebook-wide settings
RANDOM_STATE = 42              # seed handed to train_test_split
THRESHOLD = 0.0001             # tolerated |CV score change| when AGGRESSIVE is True
NUM_FOLDS = 3                  # number of folds used to cross-validate each feature subset
MIN_NUM_FEATURES = 40          # base lower bound on features kept by backward elimination
CATEGORICAL_FEATURE_COUNT = 2  # columns with exactly this many distinct values are not scaled
TEST_SPLIT_PROPORTION = 0.6    # fraction of rows held out as the test set
AGGRESSIVE = False             # selects the acceptance rule used by modified_backward_wrapper

In [212]:
# Getting the Dataset
import pandas as pd
# Feat_info.csv is indexed by "Feature Name"; only that index (the desired
# column order) is used below.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the frame is squeezed explicitly instead (a no-op unless the file has
# exactly one data column besides the index).
feat_info = pd.read_csv(
    "/content/drive/My Drive/B Tech Project/DataSet 2018/Feat_info.csv",
    index_col="Feature Name",
).squeeze("columns")
df = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/DDoS_2018.csv")
# Re-order the columns to the order given in Feat_info.csv
# (columns that Feat_info.csv does not list are dropped).
df = df.reindex(columns=feat_info.index)

In [213]:
# Categorical columns need to be handled as well.
# They include: Protocol, Dst Port and Label.

# Protocol: swap the numeric protocol codes for readable names, then one-hot
# encode. `replace` builds a new column, which avoids writing strings into an
# integer column in place (a deprecated incompatible-dtype assignment). Codes
# other than 17/6/0 are kept unchanged and get their own dummy column.
protocol_names = {17: "UDP", 6: "TCP", 0: "HOPOPT"}
protocol = pd.get_dummies(df["Protocol"].replace(protocol_names), prefix="protocol")
df = pd.concat([df.drop(columns=["Protocol"]), protocol], axis=1)

# NOTE(review): the dummy columns are kept exactly as get_dummies produced them.
# A bare `.astype("category")` whose result is not assigned has no effect, and
# an assigned one would presumably exclude these columns from the df.corr()
# ranking done later in the notebook -- confirm before converting them.

# Dst Port: every non-80 destination port is benign in this dataset, so the
# column is dropped to keep the model from being biased by the port alone.
df = df.drop(columns=["Dst Port"])

# Labels are: DDOS attack-HOIC (233961), Benign (126289), DDOS attack-LOIC-UDP (1211)
mapping = {"Benign": 0, "DDOS attack-HOIC": 1, "DDOS attack-LOIC-UDP": 2}
# Fail loudly on labels outside the mapping rather than letting `map` turn
# them into NaN silently.
unknown_labels = set(df["Label"].unique()) - set(mapping)
if unknown_labels:
    raise KeyError(f"Unmapped Label values: {sorted(unknown_labels, key=str)}")
df["Label"] = df["Label"].map(mapping)

In [214]:
# Discard the Timestamp column
df = df.drop(columns=["Timestamp"])

In [215]:
# Stratified train/test split.
# Feature reduction is carried out on the training portion only.
features = df.drop(columns=["Label"])
labels = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=TEST_SPLIT_PROPORTION,
    stratify=labels,
    random_state=RANDOM_STATE,
)

# From here on:
#   df   -> training dataset
#   test -> testing dataset
df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [216]:
# Baseline: Gaussian Naive Bayes fitted on every feature of the training
# frame and scored (macro average) on the test frame.
myModel = GaussianNB()
train_X, train_y = df.drop(columns=["Label"]), df["Label"]
myModel.fit(train_X, train_y)
pred = myModel.predict(test.drop(columns=["Label"]))
report = classification_report(test["Label"], pred, output_dict=True)
print(report["macro avg"])

{'precision': 0.9999644120087142, 'recall': 0.9999502403705236, 'f1-score': 0.999957325271238, 'support': 309825}


In [217]:
# Baseline: Random Forest fitted on every feature of the training frame and
# scored (macro average) on the test frame.
# The forest is seeded with RANDOM_STATE so that re-running the notebook
# reproduces the same report; an unseeded forest gives a different one each run.
myModel = RandomForestClassifier(random_state=RANDOM_STATE)
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred = myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])

{'precision': 0.9999966756590091, 'recall': 0.9999938413026875, 'f1-score': 0.9999952584441124, 'support': 309825}


In [218]:
# Baseline: linear SVM (primal formulation, dual=False) fitted on every
# feature of the training frame and scored (macro average) on the test frame.
myModel = LinearSVC(dual=False)
train_X, train_y = df.drop(columns=["Label"]), df["Label"]
myModel.fit(train_X, train_y)
pred = myModel.predict(test.drop(columns=["Label"]))
report = classification_report(test["Label"], pred, output_dict=True)
print(report["macro avg"])

{'precision': 0.9996392255420911, 'recall': 0.9996080446019743, 'f1-score': 0.9996236306268526, 'support': 309825}


In [219]:
# Drop every training column that holds a single distinct value
constant_columns = [name for name in df.columns if df[name].nunique() == 1]
df = df.drop(columns=constant_columns)

In [220]:
# Standard-scaling the continuous features
# A feature counts as categorical when it has exactly CATEGORICAL_FEATURE_COUNT
# distinct values in the training frame; every other feature is continuous.
count = (
    df.drop(columns=["Label"], errors="ignore")
    .nunique()
    .to_frame("Number of Values")
)
is_categorical = count["Number of Values"] == CATEGORICAL_FEATURE_COUNT
categorical = list(count.index[is_categorical])
continuous = list(count.index[~is_categorical])

# Fit the scaler on the training frame only, then apply the SAME transform to
# the test frame. Without the second transform the models fitted later on the
# scaled `df` would be asked to predict on unscaled `test` rows.
scaler = StandardScaler()
scaler.fit(df[continuous])
df[continuous] = scaler.transform(df[continuous])
test[continuous] = scaler.transform(test[continuous])

In [221]:
# Rank the features for the heuristic search: absolute correlation with
# "Label", sorted ascending, with the label's own entry removed.
label_correlation = df.corr().abs().loc["Label"]
corrData = label_correlation.sort_values().drop("Label")
ranked_features = corrData.index.tolist()

In [222]:
# myModel = RandomForestClassifier()
# myModel.fit(df.drop(columns=["Label"]), df["Label"])
# predictions=myModel.predict(pd.DataFrame(test, columns=df.columns).drop(columns=["Label"]))
# print(f1_score(test["Label"], predictions, average="macro"))

In [223]:
def evaluateModel(test, model_of_choice, cross_validation_count=5):
    '''
    Return the mean macro-F1 of `model_of_choice` over contiguous k-fold splits.

    The rows of `test` are cut, in their current order and without shuffling,
    into `cross_validation_count` contiguous blocks. Each block is used once as
    the validation fold while the model is fitted on the remaining rows.

    Parameters:
        test: DataFrame holding the feature columns plus a "Label" column.
            Despite the name, this is the frame that is cross-validated on.
        model_of_choice: estimator exposing `fit(X, y)` and `predict(X)`. The
            same object is re-fitted for every fold, so it is left fitted on
            the last fold when the function returns.
        cross_validation_count: number of folds; must be at least 2.

    Returns:
        The average of the per-fold macro-F1 scores.

    Raises:
        ValueError: if `cross_validation_count` is smaller than 2 (there would
            be no rows left to train on).
    '''
    if cross_validation_count < 2:
        raise ValueError(
            f"cross_validation_count must be at least 2, got {cross_validation_count}"
        )

    # NOTE: the printed count is the number of columns, i.e. it includes "Label".
    print(f"Testing for {len(test.columns)} Features")

    # Separate features and labels once instead of re-dropping "Label" for
    # every slice of every fold.
    features = test.drop(columns=["Label"])
    labels = test["Label"]
    n_rows = len(test)

    score = 0
    for i in range(cross_validation_count):
        # Positional bounds of the validation block for this fold.
        start = i * n_rows // cross_validation_count
        stop = (i + 1) * n_rows // cross_validation_count

        X_train_fold = pd.concat([features.iloc[:start], features.iloc[stop:]])
        y_train_fold = pd.concat([labels.iloc[:start], labels.iloc[stop:]])

        X_test_fold = features.iloc[start:stop]
        y_test_fold = labels.iloc[start:stop]

        myModel = model_of_choice
        myModel.fit(X_train_fold, y_train_fold)
        predictions = myModel.predict(X_test_fold)

        score += f1_score(y_test_fold, predictions, average="macro") / cross_validation_count

    return score

In [224]:
def _fit_and_score_on_test(model, training, testing, features):
    '''
    Fit `model` on `features` of `training` and return its macro-F1 on `testing`.
    '''
    model.fit(training[features], training["Label"])
    predictions = model.predict(testing[features])
    return f1_score(testing["Label"], predictions, average="macro")


def modified_backward_wrapper(training, testing, model_of_choice, ranking, min_features, num_folds, threshold, aggressive=False):
    '''
    Greedy backward feature elimination driven by cross-validated macro-F1.

    Starting from every feature in `ranking`, the features are tried for
    removal in list order. The first feature whose removal is acceptable is
    dropped and the scan restarts on the shortened list. The search stops when
    only `min_features` remain or when a full scan removes nothing.

    Acceptance rule for removing a feature:
        aggressive=False: the CV score without it is >= the best score so far.
        aggressive=True:  the CV score without it is within `threshold` of the
                          best score so far.

    Parameters:
        training: DataFrame with the feature columns and a "Label" column.
        testing: DataFrame with (at least) the same columns, used only to
            report a score for every accepted feature set.
        model_of_choice: estimator with `fit`/`predict`; re-fitted many times.
        ranking: list of feature names giving the order in which removal is tried.
        min_features: stop once this many features are left.
        num_folds: fold count handed to evaluateModel.
        threshold: tolerance used by the aggressive acceptance rule.
        aggressive: selects the acceptance rule (see above).

    Returns:
        (score_data, final_features) where score_data maps the number of
        remaining features to
        [CV score of that step, best CV score so far, feature removed
        ("-" for the initial entry), macro-F1 against `testing`],
        and final_features is the list of surviving feature names.
    '''
    score_data = {}
    final_features = ranking.copy()

    # Baseline with every feature; use the caller's fold count rather than the
    # notebook-level NUM_FOLDS so the argument is actually honoured.
    final_accuracy = evaluateModel(training, model_of_choice, num_folds)
    test_score = _fit_and_score_on_test(model_of_choice, training, testing, final_features)

    score_data[len(final_features)] = [final_accuracy, final_accuracy, "-", test_score]

    while len(final_features) > min_features:
        removed_a_feature = False
        print(f"-------------------Checking for {len(final_features) - 1} Features-------------------")
        for feature in final_features:
            # `feature` is the removal candidate: cross-validate on the
            # remaining features and compare the macro-F1 with the best so far.
            print(f"Testing for importance of {feature}")

            remaining = [name for name in final_features if name != feature]
            score = evaluateModel(training[remaining + ["Label"]], model_of_choice, num_folds)
            print(f"{len(final_features) - 1}: {score} & {final_accuracy}")

            if aggressive:
                removable = abs(score - final_accuracy) < threshold
            else:
                removable = score >= final_accuracy

            if removable:
                final_accuracy = max(score, final_accuracy)
                # Rebind instead of mutating the list that is being iterated.
                final_features = remaining
                removed_a_feature = True

                # Record the score against the test data for this feature set
                # (a float, so later removals cannot alter earlier records).
                test_score = _fit_and_score_on_test(model_of_choice, training, testing, final_features)
                score_data[len(final_features)] = [score, final_accuracy, feature, test_score]
                break

        print(f"{len(final_features)} Features after removal: {final_features}")
        print(f"{score_data[len(final_features)]}")
        if not removed_a_feature:
            # A full scan found nothing removable: the search has converged.
            break

    return score_data, final_features

In [None]:
# Backward elimination with Gaussian Naive Bayes
scoresNB, final_featuresNB = modified_backward_wrapper(
    training=df,
    testing=test,
    model_of_choice=GaussianNB(),
    ranking=ranked_features,
    min_features=MIN_NUM_FEATURES,
    num_folds=NUM_FOLDS,
    threshold=THRESHOLD,
    aggressive=AGGRESSIVE,
)

Testing for 69 Features
-------------------Checking for 67 Features-------------------
Testing for importance of Fwd Pkt Len Min
Testing for 68 Features
67: 0.9999537617884362 & 0.9999537617884362
67 Features after removal: ['Pkt Len Min', 'Bwd Pkt Len Min', 'SYN Flag Cnt', 'Fwd PSH Flags', 'URG Flag Cnt', 'protocol_HOPOPT', 'Bwd IAT Tot', 'Bwd IAT Std', 'Flow IAT Min', 'Fwd IAT Min', 'Bwd IAT Max', 'Active Std', 'Idle Min', 'Fwd IAT Mean', 'Flow IAT Mean', 'Active Min', 'Active Max', 'Flow IAT Max', 'Fwd IAT Max', 'Idle Std', 'Active Mean', 'Idle Mean', 'Flow IAT Std', 'Fwd IAT Std', 'Idle Max', 'Fwd Seg Size Min', 'protocol_TCP', 'protocol_UDP', 'Flow Duration', 'Fwd IAT Tot', 'Bwd Pkt Len Max', 'TotLen Bwd Pkts', 'Subflow Bwd Byts', 'Bwd IAT Mean', 'Subflow Fwd Byts', 'TotLen Fwd Pkts', 'Fwd Header Len', 'Subflow Fwd Pkts', 'Tot Fwd Pkts', 'Fwd Act Data Pkts', 'Fwd Pkts/s', 'Bwd Header Len', 'Flow Pkts/s', 'Bwd Pkt Len Std', 'Down/Up Ratio', 'Subflow Bwd Pkts', 'Tot Bwd Pkts', 'Bwd 

In [None]:
# Backward elimination with a Random Forest.
# `random.choice` takes a single sequence, so `random.choice(5, 10)` raises a
# TypeError; `random.randint(5, 10)` draws the intended integer offset in [5, 10].
# The forest is seeded so that successive CV scores are compared on equal terms.
min_features_rf = MIN_NUM_FEATURES - random.randint(5, 10)
scoresRF, final_featuresRF = modified_backward_wrapper(df, test, RandomForestClassifier(random_state=RANDOM_STATE), ranked_features, min_features_rf, NUM_FOLDS, THRESHOLD, AGGRESSIVE)

In [None]:
# Backward elimination with a linear SVM.
# `random.choice` takes a single sequence, so `random.choice(5, 10)` raises a
# TypeError; `random.randint(5, 10)` draws the intended integer offset in [5, 10].
min_features_svc = MIN_NUM_FEATURES - random.randint(5, 10)
scoresSVC, final_featuresSVC = modified_backward_wrapper(df, test, LinearSVC(dual=False), ranked_features, min_features_svc, NUM_FOLDS, THRESHOLD, AGGRESSIVE)

In [None]:
# Collect the score dictionaries returned by the three backward-elimination
# runs into one table indexed by (Model, Num Features). No variable named
# `scores` is defined anywhere in the notebook, so the per-model results are
# referenced explicitly.
scores_by_model = {
    "GaussianNB": scoresNB,
    "RandomForest": scoresRF,
    "LinearSVC": scoresSVC,
}
final_scores = pd.concat(
    {
        model_name: pd.DataFrame(model_scores).transpose()
        for model_name, model_scores in scores_by_model.items()
    },
    names=["Model", "Num Features"],
)
final_scores.columns = ["A", "B", "C", "D"]
# Intended meaning of the columns, in order:
#final_scores.columns = ["Score for Iteration", "Overall Accuracy", "Feature Removed", "Score against Test Data"]
print(final_scores)

In [None]:
# Display whatever estimator the notebook-level name `myModel` currently holds.
# NOTE(review): within the visible cells the last notebook-level assignment is
# the LinearSVC baseline; the `myModel` inside the helper functions is local
# and does not change this -- confirm this is the model meant to be shown.
myModel