In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Imports
import pandas as pd
import numpy as np
import random
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [58]:
# Defining Globals
NUM_FOLDS = 10
RANDOM_STATE = 42
TEST_SPLIT_PROPORTION = 0.6
CATEGORICAL_FEATURE_COUNT = 2
POPULATION_SIZE = 20
MIN_FEATURES_INITIAL_POP=10
MAX_FEATURES_INITIAL_POP=27

In [59]:
# Getting the Dataset
import pandas as pd
feat_info = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/Feat_info.csv", index_col="Feature Name", squeeze=True)
df = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/DDoS_2018.csv")
# Re-ordering the Columns to that in Feat_info.csv
df = pd.DataFrame(df, columns=feat_info.index)

In [60]:
# Categorical Columns need to be handled as well
# They include: Protocol, Dst Port and Label

df.loc[df["Protocol"] == 17, "Protocol"] = "UDP"
df.loc[df["Protocol"] == 6, "Protocol"] = "TCP"
df.loc[df["Protocol"] == 0, "Protocol"] = "HOPOPT"
protocol = pd.get_dummies(df["Protocol"], prefix="protocol")
df = pd.concat([df, protocol], axis=1)
df.drop("Protocol", axis=1, inplace=True)

df["protocol_HOPOPT"].astype("category")
df["protocol_UDP"].astype("category")
df["protocol_TCP"].astype("category")

# For Dst Port it is benign for all non-80 values
# So we can make the dataset more balanced by dropping
# the column itself so that the model is not biased on
# the basis of the port
df.drop("Dst Port", axis=1, inplace=True)
# Removing Timestamp
df.drop(columns=["Timestamp"], inplace=True)

mapping = {"Benign": 0, "DDOS attack-HOIC": 1, "DDOS attack-LOIC-UDP": 2}
df["Label"] = df["Label"].apply(lambda label: mapping[label])

In [61]:
# Stratification of DataSet
# Feature reduction will be done on the training dataset only
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["Label"]), df["Label"], test_size=TEST_SPLIT_PROPORTION, stratify=df["Label"], random_state=RANDOM_STATE)

# df -> Training Dataset
# test -> Testing Dataset
df = pd.concat([X_train, y_train], axis=1)
df.reset_index(drop=True, inplace=True)

test = pd.concat([X_test, y_test], axis=1)
test.reset_index(drop=True, inplace=True)

In [62]:
print('Naive Bayes before Feature Selection')
myModel=GaussianNB()
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

Naive Bayes before Feature Selection
{'precision': 0.9999644120087142, 'recall': 0.9999502403705236, 'f1-score': 0.999957325271238, 'support': 309825}
[[108234     14      0]
 [     4 200535      0]
 [     0      0   1038]]


In [63]:
print('Random Forest before Feature Selection')
myModel=RandomForestClassifier()
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

Random Forest before Feature Selection
{'precision': 0.9999966756590091, 'recall': 0.9999938413026875, 'f1-score': 0.9999952584441124, 'support': 309825}
[[108246      2      0]
 [     0 200539      0]
 [     0      0   1038]]


In [64]:
print('LinearSVC before Feature Selection')
myModel=LinearSVC(dual=False)
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

LinearSVC before Feature Selection
{'precision': 0.9996392255420911, 'recall': 0.9996080446019743, 'f1-score': 0.9996236306268526, 'support': 309825}
[[108225     22      1]
 [     0 200539      0]
 [     1      0   1037]]


In [65]:
# Getting all categorical features
continuous = []
count = pd.DataFrame(dtype=np.int64, columns=["Number of Values"])
for column_name in df.columns:
    if column_name != "Label":
        count.loc[column_name] = df[column_name].nunique()
        if df[column_name].nunique() != CATEGORICAL_FEATURE_COUNT:
            continuous.append(column_name)

In [66]:
# Standard Scaling all the Continuous Values
scaler = StandardScaler()
scaler.fit(df[continuous])
df[continuous] = scaler.transform(df[continuous])
test[continuous] = scaler.transform(test[continuous])

In [67]:
# Removing columns with constant values
for column_name in df.columns:
    if df[column_name].nunique() == 1:
        df.drop(column_name, axis=1, inplace=True)

In [68]:
def arreq_in_list(myarr, list_arrays):
    return next((True for elem in list_arrays if np.array_equal(elem, myarr)), False)

In [69]:
def fitness_eval(X, y, model_of_choice):
    score = 0
    for j in range(NUM_FOLDS):
        X_train_numfolds = pd.concat([X[:j * len(X) // NUM_FOLDS], X[((j + 1) * len(X)) // NUM_FOLDS:]])
        X_test_numfolds = X[j * len(X) // NUM_FOLDS: (j + 1) * len(X) // NUM_FOLDS]

        y_train_numfolds = pd.concat([y[:j * len(y) // NUM_FOLDS], y[(j + 1) * len(y) // NUM_FOLDS:]])
        y_test_numfolds = y[j * len(y) // NUM_FOLDS: (j + 1) * len(y) // NUM_FOLDS]
        
        myModel.fit(X_train_numfolds, y_train_numfolds)
        y_pred = myModel.predict(X_test_numfolds)
        score += f1_score(y_test_numfolds, y_pred, average='macro') / NUM_FOLDS

    return score

In [70]:
def cuttlefish_algo(N, min_features, max_features, model_of_choice, df, iterations):
    population = []
    X, y = df.drop(columns=['Label']), df['Label'] 
    tot_features = len(X.columns)
    
    # selected features -> random selection
    new_row = np.array(random.sample(range(0, tot_features-1), k=random.randint(min_features, max_features)))
    for _ in range(N):
        population.append(new_row)
        new_row = np.array(random.sample(range(0, tot_features-1), k=random.randint(min_features, max_features)))
        while arreq_in_list(new_row, population):
            new_row = np.array(random.sample(range(0, tot_features-1), k=random.randint(min_features, max_features)))

        
    fitness = []
    for p in population:
        fitness.append(fitness_eval(X.iloc[:, p], y, model_of_choice))
        
    index_of_best = fitness.index(max(fitness))
    AVbestsubset = population[index_of_best]
    tmp = random.randint(1, len(AVbestsubset) - 1)
    bestSubset = np.concatenate((AVbestsubset[:tmp], AVbestsubset[tmp+1:]))

    stopping_iterations = iterations
    while stopping_iterations != 0:
        stopping_iterations -= 1
            
        # case 1 And case 2
            
        # sorting population based on fitness
        fitness_data = pd.DataFrame(data=np.array(fitness), columns=['fitness'])
        fitness_data.sort_values(by=['fitness'], ascending=False, inplace=True)
        
        population = list(map(population.__getitem__, fitness_data.index))
        k = random.randint(0, N//2)
        
        for i in range(k):
            # reflection array
            R = random.randint(0, len(population[i])-1)
            reflection = population[i][random.sample(range(0, len(population[i])), R)]
            
            # visibility array
            V = len(population[i]) - R
            visibility = []
            while len(visibility) != V:
                tmp = random.randint(0, tot_features-1)
                if tmp not in population[i]:
                    visibility.append(tmp)
            visibility = np.array(visibility)
            
            # new Subset
            newSubset = np.union1d(reflection, visibility)
            
            # evaluating accuracy on new subset
            accuracy_newSubset = fitness_eval(X.iloc[:, newSubset], y, model_of_choice)
            
            # evaluating accuracy on new subset
            accuracy_AVbestsubset = fitness_eval(X.iloc[:, AVbestsubset], y, model_of_choice)
            
            if accuracy_newSubset > accuracy_AVbestsubset:
                AVbestsubset = newSubset
            
        # case 3 and 4
        t = 10
        for i in range(t):
            # exchanging one feature from bestsubset.selected and bestsubset.unselected
            removed_selected_feature = bestSubset[random.randint(0, len(bestSubset)-1)]
            newSubset = np.delete(bestSubset, np.where(bestSubset == removed_selected_feature))
            while True:
                tmp = random.randint(0, tot_features-1)
                if tmp not in bestSubset and tmp != removed_selected_feature:
                    bestSubset = np.append(bestSubset, tmp)
                    break
            # evaluationg accuracy on new bestsubset
            accuracy_bestsubset = fitness_eval(X.iloc[:, bestSubset], y, model_of_choice)
            accuracy_newSubset = fitness_eval(X.iloc[:, newSubset], y, model_of_choice)
            # comparing accuracy for bestsubset and newSubset
            if accuracy_newSubset > accuracy_bestsubset:
                bestSubset = newSubset           
        
        # case 5
        for i in range(len(AVbestsubset)):
            newSubset = np.delete(AVbestsubset, i)
            accuracy_newSubset = fitness_eval(X.iloc[:, newSubset], y, model_of_choice)
            accuracy_bestsubset = fitness_eval(X.iloc[:, bestSubset], y, model_of_choice)
            if accuracy_newSubset > accuracy_bestsubset:
                bestSubset = newSubset
        
        
        # case 6
        for i in range(k, N):
            newSubset = random.sample(range(0, tot_features-1), len(population[i]))
            accuracy_newSubset = fitness_eval(X.iloc[:, newSubset], y, model_of_choice)
            accuracy_pi = fitness_eval(X.iloc[:, population[i]], y, model_of_choice)
            
            if accuracy_newSubset > accuracy_pi:
                population[i] = newSubset
            accuracy_AVbestsubset = fitness_eval(X.iloc[:, AVbestsubset], y, model_of_choice)
            
            if accuracy_newSubset > accuracy_AVbestsubset:
                AVbestsubset = population[i]

    return bestSubset

In [71]:
# CuttleFish for Naive Bayes
myModel=GaussianNB()
selected_features=cuttlefish_algo(N=POPULATION_SIZE,
                                  min_features=MIN_FEATURES_INITIAL_POP,
                                  max_features=MAX_FEATURES_INITIAL_POP,
                                  model_of_choice=myModel,
                                  df=df,
                                  iterations=1)
selected_features=list(df.columns[selected_features])
print(selected_features)

['TotLen Bwd Pkts', 'Flow Pkts/s', 'Flow IAT Mean', 'Bwd IAT Max', 'Fwd Header Len', 'Bwd Header Len', 'Bwd Pkts/s', 'Pkt Len Std', 'SYN Flag Cnt', 'PSH Flag Cnt', 'URG Flag Cnt', 'ECE Flag Cnt', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Fwd Seg Size Min', 'Active Max', 'protocol_HOPOPT']


In [72]:
myModel=GaussianNB()
myModel.fit(df.loc[:, selected_features], df["Label"])
pred=myModel.predict(test.loc[:, selected_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

{'precision': 0.9999667595736583, 'recall': 0.9999384130268766, 'f1-score': 0.9999525826265524, 'support': 309825}
[[108228     20      0]
 [     0 200539      0]
 [     0      0   1038]]


In [None]:
# CuttleFish for Random Forest
myModel=RandomForestClassifier()
selected_features=cuttlefish_algo(N=POPULATION_SIZE,
                                  min_features=MIN_FEATURES_INITIAL_POP,
                                  max_features=MAX_FEATURES_INITIAL_POP,
                                  model_of_choice=myModel,
                                  df=df,
                                  iterations=1)
selected_features=list(df.columns[selected_features])
print(selected_features)

In [None]:
myModel=RandomForestClassifier()
myModel.fit(df.loc[:, selected_features], df["Label"])
pred=myModel.predict(test.loc[:, selected_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

In [None]:
# CuttleFish for Linear SVC
myModel=LinearSVC(dual=False)
selected_features=cuttlefish_algo(N=POPULATION_SIZE,
                                  min_features=MIN_FEATURES_INITIAL_POP,
                                  max_features=MAX_FEATURES_INITIAL_POP,
                                  model_of_choice=myModel,
                                  df=df,
                                  iterations=1)
selected_features=list(df.columns[selected_features])
print(selected_features)

In [None]:
myModel=LinearSVC(dual=False)
myModel.fit(df.loc[:, selected_features], df["Label"])
pred=myModel.predict(test.loc[:, selected_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))