In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
NUM_FOLDS = 10
RANDOM_STATE = 42
TEST_SPLIT_PROPORTION = 0.6
CATEGORICAL_FEATURE_COUNT = 2
POPULATION_SIZE = 200
PARENTS = 100
MUTATION_RATE = 0.1
NUMBER_OF_GENERATIONS = 5

In [None]:
# Getting the Dataset
import pandas as pd
feat_info = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/Feat_info.csv", index_col="Feature Name", squeeze=True)
dataset = pd.read_csv("/content/drive/My Drive/B Tech Project/DataSet 2018/DDoS_2018.csv")
# Re-ordering the Columns to that in Feat_info.csv
dataset = pd.DataFrame(dataset, columns=feat_info.index)

In [None]:
# Categorical Columns need to be handled as well
# They include: Protocol, Dst Port and Label

dataset.loc[dataset["Protocol"] == 17, "Protocol"] = "UDP"
dataset.loc[dataset["Protocol"] == 6, "Protocol"] = "TCP"
dataset.loc[dataset["Protocol"] == 0, "Protocol"] = "HOPOPT"
protocol = pd.get_dummies(dataset["Protocol"], prefix="protocol")
dataset = pd.concat([dataset, protocol], axis=1)
dataset.drop("Protocol", axis=1, inplace=True)

dataset["protocol_HOPOPT"].astype("category")
dataset["protocol_UDP"].astype("category")
dataset["protocol_TCP"].astype("category")

# For Dst Port it is benign for all non-80 values
# So we can make the dataset more balanced by dropping
# the column itself so that the model is not biased on
# the basis of the port
dataset.drop("Dst Port", axis=1, inplace=True)
# Removing Timestamp
dataset.drop(columns=["Timestamp"], inplace=True)

mapping = {"Benign": 0, "DDOS attack-HOIC": 1, "DDOS attack-LOIC-UDP": 2}
dataset["Label"] = dataset["Label"].apply(lambda label: mapping[label])

In [None]:
# Stratification of DataSet
# Feature reduction will be done on the training dataset only
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["Label"]), dataset["Label"], test_size=TEST_SPLIT_PROPORTION, stratify=dataset["Label"], random_state=RANDOM_STATE)

# df -> Training Dataset
# test -> Testing Dataset
df = pd.concat([X_train, y_train], axis=1)
df.reset_index(drop=True, inplace=True)

test = pd.concat([X_test, y_test], axis=1)
test.reset_index(drop=True, inplace=True)

In [None]:
print('Naive Bayes before...')
myModel=GaussianNB()
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

Naive Bayes before...
{'precision': 0.9999644120087142, 'recall': 0.9999502403705236, 'f1-score': 0.999957325271238, 'support': 309825}
[[108234     14      0]
 [     4 200535      0]
 [     0      0   1038]]


In [None]:
print('Random Forest before...')
myModel=RandomForestClassifier()
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

Random Forest before...
{'precision': 0.9999966756590091, 'recall': 0.9999938413026875, 'f1-score': 0.9999952584441124, 'support': 309825}
[[108246      2      0]
 [     0 200539      0]
 [     0      0   1038]]


In [None]:
print('LinearSVC before...')
myModel=LinearSVC(dual=False)
myModel.fit(df.drop(columns=["Label"]), df["Label"])
pred=myModel.predict(test.drop(columns=["Label"]))
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

LinearSVC before...
{'precision': 0.9996392255420911, 'recall': 0.9996080446019743, 'f1-score': 0.9996236306268526, 'support': 309825}
[[108225     22      1]
 [     0 200539      0]
 [     1      0   1037]]


In [None]:
#defining various steps required for the genetic algorithm
def initilization_of_population(size, n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat, dtype=np.bool)
        chromosome[:int(0.3*n_feat)]=False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

def fitness_score(population, model_of_choice, data, test_size=0.3):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["Label"]), df["Label"], test_size=test_size, random_state=RANDOM_STATE)
    scores = []
    for chromosome in population:
        model_of_choice.fit(X_train.iloc[:,chromosome], y_train)
        predictions = model_of_choice.predict(X_test.iloc[:,chromosome])
        scores.append(f1_score(y_test,predictions, average='macro'))
    scores, population = np.array(scores), np.array(population) 
    inds = np.argsort(scores)
    print("yes")
    return list(scores[inds][::-1]), list(population[inds,:][::-1])

def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen

def crossover(pop_after_sel):
    population_nextgen=pop_after_sel
    for i in range(len(pop_after_sel)):
        child=pop_after_sel[i]
        child[3:7]=pop_after_sel[(i+1)%len(pop_after_sel)][3:7]
        population_nextgen.append(child)
    return population_nextgen

def mutation(pop_after_cross,mutation_rate):
    population_nextgen = []
    for i in range(0, len(pop_after_cross)):
        chromosome = pop_after_cross[i]
        for j in range(len(chromosome)):
            if random.random() < mutation_rate:
                chromosome[j]= not chromosome[j]
        population_nextgen.append(chromosome)
    return population_nextgen

def generations(size, n_feat, n_parents, mutation_rate, n_gen, model_of_choice, data):
    best_chromo = []
    best_score = []
    population_nextgen=initilization_of_population(size, n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen, model_of_choice, data)
        pop_after_sel = selection(pop_after_fit, n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo, best_score

In [None]:
# Removing columns with constant values
for column_name in df.columns:
    if df[column_name].nunique() == 1:
        df.drop(column_name, axis=1, inplace=True)

In [None]:
# Getting Continuous features
continuous = []
count = pd.DataFrame(dtype=np.int64, columns=["Number of Values"])
for column_name in df.columns:
    if column_name != "Label":
        count.loc[column_name] = df[column_name].nunique()
        if df[column_name].nunique() != CATEGORICAL_FEATURE_COUNT:
            continuous.append(column_name)

In [None]:
# Standard Scaling all the Continuous Values
scaler = StandardScaler()
scaler.fit(df[continuous])
df[continuous] = scaler.transform(df[continuous])
test[continuous] = scaler.transform(test[continuous])

In [None]:
# Genetic Algorithm: Naive Bayes
logmodel = GaussianNB()
chromo, score = generations(size=POPULATION_SIZE,
                            n_feat=(len(df.columns) - 1),
                            n_parents=PARENTS,
                            mutation_rate=MUTATION_RATE,
                            n_gen=NUMBER_OF_GENERATIONS,
                            data=df,
                            model_of_choice=logmodel)

yes
yes
yes
yes
yes


In [None]:
print("Naive bayes after...")
final_features = df.columns[:-1][chromo[-1]]
print(len(final_features))
print(final_features)

Naive bayes after...
38
Index(['Flow Duration', 'Tot Fwd Pkts', 'TotLen Bwd Pkts', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Mean', 'Fwd IAT Tot',
       'Fwd IAT Min', 'Fwd IAT Std', 'Bwd IAT Mean', 'Bwd IAT Max',
       'Bwd IAT Std', 'Bwd Header Len', 'Bwd Pkts/s', 'Pkt Len Mean',
       'Pkt Len Max', 'Pkt Len Var', 'PSH Flag Cnt', 'ACK Flag Cnt',
       'URG Flag Cnt', 'ECE Flag Cnt', 'Down/Up Ratio', 'Bwd Seg Size Avg',
       'Init Fwd Win Byts', 'Fwd Seg Size Min', 'Active Mean', 'Active Max',
       'Idle Mean', 'Idle Max', 'Idle Min', 'protocol_HOPOPT', 'protocol_TCP',
       'protocol_UDP'],
      dtype='object')


In [None]:
myModel=GaussianNB()
myModel.fit(df.loc[:, final_features], df['Label'])
pred=myModel.predict(test.loc[:, final_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

{'precision': 0.8871033872501334, 'recall': 0.6837324169191733, 'f1-score': 0.6311794621806867, 'support': 309825}
[[  5542 102706      0]
 [     0 200539      0]
 [     0      0   1038]]


In [None]:
# Genetic Algorithm: Random Forest
logmodel = RandomForestClassifier(n_estimators = 50, max_depth=5, n_jobs=-1)
chromo, score = generations(size=POPULATION_SIZE,
                            n_feat=(len(df.columns) - 1),
                            n_parents=PARENTS,
                            mutation_rate=MUTATION_RATE,
                            n_gen=NUMBER_OF_GENERATIONS,
                            data=df,
                            model_of_choice=logmodel)

yes
yes
yes
yes
yes


In [None]:
print("Random Forest after...")
final_features = df.columns[:-1][chromo[-1]]
print(len(final_features))
final_features

Random Forest after...
40


Index(['TotLen Fwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean',
       'Fwd Pkt Len Std', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Std',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Max', 'Fwd IAT Std',
       'Bwd IAT Mean', 'Bwd IAT Tot', 'Bwd IAT Max', 'Bwd IAT Std',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Pkt Len Mean',
       'Pkt Len Var', 'SYN Flag Cnt', 'PSH Flag Cnt', 'URG Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Fwd Pkts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
       'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Seg Size Min',
       'Active Mean', 'Idle Min', 'protocol_HOPOPT', 'protocol_UDP'],
      dtype='object')

In [None]:
myModel=RandomForestClassifier()
myModel.fit(df.loc[:, final_features], df['Label'])
pred=myModel.predict(test.loc[:, final_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

{'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 309825}
[[108248      0      0]
 [     0 200539      0]
 [     0      0   1038]]


In [20]:
# Genetic Algorithm: Linear SVC
logmodel = LinearSVC(dual=False)
chromo, score = generations(size=POPULATION_SIZE,
                            n_feat=(len(df.columns) - 1),
                            n_parents=PARENTS,
                            mutation_rate=MUTATION_RATE,
                            n_gen=NUMBER_OF_GENERATIONS,
                            data=df,
                            model_of_choice=logmodel)



yes




yes




yes




yes




yes


In [21]:
print("LinearSVC after...")
final_features = df.columns[:-1][chromo[-1]]
print(len(final_features))
final_features

LinearSVC after...
38


Index(['Flow Duration', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean',
       'Bwd Pkt Len Min', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Max',
       'Fwd IAT Mean', 'Fwd IAT Tot', 'Fwd IAT Min', 'Bwd IAT Mean',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd Header Len',
       'Pkt Len Mean', 'Pkt Len Max', 'Pkt Len Std', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'ECE Flag Cnt',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Active Max', 'Active Min', 'Active Std',
       'Idle Mean', 'Idle Min', 'Idle Std', 'protocol_HOPOPT', 'protocol_UDP'],
      dtype='object')

In [22]:
myModel = LinearSVC(dual=False)
myModel.fit(df.loc[:, final_features], df['Label'])
pred=myModel.predict(test.loc[:, final_features])
print(classification_report(test["Label"], pred, output_dict=True)["macro avg"])
print(confusion_matrix(test["Label"], pred))

{'precision': 0.9987104366449154, 'recall': 0.9999692065134383, 'f1-score': 0.9993385904178602, 'support': 309825}
[[108238      6      4]
 [     0 200539      0]
 [     0      0   1038]]
