In [None]:
from sklearn import impute, preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import sklearn as skl
from numpy import loadtxt
from collections import Counter

### Load Datasets

In [None]:
# Load the three CICIDS2017 capture files used in this notebook.
# Forward slashes are used as path separators: they work on every platform,
# whereas a backslash inside a normal string literal starts an escape sequence
# ('\F' is an invalid one) and is only a valid separator on Windows.
dados_DDos = pd.read_csv('CICIDS2017_Datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', sep=',')
dados_PortScan = pd.read_csv('CICIDS2017_Datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', sep=',')
dados_Bot = pd.read_csv('CICIDS2017_Datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv', sep=',')

### Delete rows with NaN Values

In [None]:
# Discard, in place, every row that has at least one missing value.
for frame in (dados_DDos, dados_PortScan, dados_Bot):
    frame.dropna(axis=0, how='any', inplace=True)

In [None]:
# Columns removed from every dataset before any further processing.
IDENTIFIER_COLUMNS = ['Flow ID', ' Timestamp', ' Source IP', ' Destination IP']

# Columns that are explicitly converted to floating point.
FLOAT_COLUMNS = [
    ' Bwd Packet Length Std',
    'Flow Bytes/s',
    ' Flow Packets/s',
    ' Flow IAT Std',
    'Fwd Packets/s',
    ' Bwd Packets/s',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
]

for df in dados_DDos,dados_PortScan,dados_Bot:
    df.drop(columns=IDENTIFIER_COLUMNS, inplace=True)

    # The builtin `float` is used because the `np.float` alias is no longer
    # available in current NumPy releases.
    for float_column in FLOAT_COLUMNS:
        df[float_column] = df[float_column].astype(float)

    # Turn +/-inf into NaN and drop the affected rows. Both steps run in place
    # on `df`; chaining `replace(...).dropna(inplace=True)` would only modify
    # the temporary copy returned by `replace` and leave `df` untouched.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(axis=0, inplace=True)

### Multi Column Label Encoder

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, each with its own LabelEncoder.

    Parameters
    ----------
    columns : iterable of column names, optional
        Columns to encode. When None, every column of the frame passed to
        `transform` is encoded.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return self; the encoders are fitted inside `transform`."""
        return self # not relevant here

    def transform(self,X):
        """Return a copy of X in which the selected columns are label-encoded.

        A fresh LabelEncoder is fitted on each column, so the integer codes of
        one column are unrelated to those of another. X itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # `items()` replaces `iteritems()`, which is not available in
            # pandas 2.0 and later.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to `fit(X, y)` followed by `transform(X)`."""
        return self.fit(X,y).transform(X)

In [None]:
# Keep the original class names of each dataset before they are replaced
# by integer codes.
label_frames = (dados_DDos, dados_PortScan, dados_Bot)
class_name_DDos, class_name_PortScan, class_name_Bot = [
    frame[' Label'].unique() for frame in label_frames
]

# Encode the ' Label' column of each dataset with an encoder fitted on that
# dataset's own set of class names.
# NOTE(review): later cells treat 0 as benign and 1 as attack, which depends
# on the order in which LabelEncoder numbers the class names - confirm.
for frame in label_frames:
    input_classes = frame[' Label'].unique()
    label_encoder = preprocessing.LabelEncoder().fit(input_classes)
    frame[' Label'] = label_encoder.transform(frame[' Label'])

### Negative values check

In [None]:
# Count the negative entries of every column that contains at least one.
# The count is only computed, not displayed; add a print of `column_name`
# and `number_of_negatives` inside the `if` to inspect the result.
for df in (dados_DDos, dados_PortScan, dados_Bot):
    for column_name in df.columns:
        negative_mask = df[column_name] < 0
        if negative_mask.any():
            number_of_negatives = np.sum(negative_mask.values.ravel())

### Drop features with a single unique value and the two features with a high number of negatives

In [None]:
# Features removed from all three datasets in this step.
UNINFORMATIVE_COLUMNS = [
    ' Bwd PSH Flags',
    ' Fwd URG Flags',
    ' Bwd URG Flags',
    ' CWE Flag Count',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'Init_Win_bytes_forward',    # many negative values
    ' Init_Win_bytes_backward',  # many negative values
]

for frame in (dados_DDos, dados_PortScan, dados_Bot):
    frame.drop(columns=UNINFORMATIVE_COLUMNS, inplace=True)

# Replace +/-inf with NaN and keep only the rows without any NaN.
dados_DDos, dados_PortScan, dados_Bot = [
    frame.replace([np.inf, -np.inf], np.nan).dropna()
    for frame in (dados_DDos, dados_PortScan, dados_Bot)
]

### Remove all rows with negative values

In [None]:
def _keep_non_negative_rows(frame):
    """Return the rows of `frame` that contain no negative, infinite or missing value."""
    # Entries that fail the `>= 0` test become NaN ...
    masked = frame.where(frame >= 0)
    # ... infinities are turned into NaN as well, then every row holding a NaN is dropped.
    return masked.replace([np.inf, -np.inf], np.nan).dropna(axis=0)


dados_DDos_without_neg = _keep_non_negative_rows(dados_DDos)
dados_PortScan_without_neg = _keep_non_negative_rows(dados_PortScan)
dados_Bot_without_neg = _keep_non_negative_rows(dados_Bot)


### Check the variables correlation

In [None]:
# Absolute pairwise correlations of the DDoS dataset, flattened into a Series
# indexed by (column, column) pairs.
c = dados_DDos_without_neg.corr().abs()
s = c.unstack()
print(s[' Label'])

# Ascending sort; show the 60 entries that precede the final one.
so = s.sort_values(kind="quicksort")
print(so.iloc[-61:-1])

# Collect the correlations in [0.7, 1) without repeating a pair: (a, b) is
# skipped when (b, a) has already been stored.
# `items()` replaces `iteritems()`, which is not available in pandas 2.0 and later.
filtered_correlation = {}
for (a, b), value in so.items():
    if value >= 0.7 and value < 1:
        already_stored = b in filtered_correlation and a in filtered_correlation[b]
        if not already_stored:
            filtered_correlation.setdefault(a, {})[b] = value

# Print the retained pairs as tab-separated lines.
for first in filtered_correlation:
    for second in filtered_correlation[first]:
        print(first + '\t' + second + '\t' + str(filtered_correlation[first][second]))

### For each pair of variables that is more than 70% correlated, remove one of the two variables

In [None]:
# ---- Remove the variables that are more than 70% correlated ----

# Columns dropped from all three datasets.
CORRELATED_COLUMNS_ALL = [
    ' Total Backward Packets',
    ' Total Length of Bwd Packets',
    ' Bwd Packet Length Std',
    ' Fwd Packet Length Mean',
    ' Flow IAT Std',
    'Fwd IAT Total',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Fwd Header Length.1',
    'Subflow Fwd Packets',
    ' Subflow Bwd Packets',
    ' Subflow Bwd Bytes',
    ' Active Max',
    ' Active Min',
    'Idle Mean',
    ' Idle Min',
    ' Idle Max',
    ' Bwd IAT Mean',
    ' Fwd IAT Mean',
    ' Destination Port',
    'Bwd Packet Length Max',
    ' Avg Fwd Segment Size',
    ' Bwd Packet Length Min',
    ' Bwd Header Length',
    ' Max Packet Length',
    ' PSH Flag Count',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Fwd Packet Length Max',
    ' Fwd IAT Std',
    ' Min Packet Length',
    ' Flow Duration',
    ' Flow IAT Min',
    ' Flow IAT Max',
    ' Bwd IAT Min',
    'Fwd Packets/s',
    ' ECE Flag Count',
    ' Subflow Fwd Bytes',
    ' SYN Flag Count',
    ' Average Packet Size',
    ' Packet Length Mean',
    ' Bwd Packet Length Mean',
    ' Fwd Packet Length Std',
]

# Additional columns dropped from a single dataset only.
CORRELATED_COLUMNS_DDOS = [' act_data_pkt_fwd', ' Fwd IAT Max', ' Total Fwd Packets']
CORRELATED_COLUMNS_PORTSCAN = [
    ' act_data_pkt_fwd',
    ' Flow IAT Mean',
    ' Fwd Header Length',
    ' Fwd Packet Length Min',
]
CORRELATED_COLUMNS_BOT = [' Flow IAT Mean', ' Total Fwd Packets']

for frame in (dados_DDos_without_neg, dados_PortScan_without_neg, dados_Bot_without_neg):
    frame.drop(columns=CORRELATED_COLUMNS_ALL, inplace=True)

dados_DDos_without_neg.drop(columns=CORRELATED_COLUMNS_DDOS, inplace=True)
dados_PortScan_without_neg.drop(columns=CORRELATED_COLUMNS_PORTSCAN, inplace=True)
dados_Bot_without_neg.drop(columns=CORRELATED_COLUMNS_BOT, inplace=True)

### Log transformation

In [None]:
# A column is log-transformed when its maximum exceeds this value.
LOG_TRANSFORM_THRESHOLD = 10000

for frame in [dados_DDos_without_neg,dados_PortScan_without_neg,dados_Bot_without_neg]:
    for column_name in frame.columns:
        # Stored as `column_max` so the builtin `max` is not overwritten for
        # the rest of the notebook.
        column_max = frame[column_name].max()
        if column_max > LOG_TRANSFORM_THRESHOLD:
            # log(1 + x) maps 0 to 0 instead of -inf.
            frame[column_name] = np.log(1 + frame[column_name])

# Remove any row in which the transformation left an infinite or missing value.
dados_DDos_without_neg = dados_DDos_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
dados_PortScan_without_neg = dados_PortScan_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
dados_Bot_without_neg = dados_Bot_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

### Normalisation

In [None]:
def _scale_columns_by_max(frame):
    """Return a new DataFrame in which each column of `frame` is max-normalised.

    The scaling is delegated to `preprocessing.normalize(..., norm='max', axis=0)`;
    the result gets the column names of `frame` and a fresh default index.
    """
    scaled_values = preprocessing.normalize(frame, norm='max', axis=0)
    return pd.DataFrame(scaled_values, columns=frame.columns)


# Rows with negative values were removed earlier in the notebook, so the
# scaled features are intended to lie in [0, 1].
features_normalized_DDoS = _scale_columns_by_max(dados_DDos_without_neg)
features_normalized_PortScan = _scale_columns_by_max(dados_PortScan_without_neg)
features_normalized_bot = _scale_columns_by_max(dados_Bot_without_neg)

### Remove duplicated rows

In [None]:
# For each dataset, `duplicateRowsDF` holds the rows flagged as repeats of an
# earlier row (kept for inspection only); the dataset itself is then reduced
# to its distinct rows.
duplicateRowsDF = features_normalized_DDoS.loc[features_normalized_DDoS.duplicated()]
features_normalized_DDoS = features_normalized_DDoS.drop_duplicates()

duplicateRowsDF = features_normalized_PortScan.loc[features_normalized_PortScan.duplicated()]
features_normalized_PortScan = features_normalized_PortScan.drop_duplicates()

duplicateRowsDF = features_normalized_bot.loc[features_normalized_bot.duplicated()]
features_normalized_bot = features_normalized_bot.drop_duplicates()

### Shuffle

In [None]:
# Shuffle the rows of each dataset. A fixed `random_state` makes the row order,
# and therefore the train/test split derived from it, identical on every run.
features_normalized_DDoS = shuffle(features_normalized_DDoS, random_state=42)
features_normalized_PortScan = shuffle(features_normalized_PortScan, random_state=42)
features_normalized_bot = shuffle(features_normalized_bot, random_state=42)

### Class separation

In [None]:
# Boolean row masks selecting label 0 (`*_benign`) and label 1 (`*_attack`)
# in each dataset.
ddos_labels = features_normalized_DDoS[' Label']
DDoS_benign = ddos_labels.eq(0)
DDoS_attack = ddos_labels.eq(1)

portscan_labels = features_normalized_PortScan[' Label']
PortScan_benign = portscan_labels.eq(0)
PortScan_attack = portscan_labels.eq(1)

bot_labels = features_normalized_bot[' Label']
Botnet_benign = bot_labels.eq(0)
Botnet_attack = bot_labels.eq(1)


### 66-33 Train-test split

In [None]:
def _split_in_three(frame):
    """Split `frame` row-wise into three consecutive DataFrames.

    Chunk sizes follow the `np.array_split` convention: when the number of rows
    is not a multiple of three, the leading chunks receive one extra row each.
    Slicing with `iloc` keeps the pieces as DataFrames and avoids passing a
    DataFrame through `np.array_split`, which relies on the deprecated
    `DataFrame.swapaxes`.
    """
    base_size, remainder = divmod(len(frame), 3)
    chunks = []
    start = 0
    for position in range(3):
        stop = start + base_size + (1 if position < remainder else 0)
        chunks.append(frame.iloc[start:stop])
        start = stop
    return chunks


# Per class and per dataset: the first two thirds are training data, the last
# third is test data.
DDoS_Benign_train1, DDoS_Benign_train2, DDoS_Benign_test = _split_in_three(features_normalized_DDoS[DDoS_benign])
DDoS_Attack_train1, DDoS_Attack_train2, DDoS_attack_test = _split_in_three(features_normalized_DDoS[DDoS_attack])

PortScan_Benign_train1, PortScan_Benign_train2, PortScan_Benign_test = _split_in_three(features_normalized_PortScan[PortScan_benign])
PortScan_Attack_train1, PortScan_Attack_train2, PortScan_Attack_test = _split_in_three(features_normalized_PortScan[PortScan_attack])

Bot_Benign_train1, Bot_Benign_train2, Bot_Bening_test = _split_in_three(features_normalized_bot[Botnet_benign])
Bot_Attack_train1, Bot_Attack_train2, Bot_Attack_test = _split_in_three(features_normalized_bot[Botnet_attack])

In [None]:
# Join the benign and attack parts of every train and test set, then shuffle.
# A fixed `random_state` makes the row order of the saved files reproducible.

DDoS_train = pd.concat([DDoS_Benign_train1, DDoS_Benign_train2, DDoS_Attack_train1, DDoS_Attack_train2])
DDoS_test = pd.concat([DDoS_Benign_test, DDoS_attack_test])
DDoS_train = shuffle(DDoS_train, random_state=42)
DDoS_test = shuffle(DDoS_test, random_state=42)

PortScan_train = pd.concat([PortScan_Benign_train1, PortScan_Benign_train2, PortScan_Attack_train1, PortScan_Attack_train2])
PortScan_test = pd.concat([PortScan_Benign_test, PortScan_Attack_test])
PortScan_train = shuffle(PortScan_train, random_state=42)
PortScan_test = shuffle(PortScan_test, random_state=42)

Bot_train = pd.concat([Bot_Benign_train1, Bot_Benign_train2, Bot_Attack_train1, Bot_Attack_train2])
Bot_test = pd.concat([Bot_Bening_test, Bot_Attack_test])
Bot_train = shuffle(Bot_train, random_state=42)
Bot_test = shuffle(Bot_test, random_state=42)

In [None]:
# Write every train and test set to its own CSV file, without the index column.
csv_exports = [
    (DDoS_train, r'Train_test_data/DDoS_train.csv'),
    (DDoS_test, r'Train_test_data/DDoS_test.csv'),
    (PortScan_train, r'Train_test_data/PortScan_train.csv'),
    (PortScan_test, r'Train_test_data/PortScan_test.csv'),
    (Bot_train, r'Train_test_data/Bot_train.csv'),
    (Bot_test, r'Train_test_data/Bot_test.csv'),
]

for split_frame, csv_path in csv_exports:
    split_frame.to_csv(csv_path, index=False)

### Data for the GAN models

In [None]:
# Training data for the GAN: the attack rows of the Bot training split only.
GAN_Bot = pd.concat([Bot_Attack_train1, Bot_Attack_train2])
# A fixed `random_state` keeps the row order of the exported file reproducible.
GAN_Bot = shuffle(GAN_Bot, random_state=42)
# A forward slash is used as separator: a backslash is only a path separator on
# Windows and would otherwise become part of the file name.
GAN_Bot.to_csv('GAN_generation/Bot_Attack_Data.csv', index=False)

### See the amount of data needed for GAN generation

In [None]:
# Number of rows per label value in the Bot training set.
bot_train_labels = Bot_train[' Label']
counter = Counter(bot_train_labels)
print(counter)