In [None]:
from sklearn import impute, preprocessing, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import sklearn as skl
from collections import Counter

### Load Datasets 

In [None]:
# Load the three CICIDS2017 capture files used by this notebook.
# Forward slashes are used as path separators: they are accepted on every
# platform and avoid the invalid escape sequences ("\T", "\W") that
# backslashes produce inside non-raw string literals.
# The WebAttacks file is read with ';' as separator and Latin-1 encoding;
# the other two files are read as ','-separated with the default encoding.
dados_WebAttacks = pd.read_csv('CICIDS2017_Datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', sep=';', encoding='latin-1')
dados_BruteForce = pd.read_csv('CICIDS2017_Datasets/Tuesday-WorkingHours.pcap_ISCX.csv', sep=',')
dados_Dos = pd.read_csv('CICIDS2017_Datasets/Wednesday-workingHours.pcap_ISCX.csv', sep=',')

### Delete rows with NaN Values

In [None]:
# Discard, in place, every row that contains at least one missing value.
for frame in (dados_WebAttacks, dados_BruteForce, dados_Dos):
    frame.dropna(axis=0, how='any', inplace=True)

In [None]:
# Columns removed from every dataset before any further processing.
IDENTIFIER_COLUMNS = ['Flow ID', ' Timestamp', ' Source IP', ' Destination IP']

# Columns that are explicitly cast to float.
FLOAT_COLUMNS = [
    ' Bwd Packet Length Std',
    'Flow Bytes/s',
    ' Flow Packets/s',
    ' Flow IAT Std',
    'Fwd Packets/s',
    ' Bwd Packets/s',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
]

for df in dados_WebAttacks, dados_BruteForce, dados_Dos:
    df.drop(columns=IDENTIFIER_COLUMNS, inplace=True)

    for column in FLOAT_COLUMNS:
        # The builtin ``float`` is used because the ``np.float`` alias was
        # removed in NumPy 1.24; both denote the same 64-bit float dtype.
        df[column] = df[column].astype(float)

    # Turn +/-inf into NaN and drop the affected rows *in place*.
    # The previous chained form (``df.replace(...).dropna(inplace=True)``)
    # modified a temporary copy and therefore left ``df`` untouched.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(axis=0, inplace=True)

### Multi Column Label Encoder

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with a fresh
    ``LabelEncoder``; the fitted encoders are not kept on the instance.
    """

    def __init__(self,columns = None):
        """Store the column selection.

        Parameters
        ----------
        columns : iterable of column names, optional
            Columns to encode. When ``None`` every column is encoded.
        """
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Do nothing and return ``self`` (encoders are fitted in ``transform``)."""
        return self # not relevant here

    def transform(self,X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        ``X`` itself is left unmodified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # ``DataFrame.items()`` replaces ``iteritems()``, which was
            # removed in pandas 2.0.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X,y).transform(X)

In [None]:
# Replace the ' Label' column of each dataset with integer codes.
# A separate encoder is fitted per dataset, on that dataset's distinct labels,
# so the same integer can stand for different classes in different datasets.
for frame in (dados_WebAttacks, dados_BruteForce, dados_Dos):
    input_classes = frame[' Label'].unique()
    label_encoder = preprocessing.LabelEncoder().fit(input_classes)
    frame[' Label'] = label_encoder.transform(frame[' Label'])

### Negative values check

In [None]:
# Count the negative entries of every column that has any.
# The count is computed but not displayed: the reporting line is commented out.
for frame in (dados_WebAttacks, dados_BruteForce, dados_Dos):
    for column_name in frame.columns:
        negative_mask = frame[column_name] < 0
        if not negative_mask.any():
            continue
        number_of_negatives = negative_mask.to_numpy().sum()
        #print(column_name + ' : ' + str(number_of_negatives))

### Drop features with a single unique value and the two features with a high number of negative values

In [None]:
# Features removed from all three datasets (per the section heading: features
# with a single unique value, plus two features with many negative values).
features_to_drop = [
    ' Bwd PSH Flags',
    ' Fwd URG Flags',
    ' Bwd URG Flags',
    ' CWE Flag Count',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'Init_Win_bytes_forward',    # many negative values
    ' Init_Win_bytes_backward',  # many negative values
]
for frame in (dados_WebAttacks, dados_BruteForce, dados_Dos):
    frame.drop(columns=features_to_drop, inplace=True)

# The WebAttacks file names this column ' Fwd Header Length_1'; rename it to
# ' Fwd Header Length.1' so that it matches the other two datasets.
dados_WebAttacks.rename(columns={' Fwd Header Length_1':' Fwd Header Length.1'}, inplace=True)

# Rebind each name to a new frame in which +/-inf became NaN and every row
# holding a NaN was removed.
dados_WebAttacks, dados_BruteForce, dados_Dos = (
    frame.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    for frame in (dados_WebAttacks, dados_BruteForce, dados_Dos)
)

### Remove all rows with negative values

In [None]:
def _rows_without_negatives(frame):
    """Return a new frame holding only the rows of ``frame`` with no negative value.

    Negative cells are first masked to NaN, infinities are mapped to NaN as
    well, and finally every row that contains a NaN is dropped.
    """
    masked = frame.where(frame >= 0)
    return masked.replace([np.inf, -np.inf], np.nan).dropna(axis=0)


dados_WebAttacks_without_neg = _rows_without_negatives(dados_WebAttacks)
dados_BruteForce_without_neg = _rows_without_negatives(dados_BruteForce)
dados_Dos_without_neg = _rows_without_negatives(dados_Dos)

### Check the variables correlation

In [None]:
# Absolute pairwise correlations of the DoS dataset, flattened to a Series
# indexed by (feature_a, feature_b) and sorted in ascending order.
c = dados_Dos_without_neg.corr().abs()
s = c.unstack()
#print(s[' Label'])
so = s.sort_values(kind="quicksort")
#print(so[-61:-1])

# Collect the correlations in [0.7, 1) without repetitions: a pair already
# stored as (b, a) is not stored again as (a, b).
filtered_correlation = {}
# ``Series.items()`` replaces ``iteritems()``, which was removed in pandas 2.0.
for (a, b), value in so.items():
    if 0.7 <= value < 1:
        if not (b in filtered_correlation and a in filtered_correlation[b]):
            filtered_correlation.setdefault(a, {})[b] = value

# Print one tab-separated line per retained pair: feature, feature, correlation.
for i in filtered_correlation:
    for j in filtered_correlation[i]:
        print(i + '\t' + j + '\t' + str(filtered_correlation[i][j]))

### For each pair of variables with a correlation above 70%, remove one of the two variables

In [None]:
# Features dropped from all three datasets.
shared_correlated_features = [
    ' Total Backward Packets',
    ' Total Length of Bwd Packets',
    ' Bwd Packet Length Std',
    ' Fwd Packet Length Mean',
    ' Flow IAT Std',
    'Fwd IAT Total',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Fwd Header Length.1',
    'Subflow Fwd Packets',
    ' Subflow Bwd Packets',
    ' Subflow Bwd Bytes',
    ' Active Max',
    ' Active Min',
    'Idle Mean',
    ' Idle Min',
    ' Idle Max',
    ' Bwd IAT Mean',
    ' Fwd IAT Mean',
    ' Destination Port',
    'Bwd Packet Length Max',
    ' Avg Fwd Segment Size',
    ' Bwd Packet Length Min',
    ' Bwd Header Length',
    ' Max Packet Length',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Fwd Packet Length Max',
    ' Fwd IAT Std',
    ' Min Packet Length',
    ' Flow Duration',
    ' Flow IAT Min',
    ' Flow IAT Max',
    ' Bwd IAT Min',
    'Fwd Packets/s',
    ' ECE Flag Count',
    ' Subflow Fwd Bytes',
    ' SYN Flag Count',
    ' Average Packet Size',
    ' Packet Length Mean',
    ' Bwd Packet Length Mean',
    ' Fwd Packet Length Std',
]

# Additional features dropped from one dataset only, paired with that dataset.
dataset_specific_drops = [
    (dados_WebAttacks_without_neg, [' act_data_pkt_fwd', ' Total Fwd Packets']),
    (dados_BruteForce_without_neg, [' Flow IAT Mean', ' Total Fwd Packets']),
    (dados_Dos_without_neg, [' Avg Bwd Segment Size', ' Total Fwd Packets', ' Fwd Header Length']),
]

# All drops are performed in place, shared features first.
for frame, _ in dataset_specific_drops:
    frame.drop(columns=shared_correlated_features, inplace=True)

for frame, extra_features in dataset_specific_drops:
    frame.drop(columns=extra_features, inplace=True)



### Log transformation

In [None]:
# Columns whose maximum exceeds this value are replaced by log(1 + x).
LOG_TRANSFORM_THRESHOLD = 10000

for frame in [dados_WebAttacks_without_neg, dados_BruteForce_without_neg, dados_Dos_without_neg]:
    for column in frame.columns:
        # Named ``column_max`` rather than ``max`` so the builtin ``max`` is
        # not shadowed for the rest of the kernel session.
        column_max = frame[column].max()
        if column_max > LOG_TRANSFORM_THRESHOLD:
            frame[column] = np.log(1 + frame[column])

# Drop any row in which a non-finite or missing value is present after the transform.
dados_WebAttacks_without_neg = dados_WebAttacks_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
dados_BruteForce_without_neg = dados_BruteForce_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
dados_Dos_without_neg = dados_Dos_without_neg.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

### Normalisation

In [None]:
def _scale_all_but_last_column(frame):
    """Max-normalise, in place, every column of ``frame`` except the last one.

    Each scaled column is divided by its maximum absolute value
    (``norm='max'`` along ``axis=0``). Returns the index of scaled column names.
    NOTE(review): the last column is presumably ' Label' - confirm.
    """
    feature_columns = frame.columns[:-1]
    scaled_values = preprocessing.normalize(frame[feature_columns], norm='max', axis=0)
    frame[feature_columns] = scaled_values
    return feature_columns


# These assignments alias the "_without_neg" frames (no copy is made), so the
# scaling below is visible through both names.
dados_WebAttacks_normalized = dados_WebAttacks_without_neg
dados_BruteForce_normalized = dados_BruteForce_without_neg
dados_Dos_normalized = dados_Dos_without_neg

Web_columns = _scale_all_but_last_column(dados_WebAttacks_normalized)
BrutForce_columns = _scale_all_but_last_column(dados_BruteForce_normalized)
Dos_columns = _scale_all_but_last_column(dados_Dos_normalized)



### Remove duplicated rows

In [None]:
# Rebind each normalised dataset to a copy without fully duplicated rows.
(dados_WebAttacks_normalized,
 dados_BruteForce_normalized,
 dados_Dos_normalized) = (
    frame.drop_duplicates()
    for frame in (dados_WebAttacks_normalized,
                  dados_BruteForce_normalized,
                  dados_Dos_normalized)
)

### Shuffle

In [None]:
# Fixed seed: the row order produced here decides which rows end up in the
# train and test thirds, so it must be reproducible across kernel restarts.
SHUFFLE_SEED = 42

dados_WebAttacks_normalized = shuffle(dados_WebAttacks_normalized, random_state=SHUFFLE_SEED)
dados_BruteForce_normalized = shuffle(dados_BruteForce_normalized, random_state=SHUFFLE_SEED)
dados_Dos_normalized = shuffle(dados_Dos_normalized, random_state=SHUFFLE_SEED)

### Classes count

In [None]:
# Print how many rows each encoded label has, one Counter per dataset.
for frame in (dados_WebAttacks_normalized,
              dados_BruteForce_normalized,
              dados_Dos_normalized):
    counter = Counter(frame[' Label'])
    print(counter)

### Class separation

In [None]:
# Boolean row masks, one per encoded class, for each dataset.
web_labels = dados_WebAttacks_normalized[' Label']
Web_Attack_Benign = web_labels.eq(0)
Web_Attack_Brute_Force = web_labels.eq(1)
Web_Attack_SQL_Injection = web_labels.eq(2)
Web_Attack_XSS = web_labels.eq(3)
# The SQL Injection class will not be used, so the XSS rows are relabelled
# from 3 to 2. The masks above were computed before this change and still
# select the original rows; the SQL Injection rows themselves keep label 2.
dados_WebAttacks_normalized.loc[Web_Attack_XSS,' Label']=2

brute_force_labels = dados_BruteForce_normalized[' Label']
Brute_Force_Benign = brute_force_labels.eq(0)
Brute_Force_FTP_Patator = brute_force_labels.eq(1)
Brute_Force_SSH_Patator = brute_force_labels.eq(2)

dos_labels = dados_Dos_normalized[' Label']
Dos_Benign = dos_labels.eq(0)
Dos_GoldenEye = dos_labels.eq(1)
DoS_Hulk = dos_labels.eq(2)
DoS_Slowhttptest = dos_labels.eq(3)
DoS_Slowlori = dos_labels.eq(4)
Heartbleed = dos_labels.eq(5)

### 66-33 Train-test split

In [None]:
def _split_in_thirds(frame):
    """Split ``frame`` row-wise into three consecutive DataFrames.

    The chunk sizes are the ones ``np.array_split(frame, 3)`` would produce
    (the first ``len(frame) % 3`` chunks get one extra row). Plain ``iloc``
    slicing is used because ``np.array_split`` on a DataFrame goes through
    ``DataFrame.swapaxes``, which recent pandas versions deprecate.

    Returns a tuple ``(first, second, third)``.
    """
    base, extra = divmod(len(frame), 3)
    first_end = base + (1 if extra > 0 else 0)
    second_end = first_end + base + (1 if extra > 1 else 0)
    return frame.iloc[:first_end], frame.iloc[first_end:second_end], frame.iloc[second_end:]


# For every class: the first two thirds are training parts, the last third is the test part.
WebAttack_Benign_train1, WebAttack_Benign_train2, WebAttack_Benign_test = _split_in_thirds(dados_WebAttacks_normalized[Web_Attack_Benign])
WebAttack_Brute_Force_train1, WebAttack_Brute_Force_train2, WebAttack_Brute_Force_test = _split_in_thirds(dados_WebAttacks_normalized[Web_Attack_Brute_Force])
Web_Attack_XSS_train1, Web_Attack_XSS_train2, Web_Attack_XSS_test = _split_in_thirds(dados_WebAttacks_normalized[Web_Attack_XSS])

Brute_Force_Benign_train1, Brute_Force_Benign_train2, Brute_Force_Benign_test = _split_in_thirds(dados_BruteForce_normalized[Brute_Force_Benign])
Brute_Force_FTP_Patator_train1, Brute_Force_FTP_Patator_train2, Brute_Force_FTP_Patator_test = _split_in_thirds(dados_BruteForce_normalized[Brute_Force_FTP_Patator])
Brute_Force_SSH_Patator_train1, Brute_Force_SSH_Patator_train2, Brute_Force_SSH_Patator_test = _split_in_thirds(dados_BruteForce_normalized[Brute_Force_SSH_Patator])

Dos_Benign_train1, Dos_Benign_train2, Dos_Benign_test = _split_in_thirds(dados_Dos_normalized[Dos_Benign])
Dos_GoldenEye_train1, Dos_GoldenEye_train2, Dos_GoldenEye_test = _split_in_thirds(dados_Dos_normalized[Dos_GoldenEye])
DoS_Hulk_train1, DoS_Hulk_train2, DoS_Hulk_test = _split_in_thirds(dados_Dos_normalized[DoS_Hulk])
DoS_Slowhttptest_train1, DoS_Slowhttptest_train2, DoS_Slowhttptest_test = _split_in_thirds(dados_Dos_normalized[DoS_Slowhttptest])
DoS_Slowlori_train1, DoS_Slowlori_train2, DoS_Slowlori_test = _split_in_thirds(dados_Dos_normalized[DoS_Slowlori])

In [None]:
# Join the per-class train and test parts of each dataset and shuffle the result.

def _concat_and_shuffle(parts, seed=42):
    """Concatenate ``parts`` row-wise and return the rows in shuffled order.

    ``seed`` is passed as ``random_state`` so that the resulting row order is
    identical on every run of the notebook.
    """
    return shuffle(pd.concat(parts), random_state=seed)


WebAttack_train = _concat_and_shuffle([
    WebAttack_Benign_train1, WebAttack_Benign_train2,
    WebAttack_Brute_Force_train1, WebAttack_Brute_Force_train2,
    Web_Attack_XSS_train1, Web_Attack_XSS_train2,
])
WebAttack_test = _concat_and_shuffle([
    WebAttack_Benign_test, WebAttack_Brute_Force_test, Web_Attack_XSS_test,
])

Brute_Force_train = _concat_and_shuffle([
    Brute_Force_Benign_train1, Brute_Force_Benign_train2,
    Brute_Force_FTP_Patator_train1, Brute_Force_FTP_Patator_train2,
    Brute_Force_SSH_Patator_train1, Brute_Force_SSH_Patator_train2,
])
Brute_Force_test = _concat_and_shuffle([
    Brute_Force_Benign_test, Brute_Force_FTP_Patator_test, Brute_Force_SSH_Patator_test,
])

Dos_train = _concat_and_shuffle([
    Dos_Benign_train1, Dos_Benign_train2,
    Dos_GoldenEye_train1, Dos_GoldenEye_train2,
    DoS_Hulk_train1, DoS_Hulk_train2,
    DoS_Slowhttptest_train1, DoS_Slowhttptest_train2,
    DoS_Slowlori_train1, DoS_Slowlori_train2,
])
Dos_test = _concat_and_shuffle([
    Dos_Benign_test, Dos_GoldenEye_test, DoS_Hulk_test,
    DoS_Slowhttptest_test, DoS_Slowlori_test,
])

### Save train and test data in csv files

In [None]:
# Each train/test frame paired with the CSV file it is written to.
train_test_exports = [
    (WebAttack_train, r'Train_test_data/WebAttack_train.csv'),
    (WebAttack_test, r'Train_test_data/WebAttack_test.csv'),
    (Brute_Force_train, r'Train_test_data/Brute_Force_train.csv'),
    (Brute_Force_test, r'Train_test_data/Brute_Force_test.csv'),
    (Dos_train, r'Train_test_data/Dos_train.csv'),
    (Dos_test, r'Train_test_data/Dos_test.csv'),
]

# Write every frame without its index column.
for frame, destination in train_test_exports:
    frame.to_csv(destination, index=False)

### Data for the GAN models

In [None]:
def _export_gan_samples(parts, path, seed=42):
    """Concatenate ``parts``, shuffle the rows, write them to ``path`` and return them.

    Parameters
    ----------
    parts : list of DataFrame
        The two training parts of a single attack class.
    path : str
        Destination CSV file; it is written without the index column.
    seed : int, optional
        Passed as ``random_state`` so the row order is the same on every run.
    """
    samples = shuffle(pd.concat(parts), random_state=seed)
    samples.to_csv(path, index = False)
    return samples


# Forward slashes are used as path separators, like the train/test export
# paths elsewhere in this notebook; they also work outside Windows.
GAN_WebAttack_Brute_Force = _export_gan_samples(
    [WebAttack_Brute_Force_train1, WebAttack_Brute_Force_train2],
    'GAN_generation/Web_Attack_Brute_Force_Data.csv')

GAN_Web_Attack_XSS = _export_gan_samples(
    [Web_Attack_XSS_train1, Web_Attack_XSS_train2],
    'GAN_generation/Web_Attack_XSS_Data.csv')

GAN_Brute_Force_FTP_Patator = _export_gan_samples(
    [Brute_Force_FTP_Patator_train1, Brute_Force_FTP_Patator_train2],
    'GAN_generation/Brute_Force_FTP_Patator_Data.csv')

GAN_Brute_Force_SSH_Patator = _export_gan_samples(
    [Brute_Force_SSH_Patator_train1, Brute_Force_SSH_Patator_train2],
    'GAN_generation/Brute_Force_SSH_Patator_Data.csv')

GAN_Dos_GoldenEye = _export_gan_samples(
    [Dos_GoldenEye_train1, Dos_GoldenEye_train2],
    'GAN_generation/Dos_GoldenEye_Data.csv')

GAN_DoS_Hulk = _export_gan_samples(
    [DoS_Hulk_train1, DoS_Hulk_train2],
    'GAN_generation/DoS_Hulk_Data.csv')

GAN_DoS_Slowhttptest = _export_gan_samples(
    [DoS_Slowhttptest_train1, DoS_Slowhttptest_train2],
    'GAN_generation/DoS_Slowhttptest_Data.csv')

GAN_DoS_Slowlori = _export_gan_samples(
    [DoS_Slowlori_train1, DoS_Slowlori_train2],
    'GAN_generation/DoS_Slowlori_Data.csv')

### See the amount of data needed for GAN generation

In [None]:
# Print the number of training rows per encoded label, one Counter per dataset.
for train_frame in (WebAttack_train, Brute_Force_train, Dos_train):
    counter = Counter(train_frame[' Label'])
    print(counter)

In [None]:
# Display the (rows, columns) shape of the Web Attack Brute Force data prepared for the GAN.
GAN_WebAttack_Brute_Force.shape