In [None]:
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sb
import sklearn as skl
import os
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

### Load datasets

In [None]:
# Directory (relative to the notebook) that holds the CICIDS2017 CSV exports.
DATA_DIR = 'CICIDS2017_Datasets'


def _dataset_path(filename):
    """Return the path of ``filename`` inside ``DATA_DIR`` using the OS separator.

    The paths were previously written with literal backslashes, which only
    resolve on Windows and rely on invalid escape sequences such as ``'\\F'``
    that newer Python versions warn about.
    """
    return os.path.join(DATA_DIR, filename)


# One DataFrame per capture file. Each variable is named after the traffic
# category the notebook associates with that file.
dados_DDos = pd.read_csv(_dataset_path('Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'), sep=',')
dados_PortScan = pd.read_csv(_dataset_path('Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'), sep=',')
dados_Bot = pd.read_csv(_dataset_path('Friday-WorkingHours-Morning.pcap_ISCX.csv'), sep=',')
dados_Benign = pd.read_csv(_dataset_path('Monday-WorkingHours.pcap_ISCX.csv'), sep=',')
dados_Infiltration = pd.read_csv(_dataset_path('Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv'), sep=',')
# This file is read with ';' as delimiter and Latin-1 encoding, unlike the others.
dados_WebAttacks = pd.read_csv(_dataset_path('Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'), sep=';', encoding='latin-1')
dados_BruteForce = pd.read_csv(_dataset_path('Tuesday-WorkingHours.pcap_ISCX.csv'), sep=',')
dados_Dos = pd.read_csv(_dataset_path('Wednesday-workingHours.pcap_ISCX.csv'), sep=',')

### Remove NaN values

In [None]:
# Columns that are explicitly cast to floating point after loading.
FLOAT_COLUMNS = [
    ' Bwd Packet Length Std',
    'Flow Bytes/s',
    ' Flow Packets/s',
    ' Flow IAT Std',
    'Fwd Packets/s',
    ' Bwd Packets/s',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
]

# Drop every row containing a missing value, then print the per-column null
# counts as a sanity check (expected to be empty after the drop).
for i in [dados_DDos,dados_PortScan,dados_Bot,dados_Benign,dados_Infiltration,dados_WebAttacks,dados_BruteForce,dados_Dos]:
    i.dropna(how='any', inplace=True)
    colNames = i.columns
    null_columns = colNames[i.isnull().any()]
    print(i[null_columns].isnull().sum())

for df in dados_DDos,dados_PortScan,dados_Bot,dados_Benign,dados_Infiltration,dados_WebAttacks,dados_BruteForce,dados_Dos:

    # Remove the identifier and timestamp columns.
    df.drop(['Flow ID'], axis=1, inplace=True)
    df.drop([' Timestamp'], axis=1, inplace=True)

    # ``np.float`` was only an alias of the builtin ``float`` and was removed in
    # NumPy 1.24; the builtin produces the same float64 dtype.
    for column in FLOAT_COLUMNS:
        df[column] = df[column].astype(float)

    # Turn +/-inf into NaN and drop those rows *on df itself*. The previous
    # chained form ran dropna(inplace=True) on the temporary copy returned by
    # replace(), so df was never modified.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(axis=0, inplace=True)

### Label encoding

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns with independent ``LabelEncoder``s.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` so it can be used as a
    pipeline step. No encoder state is kept between calls: every call to
    ``transform`` fits a fresh ``LabelEncoder`` on each column it encodes.
    """

    def __init__(self, columns=None):
        """Store the column names to encode (``None`` means every column)."""
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return ``self``; encoders are fitted in ``transform``."""
        return self  # not relevant here

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. ``X`` itself is left unmodified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields
            # the same (column name, Series) pairs.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

In [None]:
# Single-step pipeline that label-encodes the class label and both IP columns.
encoding_pipeline = Pipeline(
    [
        (
            'encoding',
            MultiColumnLabelEncoder(columns=[' Label', ' Destination IP', ' Source IP']),
        ),
    ]
)


def _encode_and_clean(frame):
    """Label-encode ``frame`` and drop every row holding an infinite or NaN value.

    The pipeline is re-fitted on each frame, so the integer codes are specific
    to the frame that is passed in.
    """
    encoded = encoding_pipeline.fit_transform(frame)
    without_inf = encoded.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna(axis=0)


dados_DDos = _encode_and_clean(dados_DDos)
dados_PortScan = _encode_and_clean(dados_PortScan)
dados_Bot = _encode_and_clean(dados_Bot)
dados_Benign = _encode_and_clean(dados_Benign)
dados_Infiltration = _encode_and_clean(dados_Infiltration)
dados_WebAttacks = _encode_and_clean(dados_WebAttacks)
dados_BruteForce = _encode_and_clean(dados_BruteForce)
dados_Dos = _encode_and_clean(dados_Dos)

# Rename this column in the WebAttacks frame from the '_1' suffix to '.1'.
# NOTE(review): presumably this aligns it with the other frames - confirm.
dados_WebAttacks.rename(
    columns={' Fwd Header Length_1': ' Fwd Header Length.1'},
    inplace=True,
)

### Build the exploratory graphical models

In [None]:
def _new_page_figure():
    """Create an empty 15x20 inch figure with wide spacing between subplots."""
    fig = plt.figure()
    fig.set_figheight(20)
    fig.set_figwidth(15)
    fig.subplots_adjust(hspace=1, wspace=1)
    return fig


def make_graphics(df, directory_name, close_figures=False):
    """Save box plot / histogram / scatter pages for the columns of ``df``.

    Every column except the first (index 0) gets one row of three plots: a box
    plot, a 20-bin histogram with the mean marked by a dashed line, and a
    scatter plot of the column against ``' Label'``. Rows are grouped eight per
    page. Full pages are written to ``directory_name`` as
    ``visualization<page>.png`` (page numbers start at 1) and the final page as
    ``visualization_last.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain a ``' Label'`` column.
    directory_name : str
        Output directory. It is created if missing and reused if it already
        exists, so the cell can be re-run.
    close_figures : bool, default False
        When True, each full page is closed right after it is saved to keep
        memory bounded. The final page is always left open so it remains the
        current pyplot figure for the caller.
    """
    colNames = df.columns
    max_size = 8  # plot rows (features) per page

    # os.mkdir raised FileExistsError on every re-run of the notebook.
    os.makedirs(directory_name, exist_ok=True)

    fig = _new_page_figure()
    j = 0  # rows already drawn on the current page
    page = 1  # number used in the file name of the next full page
    for i in range(1, len(colNames)):
        if j == max_size:
            # The page number used to be str(i/j); true division produced
            # names such as 'visualization1.125.png'.
            fig.savefig(
                os.path.join(directory_name, 'visualization' + str(page) + '.png'),
                bbox_inches='tight',
            )
            if close_figures:
                plt.close(fig)

            # create a new fig obj
            fig = _new_page_figure()
            page += 1
            j = 0

        column = colNames[i]
        values = df[column]

        ax = fig.add_subplot(max_size, 3, 3 * (j + 1) - 2)
        ax.boxplot(x=values, showmeans=True)
        ax.set_title(column)

        ax = fig.add_subplot(max_size, 3, 3 * (j + 1) - 1)
        ax.hist(x=values, bins=20)
        ax.axvline(values.mean(), color='k', linestyle='dashed', linewidth=1)
        ax.set_title(column)

        ax = fig.add_subplot(max_size, 3, 3 * (j + 1))
        ax.scatter(x=values, y=df[' Label'])
        ax.set_ylabel(' Label')
        ax.set_title(column)

        j += 1

    if j > 0:
        fig.savefig(
            os.path.join(directory_name, 'visualization_last.png'),
            bbox_inches='tight',
        )


In [None]:
# Output directory for each dataset, in the same order as the frames below.
directories = [
    "dados_DDos",
    "dados_PortScan",
    "dados_Bot",
    "dados_Benign",
    "dados_Infiltration",
    "dados_WebAttacks",
    "dados_BruteForce",
    "dados_Dos",
]
data_list_obj = enumerate(
    [
        dados_DDos,
        dados_PortScan,
        dados_Bot,
        dados_Benign,
        dados_Infiltration,
        dados_WebAttacks,
        dados_BruteForce,
        dados_Dos,
    ]
)

# Build the plot pages for every dataset, then also save whatever figure is
# current afterwards into the working directory under the dataset's index.
for idx, df in data_list_obj:
    target_directory = directories[idx]
    make_graphics(df, target_directory)
    plt.savefig(f'visualization{idx}.png')

### Remove single-valued (constant) features

In [None]:
# Columns removed from every dataset in this step, in the order they are dropped.
single_value_columns = [
    ' Bwd PSH Flags',
    ' Fwd URG Flags',
    ' Bwd URG Flags',
    ' CWE Flag Count',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
]

for idx, i in enumerate([
    dados_DDos,
    dados_PortScan,
    dados_Bot,
    dados_Benign,
    dados_Infiltration,
    dados_WebAttacks,
    dados_BruteForce,
    dados_Dos,
]):
    # Drop one column at a time, in place, so each frame object is mutated.
    for column_name in single_value_columns:
        i.drop([column_name], axis=1, inplace=True)

### Correlation matrix

In [None]:
datasets = [
    dados_DDos,
    dados_PortScan,
    dados_Bot,
    dados_Benign,
    dados_Infiltration,
    dados_WebAttacks,
    dados_BruteForce,
    dados_Dos,
]

# Draw one 20x20 inch correlation heatmap per dataset, with the colour scale
# fixed to [-1, 1].
for i in datasets:
    fig_dims = (20, 20)
    correlations = i.corr()
    fig, ax = plt.subplots(figsize=fig_dims)
    correlation_matrix = sb.heatmap(
        correlations,
        xticklabels=correlations.columns,
        yticklabels=correlations.columns,
        ax=ax,
        cmap="PuBu",
        vmin=-1,
        vmax=1,
    )
    # Saving each heatmap to disk is disabled; enabling it needs a per-dataset
    # index for the file name, e.g. "correlation_matrix<idx>.png".