<a href="https://colab.research.google.com/github/fwangliberty/AIoTDesign-Frontend/blob/master/CNN1D_78_features_split_50_50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intrusion Detection using CNN1N for CICIDS 2017 Data Set with 78 Features 

We use the pre-processing dataset from mlp4nids (Multi-layer perceptron for network intrusion detection) https://github.com/ArnaudRosay/mlp4nids. The train, test and validation sets have connection-based new features. The data splitting rate is 50:25:25

In [1]:
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def display_metrics(y_test, y_pred, label_names):
  print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

  print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
  print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
  print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

  print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
  print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
  print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

  print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
  print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
  print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

  print('\nClassification Report\n')
  print(classification_report(y_test, y_pred, target_names=label_names))

In [3]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 100): 
        print(df)

In [4]:
def make_value2index(attacks):
    #make dictionary
    attacks = sorted(attacks)
    d = {}
    counter=0
    for attack in attacks:
        d[attack] = counter
        counter+=1
    return d

In [5]:
# chganges label from string to integer/index
def encode_label(Y_str):
    labels_d = make_value2index(np.unique(Y_str))
    Y = [labels_d[y_str] for y_str  in Y_str]
    Y = np.array(Y)
    return np.array(Y)

## Step 1. Loading csv files

In [7]:
# All columns
col_names = np.array(['dst sport count', 'src dport count', 'dst src count', 'dport count', 'sport count', 'dst host count','src host count','Source Port', 'Destination Port',
                      'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets',
                      'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
                      'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std',
                      'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total',
                      'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
                      'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
                      'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
                      'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
                      'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size',
                      'Avg Bwd Segment Size','Subflow Fwd Packets', 'Subflow Fwd Bytes',
                      'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
                      'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
                      'Idle Std', 'Idle Max', 'Idle Min', 'Label'])

### Option 1. Connect to Google Drive

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# load three csv files generated by mlp4nids (Multi-layer perceptron for network intrusion detection )
# first load the train set
df_train = pd.read_csv('/content/drive/My Drive/CICIDS2017/train_set_ext78.csv',names=col_names, skiprows=1)  

In [10]:
print('Train set size: ', df_train.shape)

Train set size:  (556548, 79)


In [11]:
df_test = pd.read_csv('/content/drive/My Drive/CICIDS2017/test_set_ext78.csv',names=col_names, skiprows=1)  
print('Test set size: ', df_test.shape)

df_val = pd.read_csv('/content/drive/My Drive/CICIDS2017/crossval_set_ext78.csv',names=col_names, skiprows=1)  
print('Validation set size: ', df_val.shape)

Test set size:  (278271, 79)
Validation set size:  (278271, 79)


### Option 2. Load from local machine

In [None]:
dataroot = '../data/cicids2017clean/train_set_ext.csv'
df_train = pd.read_csv(dataroot, names=col_names, skiprows=1) 

In [None]:
dataroot = '../data/cicids2017clean/crossval_set_ext.csv'
df_val = pd.read_csv(dataroot, names=col_names, skiprows=1) 
dataroot = '../data/cicids2017clean/test_set_ext.csv'
df_test = pd.read_csv(dataroot, names=col_names, skiprows=1) 

## Step 2. Exploring the dataset

In [12]:
df_train.head()

Unnamed: 0,dst sport count,src dport count,dst src count,dport count,sport count,dst host count,src host count,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
480307,1,36,36,48,1,48,81,56014,53,17,60843,2,2,94,226,47,47,47.0,0.0,113,113,113.0,0.0,5259.438226,65.742978,20281.0,35121.66,60836,3,3,3.0,0.0,3,3,4,4.0,0.0,4,4,0,0,40,40,32.871489,32.871489,47,113,73.4,36.149689,1306.8,0,0,0,0,0,0,0,0,1,91.75,47.0,113.0,2,94,2,226,-1,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2273904,1,96,96,96,1,96,96,33606,80,6,2,2,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,1000000.0,2.0,0.0,2,2,2,2.0,0.0,2,2,0,0.0,0.0,0,0,0,0,64,0,1000000.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,2,0,0,0,251,-1,0,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
2173332,1,1,1,26,1,1,1,54288,80,6,44,1,1,8,0,8,8,8.0,0.0,0,0,0.0,0.0,181818.1818,45454.54545,44.0,0.0,44,44,0,0.0,0.0,0,0,0,0.0,0.0,0,0,1,0,32,32,22727.27273,22727.27273,0,8,5.333333,4.618802,21.333333,0,1,0,0,1,0,0,0,1,8.0,8.0,0.0,1,8,1,0,229,235,0,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS slowloris
2267207,1,100,100,100,1,100,100,50768,80,6,2,2,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,1000000.0,2.0,0.0,2,2,2,2.0,0.0,2,2,0,0.0,0.0,0,0,0,0,64,0,1000000.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,2,0,0,0,251,-1,0,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
90817,1,100,100,100,1,100,100,62083,80,6,3795333,4,0,24,0,6,6,6.0,0.0,0,0,0.0,0.0,6.323556,1.053926,1265111.0,2190632.0,3794635,348,3795333,1265111.0,2190632.043,3794635,348,0,0.0,0.0,0,0,0,0,80,0,1.053926,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,7.5,6.0,0.0,4,24,0,0,256,-1,3,20,0.0,0.0,0,0,0.0,0.0,0,0,DDoS


Count the number of attacks

In [13]:
df_train['Label'].value_counts()

BENIGN                        278274
DoS Hulk                      115062
PortScan                       79402
DDoS                           64012
DoS GoldenEye                   5146
FTP-Patator                     3967
SSH-Patator                     2948
DoS slowloris                   2898
DoS Slowhttptest                2749
Bot                              978
Web Attack � Brute Force         753
Web Attack � XSS                 326
Infiltration                      18
Web Attack � Sql Injection        10
Heartbleed                         5
Name: Label, dtype: int64

In [None]:
df_train.describe()

Unnamed: 0,dst host count,src host count,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,...,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0,556548.0
mean,63.986114,71.911368,41059.562686,7182.692884,8.35862,21599310.0,7.561386,8.259167,682.2941,12659.55,...,4.883591,25.540766,108236.1,32426.6,157184.9,88653.28,16406460.0,1206008.0,17305000.0,15526210.0
std,40.704131,36.408934,20762.257722,16999.053361,4.518634,38246650.0,641.917381,865.356666,8207.379,1934124.0,...,612.739185,6.418384,716470.0,356700.6,958873.9,666846.9,32835310.0,7260774.0,33900420.0,32592790.0
min,1.0,1.0,0.0,0.0,0.0,-13.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,42.0,33830.0,80.0,6.0,112.0,1.0,1.0,6.0,6.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,96.0,97.0,49512.0,80.0,6.0,68674.5,2.0,2.0,56.0,112.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,100.0,100.0,57000.0,1034.0,6.0,11043320.0,6.0,6.0,338.0,11595.0,...,2.0,32.0,953.0,0.0,953.0,882.0,9553880.0,0.0,9803576.0,7462839.0
max,100.0,100.0,65535.0,65532.0,17.0,119999900.0,207964.0,284602.0,2866110.0,627000000.0,...,198636.0,60.0,110000000.0,70500000.0,110000000.0,110000000.0,120000000.0,74200000.0,120000000.0,120000000.0


Read test and validation sets

In [None]:
print('Test set: ')
df_test['Label'].value_counts()

Test set: 


BENIGN                        139135
DoS Hulk                       57531
PortScan                       39701
DDoS                           32006
DoS GoldenEye                   2573
FTP-Patator                     1983
SSH-Patator                     1474
DoS slowloris                   1449
DoS Slowhttptest                1374
Bot                              489
Web Attack � Brute Force         376
Web Attack � XSS                 163
Infiltration                       9
Web Attack � Sql Injection         5
Heartbleed                         3
Name: Label, dtype: int64

In [None]:
df_test.describe()

Unnamed: 0,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0
mean,45364.90575,5840.63364,7.480749,20184940.0,13.425781,15.071391,501.816,27770.75,185.079276,9.858127,42.498337,62.543191,1437.918892,16.702825,472.989076,598.916973,1319690.0,91530.9,2091054.0,4928267.0,15411550.0,211900.1,19817160.0,4173283.0,5569102.0,15224870.0,1619357.0,10586950.0,2418274.0,2028897.0,6305801.0,1360565.0,0.055008,0.0,318.3946,338.412,85411.6,6162.424,7.706957,1472.16314,241.332367,466.822599,897032.5,0.058659,0.055008,7.9e-05,0.42342,0.383936,0.024473,0.0,7.9e-05,0.515485,266.229446,42.498337,472.989076,13.425781,501.816,15.071391,27770.75,10570.92108,1731.26747,10.076965,26.450577,99502.58,43576.01,163586.3,72657.77,13899760.0,931034.2,14611830.0,13197460.0
std,17986.051427,15731.041102,3.759347,37476720.0,1208.778536,1591.037162,7898.116,3604963.0,392.614677,61.235977,99.578231,132.298567,2586.48709,46.983524,797.823718,1144.520735,23653540.0,293954.0,5446991.0,10464060.0,31565390.0,3035605.0,37418610.0,11780490.0,12628960.0,31610730.0,10665330.0,29133230.0,10451820.0,7731527.0,20466300.0,9753000.0,0.227996,0.0,25247.65,31824.66,288988.5,35115.81,17.933308,2584.335069,383.113699,824.111902,2290032.0,0.234986,0.227996,0.008891,0.494102,0.486344,0.154512,0.0,0.008891,0.551013,420.649091,99.578231,797.823718,1208.778536,7898.116,1591.037162,3604963.0,17760.016938,7434.284715,1153.408495,6.864564,742396.2,438225.0,1087195.0,666408.7,30552650.0,6557272.0,31588120.0,30246800.0
min,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-1.0,0.0,-1.0,-13.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39814.0,80.0,6.0,80.0,2.0,1.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,41.87093,0.7809336,71.0,0.0,78.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,0.5793178,0.008503405,0.0,6.0,3.333333,2.309401,5.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,2.0,2.0,1.0,0.0,251.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50634.0,80.0,6.0,85383.0,3.0,1.0,37.0,61.0,30.0,0.0,10.90048,0.0,48.0,0.0,41.0,0.0,1179.98,60.26881,26946.9,10098.84,54893.5,12.0,7013.0,3503.5,335.1686,6663.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,32.0,32.05693,0.7639334,0.0,56.0,48.0,18.622567,346.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.75,10.90048,41.0,3.0,37.0,1.0,61.0,304.0,0.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,58142.0,443.0,6.0,9936814.0,7.0,6.0,356.0,4532.0,321.0,6.0,54.333333,111.538871,1460.0,6.0,542.958333,663.236171,74001.25,26315.79,1557685.0,2991012.0,8012042.0,72.0,8004861.0,2007233.0,2359285.0,6955281.0,145.0,217463.8,44523.51,55015.63,148199.8,45.0,0.0,0.0,172.0,152.0,13888.89,61.20382,6.0,1460.0,298.055556,533.916876,285067.2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,315.055556,54.333333,542.958333,7.0,356.0,6.0,4532.0,8192.0,235.0,3.0,32.0,5.0,0.0,5.0,5.0,5853280.0,0.0,5860882.0,5499876.0
max,65534.0,65534.0,17.0,119999900.0,219759.0,291922.0,1323378.0,655453000.0,23360.0,2065.0,4638.923469,7125.596846,17376.0,2146.0,3884.924556,6715.738331,2071000000.0,3000000.0,114392600.0,84781720.0,119994600.0,114392600.0,119999800.0,119961000.0,84602930.0,119994800.0,119961000.0,119999600.0,119974100.0,84418010.0,119974100.0,119974100.0,1.0,0.0,4644908.0,5838440.0,3000000.0,2000000.0,389.0,23360.0,1877.272727,4414.547151,19488230.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,7.0,2068.0,4638.923469,3884.924556,219759.0,1323378.0,291922.0,655453000.0,65535.0,65535.0,213557.0,93.0,101659700.0,64349500.0,101659700.0,101659700.0,119994600.0,73532390.0,119994600.0,119994600.0


In [None]:
print('Validation set: ')
df_val['Label'].value_counts()

Validation set: 


BENIGN                        139135
DoS Hulk                       57531
PortScan                       39701
DDoS                           32006
DoS GoldenEye                   2573
FTP-Patator                     1983
SSH-Patator                     1474
DoS slowloris                   1449
DoS Slowhttptest                1374
Bot                              489
Web Attack � Brute Force         376
Web Attack � XSS                 163
Infiltration                       9
Web Attack � Sql Injection         5
Heartbleed                         3
Name: Label, dtype: int64

In [None]:
df_val.describe()

Unnamed: 0,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0,278270.0
mean,42404.657796,4584.284134,8.772361,22677200.0,5.697168,5.401071,363.328418,6993.901,146.271053,12.216883,37.118951,48.506259,1644.006185,31.044985,545.020851,677.301293,1026168.0,76478.36,2035292.0,5660128.0,18679830.0,117620.9,22429470.0,4097971.0,7275898.0,18575770.0,853211.7,10332440.0,2108780.0,2497470.0,6852268.0,877710.6,0.032285,0.0,-18535.84,-6184.786,69229.16,7292.921,11.819104,1667.675854,265.849753,521.021331,1032910.0,0.063622,0.032285,0.000111,0.325008,0.360617,0.052697,0.0,0.000111,0.635189,293.529747,37.118951,545.020851,5.697168,363.328418,5.401071,6994.36,8300.353962,1196.878449,2.850606,-6302.925,101166.3,38665.28,158403.2,77729.19,17583770.0,933741.7,18297550.0,16894050.0
std,18921.321337,13260.493797,4.780477,39575180.0,67.142466,87.166823,3246.913595,258556.5,380.776699,29.589643,80.881268,118.838474,2798.340693,63.404307,851.938455,1229.566919,18488960.0,273385.5,4353752.0,10816190.0,35326090.0,1940212.0,39554070.0,9992215.0,14501080.0,35365830.0,7979563.0,29286910.0,9212534.0,9080364.0,22681320.0,8098570.0,0.176757,0.0,4255039.0,856332.5,268176.6,35408.18,21.282926,2802.004249,395.033818,872.649085,2337799.0,0.244078,0.176757,0.010554,0.468379,0.480181,0.223428,0.0,0.010554,0.543604,431.308583,80.881268,851.938455,67.142466,3246.913595,87.166823,258608.2,13668.065695,6710.083319,32.991321,728696.9,840451.5,448735.6,1167019.0,758208.7,34554260.0,6638199.0,35404210.0,34373310.0
min,0.0,0.0,0.0,-12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-12.0,0.0,-12.0,-12.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1929350000.0,-167770500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-83885310.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35454.0,53.0,6.0,87.0,1.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,6.0,0.0,6.0,0.0,119.7319,0.6190867,62.0,0.0,85.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,0.4824028,0.06077324,0.0,6.0,2.0,3.464102,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,1.0,0.0,1.0,6.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,49688.0,80.0,6.0,48520.0,2.0,2.0,56.0,138.0,32.0,0.0,30.0,0.0,87.0,0.0,79.0,0.0,3111.79,78.89413,19778.95,10620.92,45088.0,4.0,76.0,69.0,0.0,74.0,3.0,3.0,3.0,0.0,3.0,2.0,0.0,0.0,64.0,40.0,39.94567,7.588463,0.0,94.0,59.6,28.481573,811.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.5,30.0,79.0,2.0,56.0,2.0,138.0,256.0,0.0,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,55812.0,443.0,17.0,11812030.0,6.0,5.0,317.0,6899.0,231.0,25.0,49.0,79.203964,2052.0,46.0,702.75,757.42027,122449.0,25974.03,1982858.0,3728053.0,9999176.0,55.0,10700000.0,2714097.0,3740416.0,10000000.0,48.0,152490.8,31879.14,56627.72,136444.8,46.0,0.0,0.0,164.0,132.0,13333.33,8474.576,23.0,2313.0,384.862121,682.325913,465568.7,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,408.011858,49.0,702.75,6.0,317.0,5.0,6899.0,8192.0,235.0,2.0,32.0,652.0,0.0,652.0,504.75,9604032.0,0.0,9992129.0,7454226.0
max,65535.0,65389.0,17.0,120000000.0,16412.0,20326.0,624920.0,74900000.0,24820.0,2325.0,5177.25641,5199.042702,15928.0,1983.0,5800.5,8194.660487,2070000000.0,3000000.0,119000000.0,84800000.0,120000000.0,119000000.0,120000000.0,120000000.0,83700000.0,120000000.0,120000000.0,120000000.0,120000000.0,83400000.0,120000000.0,120000000.0,1.0,0.0,533580.0,650440.0,3000000.0,2000000.0,1306.0,24820.0,2265.586207,4731.522394,22400000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,8.0,2328.0,5177.25641,5800.5,16412.0,624920.0,20326.0,74870240.0,65535.0,65535.0,6697.0,56.0,106000000.0,50400000.0,106000000.0,106000000.0,120000000.0,76600000.0,120000000.0,120000000.0


## Step 3. Encode Datasets

Encoding the labels, and generate numpy array. Note that the label has not been encoded as one-hot coding. We will use one-hot code later. 

### Step 3.1 Encoding train dataset

In [14]:
df_label = df_train['Label']
data = df_train.drop(columns=['Label'])
Xtrain = data.values
y_train = encode_label(df_label.values)

### Step 3.2. Encoding test dataset

In [15]:
df_label = df_test['Label']
data = df_test.drop(columns=['Label'])
Xtest = data.values
y_test = encode_label(df_label.values)

### Step 3.3 Encoding validation dataset

In [16]:
df_label = df_val['Label']
data = df_val.drop(columns=['Label'])
Xval = data.values
y_val = encode_label(df_label.values)

## Step 4. Normalization or Standardization

The continuous feature values are normalized into the same feature space. This is important when using features that have different measurements, and is a general requirement of many machine learning algorithms. We implement the two methods to see the impact on the final classifications. 

## Option 1. Normalization

The values of the datasets are normalized using the Min-Max scaling technique, bringing them all within a range of [0,1].

### Step 4.1 Normalizing train dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(Xtrain)
X_train

array([[7.66048676e-01, 5.93603125e-03, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.29167620e-01, 1.22077764e-03, 3.52941176e-01, ...,
        0.00000000e+00, 7.12500000e-01, 7.12500000e-01],
       [7.86144808e-01, 6.76005616e-03, 3.52941176e-01, ...,
        8.96864890e-03, 4.90833333e-01, 4.83333333e-01],
       ...,
       [2.53009842e-01, 1.22077764e-03, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.85610742e-01, 8.08765183e-04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.26276036e-01, 1.22077764e-03, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### Step 4.2. Normalizing validation dataset

In [None]:
X_val = scaler.fit_transform(Xval)
X_val

array([[4.37994965e-01, 1.22344737e-03, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.41290913e-01, 6.12488339e-02, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.87327382e-02, 6.77483980e-03, 3.52941176e-01, ...,
        5.68691207e-05, 8.33333333e-02, 8.32947583e-02],
       ...,
       [9.30510414e-01, 8.10533882e-04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.50492103e-01, 8.68494701e-02, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.24353399e-01, 1.22344737e-03, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### Step 4.3. Normalizing test dataset

In [None]:
X_test = scaler.fit_transform(Xtest)
X_test

array([[5.45487838e-01, 9.66383862e-01, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.17142247e-01, 1.22074038e-03, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.92001099e-01, 7.97448653e-02, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.04507584e-01, 1.22074038e-03, 3.52941176e-01, ...,
        0.00000000e+00, 8.33370661e-01, 8.33370661e-01],
       [9.22345653e-01, 8.08740501e-04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.45030061e-01, 6.75984985e-03, 3.52941176e-01, ...,
        5.01804492e-03, 4.88761059e-01, 4.84412289e-01]])

## Option 2. Standardization

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

X_train = scaler.fit_transform(Xtrain)
X_val = scaler.fit_transform(Xval)
X_test = scaler.fit_transform(Xtest)

X_train

array([[-0.46360917, -0.21367668, -0.60970034, ..., -0.16609924,
        -0.51046627, -0.47636976],
       [-0.46360917,  1.19349981,  0.79867464, ..., -0.16609924,
        -0.51046627, -0.47636976],
       [-0.46360917, -1.03452964, -1.43125242, ..., -0.16609924,
        -0.51046627, -0.47636976],
       ...,
       [-0.32230411, -1.03452964, -1.31388783, ..., -0.16609924,
        -0.51046627, -0.47636976],
       [-0.46360917, -0.82345316, -1.21999617, ..., -0.16609924,
        -0.51046627, -0.47636976],
       [-0.46360917, -0.28403551, -0.68011909, ..., -0.16609924,
        -0.51046627, -0.47636976]])

## Step 5 One-hot encoding for labels

y_train, y_test and y_val have to be one-hot-encoded. That means they must have dimension (number_of_samples, 15), where 15 denotes number of classes.

In [19]:
from tensorflow.keras.utils import to_categorical

Save the labels for AdaBoostClassifier

In [20]:
y_train_ada = y_train
y_test_ada = y_test
y_val_ada = y_val

In [21]:
y_train = to_categorical(y_train, 15)
y_test = to_categorical(y_test, 15)
y_val = to_categorical(y_val, 15)

## Step 6. Define the metrics

In [22]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier

#importing confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn import metrics
from sklearn.metrics import accuracy_score

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error

Get the attacks' names

In [23]:
labels_d = make_value2index(df_test['Label'])

In [24]:
print(labels_d)

{'BENIGN': 139134, 'Bot': 139623, 'DDoS': 171629, 'DoS GoldenEye': 174202, 'DoS Hulk': 231733, 'DoS Slowhttptest': 233107, 'DoS slowloris': 234556, 'FTP-Patator': 236539, 'Heartbleed': 236542, 'Infiltration': 236551, 'PortScan': 276252, 'SSH-Patator': 277726, 'Web Attack � Brute Force': 278102, 'Web Attack � Sql Injection': 278107, 'Web Attack � XSS': 278270}


# CNN1D Model 

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, BatchNormalization, Flatten, Dense, Activation,Dropout,MaxPooling1D
from tensorflow.keras.constraints import max_norm

In [41]:
#hyper-params
batch_size = 7500 # increasing batch size with more gpu added
input_dim = X_train.shape[1]
num_class = 15                   # 15 intrusion classes, including benign traffic class
num_epochs = 100
learning_rates = 1e-4
regularizations = 1e-3
optim = tf.keras.optimizers.Adam(lr=learning_rates, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

print(input_dim)
print(num_class)

78
15


In [27]:
#X_train_r = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_train_r = np.zeros((len(X_train), input_dim, 1))
X_train_r[:, :, 0] = X_train[:, :input_dim]
print(X_train_r.shape)

(556548, 78, 1)


In [28]:
X_test_r = np.zeros((len(X_test), input_dim, 1))
X_test_r[:, :, 0] = X_test[:, :input_dim]
print(X_test_r.shape)

(278271, 78, 1)


In [29]:
X_val_r = np.zeros((len(X_val), input_dim, 1))
X_val_r[:, :, 0] = X_val[:, :input_dim]
print(X_val_r.shape)

(278271, 78, 1)


In [42]:
model = Sequential()

# input layer
model.add(Conv1D(filters=32, kernel_size=23, activation='relu', padding='same', kernel_initializer='he_uniform', input_shape=(78,1)))
model.add(Conv1D(filters=32, kernel_size=17, activation='relu', padding='same', kernel_initializer='he_uniform'))
model.add(MaxPooling1D(pool_size=2,strides=2))
model.add(Dropout(0.2))
model.add(BatchNormalization(axis=1))

model.add(Conv1D(filters=64, kernel_size=23, activation='relu', padding='same', kernel_initializer='he_uniform'))
model.add(Conv1D(filters=64, kernel_size=17, activation='relu', padding='same', kernel_initializer='he_uniform'))
model.add(MaxPooling1D(pool_size=2,strides=2))
model.add(Dropout(0.2))
model.add(BatchNormalization(axis=1)) 

model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_class))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 78, 32)            768       
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 78, 32)            17440     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 32)            0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 39, 32)            0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 39, 32)            156       
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 39, 64)            47168     
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 39, 64)           

In [36]:
METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

metrics=METRICS

In [44]:
from keras.optimizers import Nadam
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
import keras
import time
time_start = time.time()

reduce_lr = keras.callbacks.ReduceLROnPlateau(moniter='val_loss',
                                              factor=0.1,
                                              patience=10)
nadam = Nadam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
model.compile(loss = "categorical_crossentropy",optimizer = "nadam", metrics = metrics)

#model.compile(
#      optimizer=tf.keras.optimizers.Adam(lr=1e-4),
#      loss=tf.keras.losses.BinaryCrossentropy(),
#      metrics=metrics)
  

history = model.fit(X_train_r, y_train, 
                    epochs=100, 
                    batch_size=batch_size, 
                    verbose=2,
                    validation_data=(X_val_r, y_val),
                    callbacks=[reduce_lr])
time_end = time.time()
train_time = time_end - time_start
print("train_time:",train_time)

Epoch 1/100
75/75 - 19s - loss: 0.3332 - tp: 757863.0000 - fp: 22247.0000 - tn: 11665219.0000 - fn: 76956.0000 - accuracy: 0.9921 - precision: 0.9715 - recall: 0.9078 - auc: 0.9928 - val_loss: 0.2329 - val_tp: 257006.0000 - val_fp: 2431.0000 - val_tn: 3893363.0000 - val_fn: 21265.0000 - val_accuracy: 0.9943 - val_precision: 0.9906 - val_recall: 0.9236 - val_auc: 0.9995
Epoch 2/100
75/75 - 14s - loss: 0.0417 - tp: 548889.0000 - fp: 4675.0000 - tn: 7786997.0000 - fn: 7659.0000 - accuracy: 0.9985 - precision: 0.9916 - recall: 0.9862 - auc: 0.9995 - val_loss: 0.0775 - val_tp: 269645.0000 - val_fp: 5284.0000 - val_tn: 3890510.0000 - val_fn: 8626.0000 - val_accuracy: 0.9967 - val_precision: 0.9808 - val_recall: 0.9690 - val_auc: 0.9998
Epoch 3/100
75/75 - 14s - loss: 0.0250 - tp: 552080.0000 - fp: 3036.0000 - tn: 7788636.0000 - fn: 4468.0000 - accuracy: 0.9991 - precision: 0.9945 - recall: 0.9920 - auc: 0.9997 - val_loss: 0.0719 - val_tp: 270904.0000 - val_fp: 5922.0000 - val_tn: 3889872.000

## Get the metrics

In [45]:
# evaluate model
accuracy = model.evaluate(X_test_r, y_test, batch_size=batch_size, verbose=1)



In [46]:
y_pred=model.predict(X_test_r)

In [47]:
display_metrics(y_test_ada, np.argmax(y_pred, axis = 1), labels_d)


Accuracy: 0.98

Micro Precision: 0.98
Micro Recall: 0.98
Micro F1-score: 0.98



  _warn_prf(average, modifier, msg_start, len(result))


Macro Precision: 0.58
Macro Recall: 0.56
Macro F1-score: 0.56

Weighted Precision: 0.98
Weighted Recall: 0.98
Weighted F1-score: 0.98

Classification Report



  _warn_prf(average, modifier, msg_start, len(result))


                            precision    recall  f1-score   support

                    BENIGN       0.99      1.00      0.99    139135
                       Bot       0.00      0.00      0.00       489
                      DDoS       0.94      1.00      0.97     32006
             DoS GoldenEye       0.92      0.93      0.93      2573
                  DoS Hulk       1.00      0.96      0.98     57531
          DoS Slowhttptest       0.55      0.79      0.65      1374
             DoS slowloris       0.85      0.44      0.58      1449
               FTP-Patator       1.00      0.94      0.97      1983
                Heartbleed       0.00      0.00      0.00         3
              Infiltration       0.00      0.00      0.00         9
                  PortScan       1.00      1.00      1.00     39701
               SSH-Patator       0.97      0.48      0.65      1474
  Web Attack � Brute Force       0.55      0.86      0.67       376
Web Attack � Sql Injection       0.00      0.00