<a href="https://colab.research.google.com/github/fwangliberty/AIoTDesign-Frontend/blob/master/Pre_trained_CNN1D_http_smotenc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using SMOTENC to train a pre-trained CNN1D model with is_http, standardalized dataset

We investigate the effect of adding "is_http80" on the detection results.

In [1]:
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def display_metrics(y_test, y_pred, label_names):
  print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

  print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
  print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
  print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

  print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
  print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
  print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

  print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
  print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
  print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

  print('\nClassification Report\n')
  print(classification_report(y_test, y_pred, target_names=label_names))

In [3]:
def make_value2index(attacks):
    #make dictionary
    attacks = sorted(attacks)
    d = {}
    counter=0
    for attack in attacks:
        d[attack] = counter
        counter+=1
    return d

In [4]:
# chganges label from string to integer/index
def encode_label(Y_str):
    labels_d = make_value2index(np.unique(Y_str))
    Y = [labels_d[y_str] for y_str  in Y_str]
    Y = np.array(Y)
    return np.array(Y)

## Step 1. Loading NPY files

In [5]:
# All columns
col_names = np.array(['dst sport count', 'src dport count', 'dst src count', 'dport count', 'sport count', 'dst host count','src host count','Source Port', 'Destination Port',
                      'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets',
                      'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
                      'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std',
                      'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total',
                      'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
                      'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
                      'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
                      'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
                      'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size',
                      'Avg Bwd Segment Size','Subflow Fwd Packets', 'Subflow Fwd Bytes',
                      'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
                      'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
                      'Idle Std', 'Idle Max', 'Idle Min', 'Label'])

### Option 1. Connect to Google Drive

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
from numpy import load

In [None]:
# load array
X_train_5000 = load('/content/drive/My Drive/CICIDS2017/X_train_5000.npy')
# print the array
print(X_train_5000)

In [None]:
y_train_5000 = load('/content/drive/My Drive/CICIDS2017/y_train_5000.npy')

In [None]:
X_train_3000 = load('/content/drive/My Drive/CICIDS2017/X_train_smotenc3000.npy')
y_train_3000 = load('/content/drive/My Drive/CICIDS2017/y_train_smotenc3000.npy')

In [8]:
X_train_1000 = load('/content/drive/My Drive/CICIDS2017/X_train_smotenc1000.npy')
y_train_1000 = load('/content/drive/My Drive/CICIDS2017/y_train_smotenc1000.npy')

In [9]:
print(y_train_1000)

[ 2  0  0 ... 14 14 14]


In [10]:
X_test = load('/content/drive/My Drive/CICIDS2017/X_test.npy') 
X_val = load('/content/drive/My Drive/CICIDS2017/X_val.npy')

In [11]:
y_test = load('/content/drive/My Drive/CICIDS2017/y_test.npy')
y_val = load('/content/drive/My Drive/CICIDS2017/y_val.npy')

### Option 2. Load from local machine

In [None]:
from numpy import load

# load array
X_train_5000 = load('X_train_5000.npy')
# print the array
print(X_train_5000)

In [None]:
X_test = load('X_test.npy') 
X_val = load('X_val.npy')

In [None]:
y_train_5000 = load('y_train_5000.npy')
y_test = load('y_test.npy')
y_val = load('y_val.npy')

## Step 2 One-hot encoding for labels

y_train, y_test and y_val have to be one-hot-encoded. That means they must have dimension (number_of_samples, 15), where 15 denotes number of classes.

In [12]:
from tensorflow.keras.utils import to_categorical

Save the labels for AdaBoostClassifier

In [13]:
y_test_cat = to_categorical(y_test, 15)
y_val_cat = to_categorical(y_val, 15)

In [None]:
y_train_cat_5000 = to_categorical(y_train_5000, 15)

In [None]:
y_train_cat_3000 = to_categorical(y_train_3000, 15)

In [14]:
y_train_cat_1000 = to_categorical(y_train_1000, 15)

## Step 3. Define the metrics

In [15]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier

#importing confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn import metrics
from sklearn.metrics import accuracy_score

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [16]:
METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

Get the attacks' names

In [17]:
df_test = pd.read_csv('/content/drive/My Drive/CICIDS2017/test_set_ext78_2.csv',names=col_names, skiprows=1)  
labels_d = make_value2index(df_test['Label'])

In [18]:
print(labels_d)

{'BENIGN': 105018, 'Bot': 105298, 'DDoS': 124569, 'DoS GoldenEye': 126111, 'DoS Hulk': 160658, 'DoS Slowhttptest': 161486, 'DoS slowloris': 162320, 'FTP-Patator': 163498, 'Heartbleed': 163500, 'Infiltration': 163501, 'PortScan': 187347, 'SSH-Patator': 188173, 'Web Attack � Brute Force': 188382, 'Web Attack � Sql Injection': 188389, 'Web Attack � XSS': 188482}


# CNN1D Model 

In [20]:
batch_size = 200 # increasing batch size with more gpu added
input_dim =  X_train_5000.shape[1]
num_class = 15                   # 15 intrusion classes, including benign traffic class
num_epochs = 100

print(input_dim)
print(num_class)

79
15


In [None]:
X_train_r = np.zeros((len(X_train_5000), input_dim, 1))
X_train_r[:, :, 0] = X_train_5000[:, :input_dim]
print(X_train_r.shape)

(893055, 79, 1)


In [45]:
X_test_r = np.zeros((len(X_test), input_dim, 1))
X_test_r[:, :, 0] = X_test[:, :input_dim]
print(X_test_r.shape)

(188483, 79, 1)


In [25]:
X_val_r = np.zeros((len(X_val), input_dim, 1))
X_val_r[:, :, 0] = X_val[:, :input_dim]
print(X_val_r.shape)

(188484, 79, 1)


In [None]:
del X_val
del X_train_5000
del X_test

In [None]:
# load the Random Forest model from disk
cnn1d_file_name = "/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smote1000.h5"
cnn1d_model = tf.keras.models.load_model(cnn1d_file_name)
 
cnn1d_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 79, 32)            768       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 79, 32)            17440     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 39, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 39, 32)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 39, 32)            156       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 39, 64)            47168     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 39, 64)            6

In [None]:
from keras.optimizers import Nadam, Adam, RMSprop
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
import keras
import time
time_start = time.time()

reduce_lr = keras.callbacks.ReduceLROnPlateau(moniter='val_loss',
                                              factor=0.1,
                                              patience=10)
nadam = Nadam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.0001)
cnn1d_model.compile(loss = "categorical_crossentropy",optimizer = "nadam", metrics = METRICS)
#model2d_g.compile(loss='categorical_crossentropy',
#              optimizer= RMSprop(lr=0.001), #,SGD(lr=0.01), #
#              metrics=METRICS)

#model2d_g.compile(
#      optimizer= Adam(lr=1e-4),
#      loss=tf.keras.losses.BinaryCrossentropy(),
#      loss='categorical_crossentropy',
#      metrics=METRICS)

history = cnn1d_model.fit(X_train_r, y_train_cat_5000, 
                    epochs=50, 
                    batch_size=batch_size, 
                    verbose=2,
                    validation_data=(X_val_r, y_val_cat),
                    callbacks=[reduce_lr])
time_end = time.time()
train_time = time_end - time_start
print("train_time:",train_time)

Epoch 1/50
4466/4466 - 54s - loss: 0.0101 - tp: 1437909.0000 - fp: 5471.0000 - tn: 20205910.0000 - fn: 5746.0000 - accuracy: 0.9995 - precision: 0.9962 - recall: 0.9960 - auc: 0.9999 - val_loss: 0.0153 - val_tp: 187775.0000 - val_fp: 703.0000 - val_tn: 2638073.0000 - val_fn: 709.0000 - val_accuracy: 0.9995 - val_precision: 0.9963 - val_recall: 0.9962 - val_auc: 0.9999
Epoch 2/50
4466/4466 - 49s - loss: 0.0097 - tp: 889534.0000 - fp: 3396.0000 - tn: 12499374.0000 - fn: 3521.0000 - accuracy: 0.9995 - precision: 0.9962 - recall: 0.9961 - auc: 1.0000 - val_loss: 0.0176 - val_tp: 187700.0000 - val_fp: 784.0000 - val_tn: 2637992.0000 - val_fn: 784.0000 - val_accuracy: 0.9994 - val_precision: 0.9958 - val_recall: 0.9958 - val_auc: 0.9998
Epoch 3/50
4466/4466 - 49s - loss: 0.0098 - tp: 889550.0000 - fp: 3352.0000 - tn: 12499418.0000 - fn: 3505.0000 - accuracy: 0.9995 - precision: 0.9962 - recall: 0.9961 - auc: 0.9999 - val_loss: 0.0175 - val_tp: 187806.0000 - val_fp: 672.0000 - val_tn: 2638104

KeyboardInterrupt: ignored

## Get the metrics

In [None]:
# evaluate model
accuracy = cnn1d_model.evaluate(X_test_r, y_test_cat, batch_size=batch_size, verbose=1)



In [None]:
y_pred=cnn1d_model.predict(X_test_r)

In [None]:
display_metrics(y_test, np.argmax(y_pred, axis = 1), labels_d)


Accuracy: 1.00

Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 0.91
Macro Recall: 0.97
Macro F1-score: 0.93

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00

Classification Report

                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    105019
                       Bot       0.97      0.95      0.96       280
                      DDoS       1.00      1.00      1.00     19271
             DoS GoldenEye       1.00      1.00      1.00      1542
                  DoS Hulk       1.00      1.00      1.00     34547
          DoS Slowhttptest       1.00      1.00      1.00       828
             DoS slowloris       1.00      1.00      1.00       834
               FTP-Patator       1.00      1.00      1.00      1178
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       0.33      1.00      0.50         1
             

### Save the model

In [None]:
cnn1d_model.save('cnn1d_78_http_smote5000.h5')

In [None]:
cnn1d_model.save('/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smote5000.h5')

## Train the model using smote 3000

In [None]:
X_train_r = np.zeros((len(X_train_3000), input_dim, 1))
X_train_r[:, :, 0] = X_train_3000[:, :input_dim]
print(X_train_r.shape)

(887055, 79, 1)


In [None]:
y_train_cat = to_categorical(y_train_3000, 15)

In [None]:
cnn1d_file_name = "/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smote5000.h5"
cnn1d_model = tf.keras.models.load_model(cnn1d_file_name)

In [None]:
history = cnn1d_model.fit(X_train_r, y_train_cat, 
                    epochs=50, 
                    batch_size=batch_size, 
                    verbose=2,
                    validation_data=(X_val_r, y_val_cat),
                    callbacks=[reduce_lr])

Epoch 1/50
8871/8871 - 95s - loss: 0.0101 - tp: 883650.0000 - fp: 3288.0000 - tn: 12415482.0000 - fn: 3405.0000 - accuracy: 0.9995 - precision: 0.9963 - recall: 0.9962 - auc: 0.9999 - val_loss: 0.0104 - val_tp: 188005.0000 - val_fp: 476.0000 - val_tn: 2638300.0000 - val_fn: 479.0000 - val_accuracy: 0.9997 - val_precision: 0.9975 - val_recall: 0.9975 - val_auc: 0.9999
Epoch 2/50
8871/8871 - 93s - loss: 0.0102 - tp: 883615.0000 - fp: 3316.0000 - tn: 12415454.0000 - fn: 3440.0000 - accuracy: 0.9995 - precision: 0.9963 - recall: 0.9961 - auc: 0.9999 - val_loss: 0.0133 - val_tp: 187896.0000 - val_fp: 583.0000 - val_tn: 2638193.0000 - val_fn: 588.0000 - val_accuracy: 0.9996 - val_precision: 0.9969 - val_recall: 0.9969 - val_auc: 0.9999
Epoch 3/50
8871/8871 - 94s - loss: 0.0105 - tp: 883519.0000 - fp: 3387.0000 - tn: 12415383.0000 - fn: 3536.0000 - accuracy: 0.9995 - precision: 0.9962 - recall: 0.9960 - auc: 0.9999 - val_loss: 0.0105 - val_tp: 187972.0000 - val_fp: 506.0000 - val_tn: 2638270.

In [None]:
accuracy = cnn1d_model.evaluate(X_test_r, y_test_cat, batch_size=batch_size, verbose=1)



In [None]:
y_pred=cnn1d_model.predict(X_test_r)

In [None]:
display_metrics(y_test, np.argmax(y_pred, axis = 1), labels_d)


Accuracy: 1.00

Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 0.91
Macro Recall: 0.97
Macro F1-score: 0.93

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00

Classification Report

                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    105019
                       Bot       0.97      0.98      0.97       280
                      DDoS       1.00      1.00      1.00     19271
             DoS GoldenEye       1.00      0.99      1.00      1542
                  DoS Hulk       1.00      1.00      1.00     34547
          DoS Slowhttptest       1.00      1.00      1.00       828
             DoS slowloris       1.00      1.00      1.00       834
               FTP-Patator       1.00      1.00      1.00      1178
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       0.33      1.00      0.50         1
             

### Save smote 3000 model

In [None]:
cnn1d_model.save('/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smote3000.h5')

## Train the model using SMOTENC 1000

In [21]:
X_train_r = np.zeros((len(X_train_1000), input_dim, 1))
X_train_r[:, :, 0] = X_train_1000[:, :input_dim]
print(X_train_r.shape)

(881114, 79, 1)


In [22]:
y_train_cat = to_categorical(y_train_1000, 15)

In [23]:
cnn1d_file_name = "/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smote1000.h5"
cnn1d_model = tf.keras.models.load_model(cnn1d_file_name)

In [28]:
import keras

reduce_lr = keras.callbacks.ReduceLROnPlateau(moniter='val_loss',
                                              factor=0.1,
                                              patience=10)

In [42]:
from keras.optimizers import Nadam, Adam, RMSprop
nadam = Nadam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.0001)

In [43]:
history = cnn1d_model.fit(X_train_r, y_train_cat, 
                    epochs=50, 
                    batch_size=3000, 
                    verbose=2,
                    validation_data=(X_val_r, y_val_cat),
                    callbacks=[reduce_lr])

Epoch 1/50
294/294 - 25s - loss: 0.0047 - tp: 879483.0000 - fp: 1555.0000 - tn: 12334041.0000 - fn: 1631.0000 - accuracy: 0.9998 - precision: 0.9982 - recall: 0.9981 - auc: 1.0000 - val_loss: 0.0121 - val_tp: 188180.0000 - val_fp: 301.0000 - val_tn: 2638475.0000 - val_fn: 304.0000 - val_accuracy: 0.9998 - val_precision: 0.9984 - val_recall: 0.9984 - val_auc: 0.9999
Epoch 2/50
294/294 - 23s - loss: 0.0047 - tp: 879521.0000 - fp: 1527.0000 - tn: 12334069.0000 - fn: 1593.0000 - accuracy: 0.9998 - precision: 0.9983 - recall: 0.9982 - auc: 1.0000 - val_loss: 0.0121 - val_tp: 188180.0000 - val_fp: 302.0000 - val_tn: 2638474.0000 - val_fn: 304.0000 - val_accuracy: 0.9998 - val_precision: 0.9984 - val_recall: 0.9984 - val_auc: 0.9999
Epoch 3/50
294/294 - 23s - loss: 0.0046 - tp: 879502.0000 - fp: 1548.0000 - tn: 12334048.0000 - fn: 1612.0000 - accuracy: 0.9998 - precision: 0.9982 - recall: 0.9982 - auc: 1.0000 - val_loss: 0.0121 - val_tp: 188181.0000 - val_fp: 301.0000 - val_tn: 2638475.0000 -

KeyboardInterrupt: ignored

In [46]:
accuracy = cnn1d_model.evaluate(X_test_r, y_test_cat, batch_size=batch_size, verbose=1)



In [47]:
y_pred=cnn1d_model.predict(X_test_r)

In [48]:
display_metrics(y_test, np.argmax(y_pred, axis = 1), labels_d)


Accuracy: 1.00

Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 0.93
Macro Recall: 0.98
Macro F1-score: 0.95

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00

Classification Report

                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    105019
                       Bot       0.97      0.98      0.97       280
                      DDoS       1.00      1.00      1.00     19271
             DoS GoldenEye       1.00      0.99      1.00      1542
                  DoS Hulk       1.00      1.00      1.00     34547
          DoS Slowhttptest       1.00      1.00      1.00       828
             DoS slowloris       1.00      1.00      1.00       834
               FTP-Patator       1.00      1.00      1.00      1178
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       0.50      1.00      0.67         1
             

### Save the SMOTENC 1000 model

In [49]:
cnn1d_model.save('/content/drive/My Drive/CICIDS2017/cnn1d_78_http_smotenc1000.h5')