# Prototype: Alpha

| Properties      | Data    |
|---------------|-----------|
| *Labels* | `['BENIGN', 'DDoS']` |
| *Normalization* | `Min-Max` |
| *Sample Size* | `1000` (`500` per label) |
| *Adversarial Attack* | `FGSM` |
| *Explanations* | `SHAP` |


---

In [13]:
# To import modules from the functions directory: put the parent directory
# (resolved against the current working directory) on the module search path.
import sys
import os

# Append only when missing, so re-running this cell does not pile up
# duplicate entries in sys.path.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:
    sys.path.append(project_root)

## Data Preprocessing

In [14]:
import importlib
import functions.data_preprocessing as dp

# Reload so that edits to the helper module are picked up without a kernel restart.
importlib.reload(dp)

# --- Preprocessing options ---
label_names = ['BENIGN', 'DDoS']  # labels to include
sample_size = 500                 # sample size for each label
encoding_type = 0                 # 0 selects binary encoding
norm_type = 0                     # 0 selects min-max normalization

label_df, feature_df = dp.preprocess(
    encoding_type,
    norm_type,
    label_names=label_names,
    sample_size=sample_size,
)

# Last expression of the cell: show how many rows each label combination has.
label_df.value_counts()

--- Combining all CICIDS2017 files ---
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Wednesday-workingHours.pcap_ISCX.csv
--- Removing NaN and Infinity values ---
Number of rows with NaN values:  1358
Removing NaN values....
Number of rows with Infinity values: 1509
Removing Infinity values....
--- Extracting labels ---
 Label
BENIGN    2271320
DDoS       128025
Name: count, dtype: int64
--- Sampling balanced data ---
Sample to shape: (1000, 79)
--- Splitting labels and features ---
--- Encoding labels as binary one-hot values ---
--- Removing irrelevant features ---
Removed Zero Columns: [' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' RST Flag Count', ' CWE Flag Count', ' ECE Flag Count', 'Fwd Avg B

BENIGN  ATTACK
False   True      500
True    False     500
Name: count, dtype: int64

## Split Data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    label_df,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every part on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(800, 66) (200, 66) (800, 2) (200, 2)


## Create IDS

In [16]:
import importlib
import functions.intrusion_detection_system as ids

# Reload so that edits to the helper module are picked up without a kernel restart.
importlib.reload(ids)

# Build the IDS model from the train/test split created above; the helper
# prints the per-epoch training log shown in the cell output.
ids_model = ids.build_intrusion_detection_system(
    X_train, y_train,
    X_test, y_test,
)

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.4959 - loss: 0.6745 - val_accuracy: 0.5188 - val_loss: 0.5877
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4905 - loss: 0.5775 - val_accuracy: 0.6313 - val_loss: 0.5245
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7440 - loss: 0.5123 - val_accuracy: 0.9125 - val_loss: 0.4729
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9211 - loss: 0.4623 - val_accuracy: 0.9750 - val_loss: 0.4300
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9933 - loss: 0.4044 - val_accuracy: 1.0000 - val_loss: 0.3857
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9965 - loss: 0.3795 - val_accuracy: 1.0000 - val_loss: 0.3416
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[

## Generate Attacks

In [17]:
import importlib
import functions.attack_generator as ag

# Reload so that edits to the helper module are picked up without a kernel restart.
importlib.reload(ag)

# Convert the IDS model into the `art_model` object the attack helpers take.
art_model = ag.convert_to_art_model(ids_model, X_train)

# Disabled experiment, kept for reference (apparently a targeted variant
# that would push predictions towards BENIGN -- TODO confirm before enabling):
# import numpy as np
# target_label = np.zeros_like(y_train)
# target_label[:, 0] = 1 # desired predicted label = [1, 0] = BENIGN
# print(target_label[:3])

# Craft FGSM adversarial examples from the training features.
X_adv_fgsm = ag.generate_fgsm_attacks(art_model, X_train)

# Score the model on the adversarial examples against the training labels.
# This is the cell's last expression, so its return value is displayed too.
ag.evaluate_art_model(art_model, X_adv_fgsm, y_train)

Adversarial FGSM examples generated. Shape: (800, 66)
Accuracy: 0.63125
              precision    recall  f1-score   support

      BENIGN       0.58      0.93      0.72       404
      ATTACK       0.82      0.33      0.47       396

   micro avg       0.63      0.63      0.63       800
   macro avg       0.70      0.63      0.59       800
weighted avg       0.70      0.63      0.59       800
 samples avg       0.63      0.63      0.63       800

Confusion Matrix: Positive == BENIGN
TN: 129, FP: 267, FN: 28, TP: 376


0.63125

## Explainer

In [18]:
import importlib
import functions.explainer as exp

# Reload so that edits to the helper module are picked up without a kernel restart.
importlib.reload(exp)

explainer = exp.generate_shap_explainer(ids_model, X_train)


def _explain(samples):
    """Compute SHAP values for `samples`, print their shape, and tabulate them.

    Returns a pair: the raw SHAP values and the result of converting them
    with the column names of `X_train`.
    """
    values = exp.generate_shap_values(explainer, samples)
    print(values.shape)
    frame = exp.convert_shap_values_to_pd(values, X_train.columns)
    return values, frame


# Explanations for the original training samples ...
shap_values, shap_values_df = _explain(X_train)

# ... and for their FGSM-perturbed counterparts.
shap_values_adv, shap_values_adv_df = _explain(X_adv_fgsm)

PermutationExplainer explainer: 801it [00:41, 14.80it/s]                         


(800, 66)


PermutationExplainer explainer: 801it [00:44, 14.01it/s]                         

(800, 66)





## Detector

In [19]:
import functions.detector as det
import importlib
importlib.reload(det)
import numpy as np

# Build the detector dataset from the SHAP values of the original samples and
# of the FGSM samples (presumably `y` marks which of the two a row came from --
# confirm in functions.detector).
X, y = det.build_train_datasets(shap_values_df, shap_values_adv_df)
print(X.shape, y.shape)

# NOTE: this cell rebinds X_train, X_test, y_train and y_test, which the
# earlier cells use for the IDS split. After running it, re-run the
# "Split Data" cell before re-running any of the IDS / attack / explainer cells.

# Split BEFORE fitting the normalizer so the held-out rows cannot influence the
# min/max used for scaling. The seed and the row count are unchanged, so the
# same rows land in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1503)

# Fit the min/max on the training split only, then reuse this one normalizer
# for every dataset so the scaling stays consistent across all of them.
# Assumes create_min_max_normalizer returns a scaler fitted on the data it is
# given (the original code called .transform() on it right away) -- confirm in
# functions.detector.
normalizer = det.create_min_max_normalizer(X_train_raw)
# print(np.max(normalizer.data_max_))
# print(np.min(normalizer.data_min_))

# normalize features
X_train = normalizer.transform(X_train_raw)
X_test = normalizer.transform(X_test_raw)
# Keep `X` bound to the normalized full matrix, as it was before this change.
X = normalizer.transform(X)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# build detector
detector = det.build_detector(X_train, y_train, X_test, y_test)


# Disabled experiments, kept for reference:
# normalizer_adv = det.create_min_max_normalizer(shap_values_adv_df)
# print(np.max(normalizer_adv.data_max_))
# print(np.min(normalizer_adv.data_min_))

# shap_values_normalized = det.normalize_shap_values(shap_values_df)
# print(shap_values_normalized.shape)
# print(np.max(shap_values_normalized))
# print(np.min(shap_values_normalized))

(1600, 66) (1600, 2)
(1440, 66) (160, 66) (1440, 2) (160, 2)
Epoch 1/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5030 - loss: 0.6919 - val_accuracy: 0.5278 - val_loss: 0.6672
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5110 - loss: 0.6681 - val_accuracy: 0.5972 - val_loss: 0.6454
Epoch 3/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5784 - loss: 0.6421 - val_accuracy: 0.7465 - val_loss: 0.6247
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6644 - loss: 0.6161 - val_accuracy: 0.7604 - val_loss: 0.6059
Epoch 5/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7189 - loss: 0.5884 - val_accuracy: 0.7743 - val_loss: 0.5866
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7525 - loss: 0.5686 - val_accuracy: 0.81