# Prototype: Alpha

| Properties      | Data    |
|---------------|-----------|
| *Labels* | `['BENIGN', 'DDoS']` |
| *Normalization* | `Min-Max` |
| *Sample Size* | `2000` (1000 per label) |
| *Adversarial Attack* | `FGSM` |
| *Explanations* | `SHAP` |


---

In [182]:
# Make the parent of the current working directory importable so that the
# local `functions` package can be imported by the cells below.
import sys
import os

project_root = os.path.abspath('..')
# Guard the append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

## Data Preprocessing

In [233]:
import functions.data_preprocessing as dp
import importlib
importlib.reload(dp)  # re-import so edits to the module are picked up without a kernel restart

# --- Configuration -----------------------------------------------------------
label_names = ['BENIGN', 'DDoS']  # labels to include
encoding_type = 0                 # binary encoding
norm_type = 0                     # min-max normalization
sample_size = 1000                # sample size for each label
sampling_seed = 42                # presumably the sampling seed -- confirm against dp.preprocess_data

# --- Build dataset, fit normalizer, sample and preprocess --------------------
dataset = dp.build_dataset(label_names)
normalizer, zero_columns = dp.generate_normalizer(dataset, norm_type)
feature_df, label_df = dp.preprocess_data(
    dataset,
    encoding_type,
    normalizer,
    zero_columns,
    sample_size,
    sampling_seed,
)
print(label_df.value_counts())

-- Building CICIDS2017 dataset --
--- Combining all CICIDS2017 files ---
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Wednesday-workingHours.pcap_ISCX.csv
--- Removing NaN and Infinity values ---
Removing 1358 Rows with NaN values
Removing 1509 Rows with Infinity values
--- Extracting labels ---
 Label
BENIGN    2271320
DDoS       128025
Name: count, dtype: int64
-- Generating normalizer --
--- Splitting labels and features ---
Zero Columns: [' Bwd PSH Flags', ' Bwd URG Flags', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
-- Preprocessing data --
--- Sampling balanced data ---
Sample to shape: (2000, 79)
--- Splittin

## Split Data

In [213]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    label_df,
    test_size=0.2,
    random_state=42,
)
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

(1600, 70) (400, 70) (1600, 2) (400, 2)


## Create IDS

In [214]:
import functions.intrusion_detection_system as ids
import importlib
importlib.reload(ids)  # re-import so edits to the module are picked up without a kernel restart

# Build the IDS model from the train/test splits created above; the per-epoch
# training log in the cell output is printed during this call.
# NOTE(review): how the helper uses the test split is not visible from this
# notebook -- confirm in functions.intrusion_detection_system.
ids_model = ids.build_intrusion_detection_system(X_train, y_train, X_test, y_test)

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5798 - loss: 0.6775 - val_accuracy: 0.8500 - val_loss: 0.6123
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8842 - loss: 0.5952 - val_accuracy: 0.8656 - val_loss: 0.5244
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9131 - loss: 0.4876 - val_accuracy: 0.9625 - val_loss: 0.3888
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9822 - loss: 0.3422 - val_accuracy: 0.9688 - val_loss: 0.2621
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9789 - loss: 0.2168 - val_accuracy: 0.9656 - val_loss: 0.1747
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9720 - loss: 0.1459 - val_accuracy: 0.9656 - val_loss: 0.1313
Epoch 7/10
[1m13/13[0m [32m━━━━━━━━━

## Generate Attacks

In [215]:
import functions.attack_generator as ag
import importlib
import pandas as pd
importlib.reload(ag)  # re-import so edits to the module are picked up without a kernel restart

# Wrap the trained IDS for the attack helpers, passing every column of the
# full dataset except the label column.
all_features = dataset.drop(columns=[' Label'])
art_model = ag.convert_to_art_model(ids_model, all_features)

# FGSM examples are generated from the training features without explicit
# target labels. A disabled experiment built a target array shaped like
# y_train with every row set to [1, 0] (= BENIGN); it is not used here.
X_adv_fgsm = ag.generate_fgsm_attacks(art_model, X_train)

# Preview the first two rows / three columns of the adversarial samples.
X_adv_fgsm_df = pd.DataFrame(X_adv_fgsm, columns=X_train.columns)
print(X_adv_fgsm_df.iloc[:2, :3])

# Kept as the last expression so its return value is displayed by the cell.
ag.evaluate_art_model(art_model, X_adv_fgsm, y_train)

Adversarial FGSM examples generated. Shape: (1600, 70)
   Destination Port  Flow Duration  Total Fwd Packets
0          0.790196            0.1                0.0
1          0.000000            0.1                0.0
Accuracy: 0.271875
              precision    recall  f1-score   support

      BENIGN       0.26      0.24      0.25       801
      ATTACK       0.28      0.30      0.29       799

   micro avg       0.27      0.27      0.27      1600
   macro avg       0.27      0.27      0.27      1600
weighted avg       0.27      0.27      0.27      1600
 samples avg       0.27      0.27      0.27      1600

Confusion Matrix: Positive == BENIGN
TN: 241, FP: 558, FN: 607, TP: 194


0.271875

## Explainer

In [216]:
import functions.explainer as exp
import importlib
importlib.reload(exp)  # re-import so edits to the module are picked up without a kernel restart

# Build the SHAP explainer from the IDS model and the training features.
explainer = exp.generate_shap_explainer(ids_model, X_train)

# SHAP values for the clean training samples.
# Slow step: the recorded run needed roughly two minutes for 1600 rows.
shap_values = exp.generate_shap_values(explainer, X_train)
print(shap_values.shape)
shap_values_df = exp.convert_shap_values_to_pd(shap_values, X_train.columns)

# SHAP values for the FGSM adversarial samples, using the same explainer.
# X_adv_fgsm was generated from X_train, so the X_train column names are reused.
shap_values_adv = exp.generate_shap_values(explainer, X_adv_fgsm)
print(shap_values_adv.shape)
shap_values_adv_df = exp.convert_shap_values_to_pd(shap_values_adv, X_train.columns)

PermutationExplainer explainer: 1601it [02:00, 12.22it/s]                          


(1600, 70)


PermutationExplainer explainer: 1601it [01:43, 13.99it/s]                          


(1600, 70)


## Detector

In [217]:
import functions.detector as det
import importlib
import numpy as np
importlib.reload(det)  # re-import so edits to the module are picked up without a kernel restart

# Assemble the detector dataset from the clean and the adversarial SHAP values.
X, y = det.build_train_datasets(shap_values_df, shap_values_adv_df)
print(X.shape, y.shape)

# TODO: how to normalize? min/max should be consistent for all datasets.
# Normalization of the SHAP values is switched off for now. The disabled
# experiments fitted det.create_min_max_normalizer(X) and applied
# normalizer.transform(X), fitted a second normalizer on shap_values_adv_df,
# and tried det.normalize_shap_values(shap_values_df), printing np.max / np.min
# of the results each time.
# NOTE(review): when re-enabling, do not reuse the name `normalizer` -- it
# holds the feature normalizer that is passed to dp.preprocess_data.

# Keep 10% of the SHAP dataset aside; fixed random_state for reproducibility.
X_train_det, X_test_det, y_train_det, y_test_det = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1503,
)
print(*(part.shape for part in (X_train_det, X_test_det, y_train_det, y_test_det)))

# Train the detector on the SHAP-value splits.
detector = det.build_detector(X_train_det, y_train_det, X_test_det, y_test_det)

(3200, 70) (3200, 2)
(2880, 70) (320, 70) (2880, 2) (320, 2)
Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5246 - loss: 0.6903 - val_accuracy: 0.5122 - val_loss: 0.6842
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5134 - loss: 0.6809 - val_accuracy: 0.5451 - val_loss: 0.6663
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5964 - loss: 0.6578 - val_accuracy: 0.6354 - val_loss: 0.6248
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7346 - loss: 0.6108 - val_accuracy: 0.8021 - val_loss: 0.5528
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8240 - loss: 0.5387 - val_accuracy: 0.9028 - val_loss: 0.4713
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9011 - loss: 0.4689 - val_accuracy: 0.93

---
## Test Process

In [218]:
# Sanity check: size of the held-out split that the test process below starts from.
print(X_test.shape)

(400, 70)


### Generate Attacks

In [219]:
# Generate FGSM adversarial examples from the held-out test features.
X_test_adv = ag.generate_fgsm_attacks(art_model, X_test)

Adversarial FGSM examples generated. Shape: (400, 70)


### Generate Explanations

In [220]:
# SHAP values for the adversarial and the clean test samples, computed with
# the explainer that was built on the training data.
X_test_adv_shap_values = exp.generate_shap_values(explainer, X_test_adv)
X_test_shap_values = exp.generate_shap_values(explainer, X_test)
# Both results get the X_test column names (X_test_adv was generated from X_test).
X_test_shap_values_df = exp.convert_shap_values_to_pd(X_test_shap_values, X_test.columns)
X_test_adv_shap_values_df = exp.convert_shap_values_to_pd(X_test_adv_shap_values, X_test.columns)

PermutationExplainer explainer: 401it [00:26,  9.47it/s]                         
PermutationExplainer explainer: 401it [00:30,  8.75it/s]                         


### Generate Datasets

In [221]:
# Combine the clean and adversarial test SHAP values into detector inputs/labels.
# NOTE: this rebinds X and y, which held the detector's training dataset before.
X, y = det.build_train_datasets(X_test_shap_values_df, X_test_adv_shap_values_df)

### Preprocess Data

In [194]:
# Normalization of the test SHAP values is disabled; this cell currently does nothing.
# NOTE(review): the detector-specific normalizer in the "Detector" cell is also
# commented out, so `normalizer` here would be the feature normalizer from the
# preprocessing cell -- do not re-enable these lines as they are.
# print(np.max(X))
# print(np.min(X))
# X = normalizer.transform(X)
# print(np.max(X))
# print(np.min(X))

### Shuffle Data & Predict

In [222]:
from sklearn.utils import shuffle

# Shuffle features and labels together with a fixed seed.
# NOTE: X and y are replaced by their shuffled versions, so re-running this
# cell alone reshuffles already shuffled data.
X, y = shuffle(X, y, random_state=187)

# Detector predictions for the shuffled test SHAP values.
y_pred = detector.predict(X)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


### Evaluate

In [223]:
# Report detector performance on the test SHAP dataset.
# NOTE(review): arguments are passed as (predictions, ground truth) -- confirm
# this matches the parameter order of det.evaluate_model.
det.evaluate_model(y_pred, y)

Global Accuracy: 97.88%
              precision    recall  f1-score   support

      BENIGN       0.97      0.99      0.98       400
 ADVERSARIAL       0.99      0.97      0.98       400

    accuracy                           0.98       800
   macro avg       0.98      0.98      0.98       800
weighted avg       0.98      0.98      0.98       800

True Negative Rate: 98.75%
False Positive Rate: 1.25%
True Positive Rate: 97.00%
False Negative Rate: 3.00%


---
## Additional Evaluation - New Samples

In [None]:
import functions.data_preprocessing as dp
import importlib
importlib.reload(dp)  # re-import so edits to the module are picked up without a kernel restart

eval_sample_size = 250  # rows per label for the additional evaluation set
eval_seed = 187         # presumably the sampling seed -- confirm against dp.preprocess_data

# Draw a second sample from the same dataset, reusing the encoding, normalizer
# and zero-column list from the first preprocessing cell.
X_eval, y_eval = dp.preprocess_data(
    dataset,
    encoding_type,
    normalizer,
    zero_columns,
    eval_sample_size,
    eval_seed,
)
print(y_eval.value_counts())

--- Removing NaN and Infinity values ---
Number of rows with NaN values:  0
Removing NaN values....
Number of rows with Infinity values: 0
Removing Infinity values....
--- Sampling balanced data ---
Sample to shape: (500, 79)
--- Splitting labels and features ---
--- Encoding labels as binary one-hot values ---
--- Normalizing features using MinMaxScaler ---
BENIGN  ATTACK
False   True      250
True    False     250
Name: count, dtype: int64


In [None]:
# Generate FGSM adversarial examples for the additional evaluation sample.
X_adv_fgsm_eval = ag.generate_fgsm_attacks(art_model, X_eval)

# Use a dedicated name for the evaluation preview frame so that the
# training-stage X_adv_fgsm_df from the "Generate Attacks" cell is not
# overwritten by data from a different sample.
X_adv_fgsm_eval_df = pd.DataFrame(X_adv_fgsm_eval, columns=X_eval.columns)
print(X_adv_fgsm_eval_df.iloc[:, :3].head(2))

# Kept as the last expression so its return value is displayed by the cell.
ag.evaluate_art_model(art_model, X_adv_fgsm_eval, y_eval)

Adversarial FGSM examples generated. Shape: (500, 70)
   Destination Port  Flow Duration  Total Fwd Packets
0          0.000000       0.100411                0.0
1          0.734119       0.861346                0.0
Accuracy: 0.3
              precision    recall  f1-score   support

      BENIGN       0.25      0.21      0.23       250
      ATTACK       0.33      0.39      0.36       250

   micro avg       0.30      0.30      0.30       500
   macro avg       0.29      0.30      0.29       500
weighted avg       0.29      0.30      0.29       500
 samples avg       0.30      0.30      0.30       500

Confusion Matrix: Positive == BENIGN
TN: 98, FP: 152, FN: 198, TP: 52


0.3

In [226]:
# SHAP values for the adversarial and the clean evaluation samples, computed
# with the explainer that was built on the training data.
X_eval_adv_shap_values = exp.generate_shap_values(explainer, X_adv_fgsm_eval)
X_eval_shap_values = exp.generate_shap_values(explainer, X_eval)
# Both results get the X_eval column names (X_adv_fgsm_eval was generated from X_eval).
X_eval_shap_values_df = exp.convert_shap_values_to_pd(X_eval_shap_values, X_eval.columns)
X_eval_adv_shap_values_df = exp.convert_shap_values_to_pd(X_eval_adv_shap_values, X_eval.columns)

PermutationExplainer explainer: 501it [00:32, 11.40it/s]                         
PermutationExplainer explainer: 501it [00:35, 10.17it/s]                         


In [228]:
# Combine the clean and adversarial evaluation SHAP values into detector
# inputs/labels, using the same helper as for the training and test datasets.
X_eval_detector, y_eval_detector = det.build_train_datasets(X_eval_shap_values_df, X_eval_adv_shap_values_df)
print(X_eval_detector.shape, y_eval_detector.shape)

(1000, 70) (1000, 2)


In [229]:
from sklearn.utils import shuffle

# Shuffle features and labels together with a fixed seed.
# NOTE: both names are replaced by their shuffled versions, so re-running this
# cell alone reshuffles already shuffled data.
X_eval_detector, y_eval_detector = shuffle(X_eval_detector, y_eval_detector, random_state=187)

# Detector predictions for the shuffled evaluation SHAP values.
y_pred_eval_detector = detector.predict(X_eval_detector)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [230]:
# Report detector performance on the additional evaluation dataset.
# NOTE(review): arguments are passed as (predictions, ground truth) -- confirm
# this matches the parameter order of det.evaluate_model.
det.evaluate_model(y_pred_eval_detector, y_eval_detector)

Global Accuracy: 97.00%
              precision    recall  f1-score   support

      BENIGN       0.96      0.98      0.97       500
 ADVERSARIAL       0.98      0.96      0.97       500

    accuracy                           0.97      1000
   macro avg       0.97      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000

True Negative Rate: 97.60%
False Positive Rate: 2.40%
True Positive Rate: 96.40%
False Negative Rate: 3.60%
