In [1]:
from ctgan.synthesizers.ae_gan import CTGANV2, AutoEncoderType
from aegan_utils import train_model, IntrusionDataset
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Build the project's IntrusionDataset helper, pointed at the intrusion data directory.
# It is reused below for loading the splits and for get_Xy conversions.
dataloader = IntrusionDataset(path="../dataset/intrusion/")

In [3]:
# load_intrusion() returns four values, unpacked here as the train, validation
# and test splits plus the list of discrete column names.
# NOTE(review): the splits are presumably pandas DataFrames (only .shape is used on them below) — confirm in aegan_utils.
intrusion_train_df, intrusion_valid_df, intrusion_test_df, intrusion_discrete_columns = dataloader.load_intrusion()

### Hyperparameter tuning results, scored by macro F1 on the real intrusion validation set (higher is better)

### Vanilla AE

- AE dimension
    - (256, 128)
        - 0.090
    - (256, 128, 64)
        - 0.055
    - (256, 128, 64, 32)
        - 0.026
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.087
    - 128
        - 0.037
    - 256
        - 0.058
    - Best: 64

.

- G and D layers
  - 256 * 2
    - 0.086
  - 256 * 4
    - 0.070
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.228
  - 1, 8
    - 0.097
  - 1, 16
    - 0.061
  - 5, 4
    - 0.108
  - 5, 8
    - 0.079
  - 5, 16
    - 0.101
  - Best: D steps = 1, PAC value = 4

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.039
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - NA
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.035
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.109
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.171
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.117
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.096
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.120
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.123
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.147
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.227
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.055
  - Best: AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100

### Denoising AE


- AE dimension
    - (256, 128)
        - 0.049
    - (256, 128, 64)
        - 0.048
    - (256, 128, 64, 32)
        - 0.030
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.175
    - 128
        - 0.113
    - 256
        - 0.028
    - Best: 64

.

- G and D layers
  - 256 * 2
    - 0.168
  - 256 * 4
    - 0.074
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.003
  - 1, 8
    - 0.080
  - 1, 16
    - 0.177
  - 5, 4
    - 0.129
  - 5, 8
    - 0.022
  - 5, 16
    - 0.072
  - Best: D steps = 1, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.120
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.035
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.003
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.154
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.043
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.140
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.106
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.035
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.143
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.066
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.010
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.047
  - Best: AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50

### Entity AE


- AE dimension
    - (256, 128)
        - 0.093
    - (256, 128, 64)
        - 0.008
    - (256, 128, 64, 32)
        - 0.029
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.138
    - 128
        - 0.011
    - 256
        - 0.134
    - Best: 64

.

- G and D layers
  - 256 * 2
    - 0.09
  - 256 * 4
    - 0.163
  - Best: 256 * 4

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.120
  - 1, 8
    - 0.000
  - 1, 16
    - 0.054
  - 5, 4
    - 0.115
  - 5, 8
    - 0.100
  - 5, 16
    - 0.024
  - Best: D steps = 1, PAC value = 4

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.136
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.061
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.087
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.181
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.067
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.158
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.074
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.092
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.138
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.108
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.140
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.232
  - Best: AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50

### Variational AE


- AE dimension
    - (256, 128)
        - 0.066
    - (256, 128, 64)
        - 0.072
    - (256, 128, 64, 32)
        - 0.083
    - Best: (256, 128, 64, 32)

.

- G hidden dimension
    - 64
        - 0.096
    - 128
        - 0.085
    - 256
        - 0.083
    - Best: 64

.

- G and D layers
  - 256 * 2
    - 0.09
  - 256 * 4
    - 0.06
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.071
  - 1, 8
    - 0.100
  - 1, 16
    - 0.123
  - 5, 4
    - 0.132
  - 5, 8
    - 0.075
  - 5, 16
    - 0.073
  - Best: D steps = 5, PAC value = 4

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.082
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.075
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.123
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.119
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.093
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.070
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.089
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.130
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.069
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.078
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.079
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.073
  - Best: AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50

In [None]:
# model = CTGANV2()
# intrusion_trans = model.transform(intrusion_train_df, discrete_columns=intrusion_discrete_columns)
# # Save data transformer and transformed data to a file
# with open("../dataset/intrusion/data_transformer.pkl", "wb") as fp:
#     pickle.dump(model._transformer, fp)
# with open("../dataset/intrusion/transformed_data.pkl", "wb") as fp:
#     pickle.dump(intrusion_trans, fp)

In [None]:
# Load data transformer and transformed data
# with open("../dataset/intrusion/data_transformer.pkl", "rb") as fp:
#     dt = pickle.load(fp)
# with open("../dataset/intrusion/transformed_data.pkl", "rb") as fp:
#     intrusion_trans = pickle.load(fp)

In [None]:
# model = CTGANV2(epochs=2, ae_epochs=1)

# model.fit(intrusion_trans, discrete_columns=intrusion_discrete_columns, dt=dt, is_pre_transformed=True, target_index=intrusion_train_df.shape[1]-1)

In [None]:
# model.save(f"../models/intrusion_ae_gan_1.pkl")

In [4]:
# Restore a previously saved CTGANV2 model from the checkpoint file
# tuned_intrusion_denoising.pkl instead of training one in this session.
# NOTE(review): a .pkl checkpoint is presumably unpickled by CTGANV2.load —
# only load files from a trusted source.
model = CTGANV2.load("../models/tuned_intrusion_denoising.pkl")

In [5]:
# Draw one synthetic row for every real train + validation row, then carve the
# synthetic set into train/validation parts whose sizes match the real splits.
# There is no automatic retry in this cell: if sampling or the stratified
# split raises, run the cell again.
intrusion_fake_df = model.sample(
    intrusion_train_df.shape[0] + intrusion_valid_df.shape[0]
)
intrusion_fake_X, intrusion_fake_y = dataloader.get_Xy(intrusion_fake_df)
(
    intrusion_fake_train_X,
    intrusion_fake_valid_X,
    intrusion_fake_train_y,
    intrusion_fake_valid_y,
) = train_test_split(
    intrusion_fake_X,
    intrusion_fake_y,
    # Hold out exactly as many synthetic rows as there are real validation rows.
    test_size=intrusion_valid_df.shape[0],
    random_state=1,
    shuffle=True,
    # Stratify on the argmax over the last axis of y.
    stratify=intrusion_fake_y.argmax(-1),
)

In [None]:
# Get real validation data: convert the real validation split into an
# (X, y) pair with the same get_Xy helper used for the synthetic data.
# NOTE(review): this cell is shown unexecuted (In [None]), yet the
# model-comparison loop at the end of the notebook reads intrusion_valid_X / intrusion_valid_y —
# run it before that loop.
intrusion_valid_X, intrusion_valid_y = dataloader.get_Xy(intrusion_valid_df)

In [6]:
# Get real test data: convert the real test split into an (X, y) pair;
# it is passed to train_model below as the evaluation set.
intrusion_test_X, intrusion_test_y = dataloader.get_Xy(intrusion_test_df)

In [7]:
# Check ML efficacy: call train_model with the synthetic train split, the
# synthetic validation split and the real test split (in that order), then
# print the score it returns.
# NOTE(review): the role of each positional argument is inferred from the
# variable names — confirm against train_model's signature in aegan_utils.
# NOTE(review): input_dim=215 and output_dim=23 are hard-coded; presumably the
# feature width and the number of classes produced by dataloader.get_Xy — TODO confirm.
test_score = train_model(
    intrusion_fake_train_X,
    intrusion_fake_train_y,
    intrusion_fake_valid_X,
    intrusion_fake_valid_y,
    intrusion_test_X,
    intrusion_test_y,
    input_dim=215,
    output_dim=23,
    batch_size=2048,
    num_epochs=300,
    model_type="classification",
    show_print_training_score=False,
    verbose=False,
    scorer_type="f1_macro",
)
print(f"Test score: {test_score}")

  2%|▏         | 6/300 [00:00<00:27, 10.87it/s]

Epoch: 1, Training Loss: 0.0433, Valid Loss: 0.0030, Valid score: 0.3015


 10%|█         | 31/300 [00:01<00:07, 36.08it/s]

Epoch: 26, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.9753


 19%|█▊        | 56/300 [00:01<00:05, 43.59it/s]

Epoch: 51, Training Loss: 0.0003, Valid Loss: 0.0000, Valid score: 0.9761


 27%|██▋       | 81/300 [00:02<00:04, 44.56it/s]

Epoch: 76, Training Loss: 0.0002, Valid Loss: 0.0000, Valid score: 0.9838


 35%|███▌      | 106/300 [00:02<00:04, 46.97it/s]

Epoch: 101, Training Loss: 0.0003, Valid Loss: 0.0000, Valid score: 0.9881


 44%|████▍     | 132/300 [00:03<00:03, 47.65it/s]

Epoch: 126, Training Loss: 0.0004, Valid Loss: 0.0000, Valid score: 0.9827


 53%|█████▎    | 159/300 [00:04<00:02, 48.05it/s]

Epoch: 151, Training Loss: 0.0002, Valid Loss: 0.0001, Valid score: 0.9888


 61%|██████▏   | 184/300 [00:04<00:02, 41.66it/s]

Epoch: 176, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.9886


 70%|██████▉   | 209/300 [00:05<00:01, 46.14it/s]

Epoch: 201, Training Loss: 0.0000, Valid Loss: 0.0000, Valid score: 0.9922


 76%|███████▋  | 229/300 [00:05<00:01, 40.95it/s]

Epoch: 226, Training Loss: 0.0000, Valid Loss: 0.0000, Valid score: 0.9923


 87%|████████▋ | 260/300 [00:06<00:00, 47.28it/s]

Epoch: 251, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.9850


 95%|█████████▌| 285/300 [00:06<00:00, 47.20it/s]

Epoch: 276, Training Loss: 0.0000, Valid Loss: 0.0000, Valid score: 0.9923


100%|██████████| 300/300 [00:07<00:00, 41.50it/s]

Best validation score: 0.9938615862956602
Test score: 0.020918892904044032





In [None]:
# Evaluate every saved AE-GAN checkpoint (../models/intrusion_ae_gan_<k>.pkl):
# sample synthetic data from it, train a classifier on that data several
# times, and record the best score obtained on the REAL VALIDATION split.
#
# Requires intrusion_valid_X / intrusion_valid_y (from the "Get real validation
# data" cell) in addition to dataloader and the intrusion_*_df splits.
#
# NOTE: checkpoints that do not exist are skipped, so `scores` can be shorter
# than n_models and its positions do not map one-to-one to model numbers.
n_models = 12
max_sample_tries = 5  # sampling attempts per model before giving up
n_eval_runs = 5  # classifier trainings per model; only the best score is kept

scores = []

for i in range(n_models):
    print(f"Processing {i+1} model")
    model_file = f"../models/intrusion_ae_gan_{i+1}.pkl"
    if not os.path.exists(model_file):
        print(f"Skipping {i+1} model")
        continue
    model = CTGANV2.load(model_file)

    # Sample fake train and validation data. Sampling or the stratified split
    # can fail on a bad draw, so retry a bounded number of times.
    last_error = None
    for _attempt in range(max_sample_tries):
        try:
            intrusion_fake_df = model.sample(
                intrusion_train_df.shape[0] + intrusion_valid_df.shape[0]
            )
            intrusion_fake_X, intrusion_fake_y = dataloader.get_Xy(intrusion_fake_df)
            (
                intrusion_fake_train_X,
                intrusion_fake_valid_X,
                intrusion_fake_train_y,
                intrusion_fake_valid_y,
            ) = train_test_split(
                intrusion_fake_X,
                intrusion_fake_y,
                test_size=intrusion_valid_df.shape[0],
                random_state=1,
                shuffle=True,
                stratify=intrusion_fake_y.argmax(-1),
            )
            break
        except Exception as err:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            print("Invalid fake data generated, trying again...")
            last_error = err
    else:
        # Every attempt failed: surface the last underlying error as the cause.
        raise Exception(
            f"Failed to generate fake data after {max_sample_tries} tries"
        ) from last_error

    best_test_score = -float("inf")
    # Use a throwaway loop variable so the outer model index `i` is not rebound.
    for _run in range(n_eval_runs):
        # The last two positional arguments are the real validation split, so
        # despite its name this "test" score is a validation score used for
        # model selection.
        test_score = train_model(
            intrusion_fake_train_X,
            intrusion_fake_train_y,
            intrusion_fake_valid_X,
            intrusion_fake_valid_y,
            intrusion_valid_X,
            intrusion_valid_y,
            input_dim=215,
            output_dim=23,
            batch_size=2048,
            num_epochs=300,
            model_type="classification",
            show_print_training_score=False,
            verbose=False,
            scorer_type="f1_macro",
        )
        best_test_score = max(best_test_score, test_score)
    print(f"Test score: {best_test_score}")
    scores.append(best_test_score)