In [1]:
from ctgan.synthesizers.ae_gan import CTGANV2, AutoEncoderType
from aegan_utils import train_model, IntrusionDataset
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Dataset helper pointed at the intrusion data directory; later cells call
# load_intrusion() and get_Xy() on this object.
dataloader = IntrusionDataset(path="../dataset/intrusion/")

In [3]:
# Unpack the four values returned by the loader: three dataframes (named for the
# train / validation / test splits) and the list of discrete column names.
(
    intrusion_train_df,
    intrusion_valid_df,
    intrusion_test_df,
    intrusion_discrete_columns,
) = dataloader.load_intrusion()

### Hyperparameter tuning results: macro F1 score on the real intrusion validation dataset (higher is better)

### Vanilla AE

- AE dimension
    - (256, 128)
        - 0.090
    - (256, 128, 64)
        - 0.055
    - (256, 128, 64, 32)
        - 0.026
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.087
    - 128
        - 0.037
    - 256
        - 0.058
    - Best: 64

.

- G and D layers
  - 256 * 2
    - ?
  - 256 * 4
    - ?
  - Best: ?

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - ?
  - 1, 8
    - ?
  - 1, 16
    - ?
  - 5, 4
    - ?
  - 5, 8
    - ?
  - 5, 16
    - ?
  - Best: ?

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - Best: ?

### Denoising AE


- AE dimension
    - (256, 128)
        - 0.049
    - (256, 128, 64)
        - 0.048
    - (256, 128, 64, 32)
        - 0.030
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.175
    - 128
        - 0.113
    - 256
        - 0.028
    - Best: 64

.

- G and D layers
  - 256 * 2
    - ?
  - 256 * 4
    - ?
  - Best: ?

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - ?
  - 1, 8
    - ?
  - 1, 16
    - ?
  - 5, 4
    - ?
  - 5, 8
    - ?
  - 5, 16
    - ?
  - Best: ?

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - Best: ?

### Entity AE


- AE dimension
    - (256, 128)
        - 0.093
    - (256, 128, 64)
        - 0.008
    - (256, 128, 64, 32)
        - 0.029
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.138
    - 128
        - 0.011
    - 256
        - 0.134
    - Best: 64

.

- G and D layers
  - 256 * 2
    - ?
  - 256 * 4
    - ?
  - Best: ?

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - ?
  - 1, 8
    - ?
  - 1, 16
    - ?
  - 5, 4
    - ?
  - 5, 8
    - ?
  - 5, 16
    - ?
  - Best: ?

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - Best: ?

### Variational AE


- AE dimension
    - (256, 128)
        - 0.066
    - (256, 128, 64)
        - 0.072
    - (256, 128, 64, 32)
        - 0.083
    - Best: (256, 128, 64, 32)

.

- G hidden dimension
    - 64
        - 0.096
    - 128
        - 0.085
    - 256
        - 0.083
    - Best: 64

.

- G and D layers
  - 256 * 2
    - ?
  - 256 * 4
    - ?
  - Best: ?

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - ?
  - 1, 8
    - ?
  - 1, 16
    - ?
  - 5, 4
    - ?
  - 5, 8
    - ?
  - 5, 16
    - ?
  - Best: ?

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - Best: ?

In [4]:
# model = CTGANV2()
# intrusion_trans = model.transform(intrusion_train_df, discrete_columns=intrusion_discrete_columns)
# # Save data transformer and transformed data to a file
# with open("../dataset/intrusion/data_transformer.pkl", "wb") as fp:
#     pickle.dump(model._transformer, fp)
# with open("../dataset/intrusion/transformed_data.pkl", "wb") as fp:
#     pickle.dump(intrusion_trans, fp)

In [5]:
# Load data transformer and transformed data
# with open("../dataset/intrusion/data_transformer.pkl", "rb") as fp:
#     dt = pickle.load(fp)
# with open("../dataset/intrusion/transformed_data.pkl", "rb") as fp:
#     intrusion_trans = pickle.load(fp)

In [6]:
# model = CTGANV2(epochs=2)

# model.fit(intrusion_trans, discrete_columns=intrusion_discrete_columns, dt=dt, is_pre_transformed=True, target_index=intrusion_train_df.shape[1]-1)

In [7]:
# model.save(f"../models/intrusion_ae_gan_1.pkl")

In [5]:
# model = CTGANV2.load("../models/intrusion_ae_gan_1.pkl")

In [6]:
# Sample fake train and validation data, if fails then re-attempt
# intrusion_fake_df = model.sample(intrusion_train_df.shape[0] + intrusion_valid_df.shape[0])
# intrusion_fake_X, intrusion_fake_y = dataloader.get_Xy(intrusion_fake_df)
# intrusion_fake_train_X, intrusion_fake_valid_X, intrusion_fake_train_y, intrusion_fake_valid_y = train_test_split(
#     intrusion_fake_X, intrusion_fake_y, test_size=intrusion_valid_df.shape[0], random_state=1, shuffle=True, stratify=intrusion_fake_y.argmax(-1)
# )

In [4]:
# Get real validation data
# intrusion_valid_X, intrusion_valid_y = dataloader.get_Xy(intrusion_valid_df)

In [15]:
# Check ML efficacy
# test_score = train_model(
#     intrusion_fake_train_X,
#     intrusion_fake_train_y,
#     intrusion_fake_valid_X,
#     intrusion_fake_valid_y,
#     intrusion_test_X,
#     intrusion_test_y,
#     input_dim=215,
#     output_dim=23,
#     batch_size=2048,
#     num_epochs=300,
#     model_type="classification",
#     show_print_training_score=False,
#     verbose=False,
#     scorer_type="f1_macro",
# )
# print(f"Test score: {test_score}")

  3%|▎         | 10/300 [00:00<00:06, 45.09it/s]

Epoch: 1, Training Loss: 0.0959, Valid Loss: 0.0072, Valid score: 0.0978


 12%|█▏        | 35/300 [00:00<00:05, 47.35it/s]

Epoch: 26, Training Loss: 0.0006, Valid Loss: 0.0002, Valid score: 0.4736


 19%|█▉        | 57/300 [00:01<00:04, 49.19it/s]

Epoch: 51, Training Loss: 0.0004, Valid Loss: 0.0001, Valid score: 0.5599


 29%|██▊       | 86/300 [00:01<00:04, 51.00it/s]

Epoch: 76, Training Loss: 0.0004, Valid Loss: 0.0002, Valid score: 0.5190


 37%|███▋      | 110/300 [00:02<00:03, 50.98it/s]

Epoch: 101, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.5591


 44%|████▍     | 132/300 [00:02<00:03, 46.72it/s]

Epoch: 126, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.6366


 53%|█████▎    | 159/300 [00:03<00:02, 47.64it/s]

Epoch: 151, Training Loss: 0.0004, Valid Loss: 0.0002, Valid score: 0.5759


 62%|██████▏   | 185/300 [00:03<00:02, 48.52it/s]

Epoch: 176, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.6317


 70%|███████   | 210/300 [00:04<00:01, 47.29it/s]

Epoch: 201, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.6858


 77%|███████▋  | 231/300 [00:04<00:01, 47.65it/s]

Epoch: 226, Training Loss: 0.0005, Valid Loss: 0.0001, Valid score: 0.5760


 86%|████████▌ | 257/300 [00:05<00:00, 48.38it/s]

Epoch: 251, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.7114


 95%|█████████▍| 284/300 [00:05<00:00, 49.62it/s]

Epoch: 276, Training Loss: 0.0003, Valid Loss: 0.0001, Valid score: 0.6977


100%|██████████| 300/300 [00:06<00:00, 47.88it/s]

Best validation score: 0.7241637469293323
Test score: 0.17499796797030934





In [4]:
# Evaluate every saved AE-GAN checkpoint: sample synthetic data from the model,
# train a classifier on that synthetic data only, and score the classifier on
# the real validation split. The best score per model is appended to `scores`.
n_models = 12
n_sampling_attempts = 5  # how many times to re-sample when the fake data is unusable
n_training_runs = 5  # classifier trainings per model; the best score is kept

# Real validation features/labels used for scoring. Built inside this cell so it
# runs on a fresh kernel instead of relying on a previously executed cell.
intrusion_valid_X, intrusion_valid_y = dataloader.get_Xy(intrusion_valid_df)

scores = []

for i in range(n_models):
    print(f"Processing {i+1} model")
    model_file = f"../models/intrusion_ae_gan_{i+1}.pkl"
    if not os.path.exists(model_file):
        # Missing checkpoints are skipped, so `scores` only holds entries for
        # the models that were actually found on disk.
        print(f"Skipping {i+1} model")
        continue
    model = CTGANV2.load(model_file)

    # Sample fake train and validation data. Sampling or the stratified split
    # can fail for a given draw, so retry a bounded number of times.
    last_error = None
    for _attempt in range(n_sampling_attempts):
        try:
            intrusion_fake_df = model.sample(intrusion_train_df.shape[0] + intrusion_valid_df.shape[0])
            intrusion_fake_X, intrusion_fake_y = dataloader.get_Xy(intrusion_fake_df)
            intrusion_fake_train_X, intrusion_fake_valid_X, intrusion_fake_train_y, intrusion_fake_valid_y = train_test_split(
                intrusion_fake_X,
                intrusion_fake_y,
                test_size=intrusion_valid_df.shape[0],
                random_state=1,
                shuffle=True,
                stratify=intrusion_fake_y.argmax(-1),
            )
            break
        except Exception as err:
            # `except Exception` (not a bare `except`) so that KeyboardInterrupt
            # and SystemExit still stop the notebook.
            last_error = err
            print("Invalid fake data generated, trying again...")
    else:
        # Every attempt failed; chain the last error so the cause is visible.
        raise Exception(f"Failed to generate fake data after {n_sampling_attempts} tries") from last_error

    # Train several times and keep the best score for this model.
    best_test_score = -float("inf")
    for _run in range(n_training_runs):
        test_score = train_model(
            intrusion_fake_train_X,
            intrusion_fake_train_y,
            intrusion_fake_valid_X,
            intrusion_fake_valid_y,
            intrusion_valid_X,
            intrusion_valid_y,
            # NOTE(review): 215 / 23 presumably match the feature and class
            # counts produced by get_Xy -- confirm against the dataset.
            input_dim=215,
            output_dim=23,
            batch_size=2048,
            num_epochs=300,
            model_type="classification",
            show_print_training_score=False,
            verbose=False,
            scorer_type="f1_macro",
        )
        if test_score > best_test_score:
            best_test_score = test_score
    print(f"Test score: {best_test_score}")
    scores.append(best_test_score)

Processing 1 model
