In [1]:
from ctgan.synthesizers.ae_gan import CTGANV2, AutoEncoderType
from aegan_utils import load_covtype, get_covtype_Xy, train_model
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import numpy as np
import pickle
import os

### covtype dataset

In [2]:
# Load the covtype splits plus the list of discrete columns from the dataset folder.
(
    covtype_train_df,
    covtype_valid_df,
    covtype_test_df,
    covtype_discrete_columns,
) = load_covtype(path="../dataset/covtype/")

### Tune based on F1 macro score on real covtype validation dataset

### Vanilla AE

- AE dimension
    - (256, 128)
        - 0.12 
    - (256, 128, 64)
        - 0.10
    - (256, 128, 64, 32)
        - 0.11
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.10
    - 128
        - 0.12
    - 256
        - 0.08
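    - Best: 64 (NOTE: inconsistent with the scores listed above, where 128 scores highest at 0.12 — verify which entry is correct)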

.

- G and D layers
  - 256 * 2
    - 0.15
  - 256 * 4
    - 0.16
  - Best: 256 * 4

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.134
  - 1, 8
    - 0.124
  - 1, 16
    - 0.139
  - 5, 4
    - 0.126
  - 5, 8
    - 0.113
  - 5, 16
    - 0.162
  - Best: D steps = 5, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.143
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.142
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.140
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.163
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.144
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.181
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.134
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.120
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.124 
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.135
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.134
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.076
  - Best: AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50

### Denoising AE

- AE dimension
    - (256, 128)
        - 0.13
    - (256, 128, 64)
        - 0.06
    - (256, 128, 64, 32)
        - 0.009
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.11
    - 128
        - 0.08
    - 256
        - 0.07
    - Best: 64

.

- G and D layers
  - 256 * 2
    - 0.10
  - 256 * 4
    - 0.06
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.122
  - 1, 8
    - 0.103
  - 1, 16
    - 0.116
  - 5, 4
    - 0.137
  - 5, 8
    - 0.138
  - 5, 16
    - 0.062
  - Best: D steps = 5, PAC value = 8

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.092
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.083
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.084
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.017
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.099
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.083
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.114
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.117
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.126
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.133
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.080
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.106
  - Best: AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50

.

- Best
  - TODO: not yet determined

### Entity embedding AE

- AE dimension
    - (256, 128)
        - 0.15
    - (256, 128, 64)
        - 0.10
    - (256, 128, 64, 32)
        - 0.08
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - 0.06
    - 128
        - 0.16
    - 256
        - 0.10
    - Best: 128

.

- G and D layers
  - 256 * 2
    - 0.14
  - 256 * 4
    - 0.13
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.098
  - 1, 8
    - 0.140
  - 1, 16
    - 0.123
  - 5, 4
    - 0.071
  - 5, 8
    - 0.157
  - 5, 16
    - 0.160
  - Best: D steps = 5, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.126
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.126
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.133
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.151
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.143
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.148
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.148
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.135
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.133
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.134
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.120
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.143
  - Best: AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50

### Variational AE

- AE dimension
    - (256, 128)
        - 0.0980
    - (256, 128, 64)
        - 0.0989
    - (256, 128, 64, 32)
        - 0.0936
    - Best: (256, 128, 64)

.

- G hidden dimension
    - 64
        - 0.09
    - 128
        - 0.11
    - 256
        - 0.09
    - Best: 128

.

- G and D layers
  - 256 * 2
    - 0.10
  - 256 * 4
    - 0.09
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - 0.098
  - 1, 8
    - 0.102
  - 1, 16
    - 0.095
  - 5, 4
    - 0.093
  - 5, 8
    - 0.093
  - 5, 16
    - 0.093
  - Best: D steps = 1, PAC value = 8

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.093
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.093
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.103
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.093
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.101
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.093
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.093
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.094
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.096
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.093
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - 0.093
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - 0.105
  - Best: AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50

In [3]:
# model = CTGANV2()
# covtype_trans = model.transform(covtype_train_df, discrete_columns=covtype_discrete_columns)
# # Save data transformer and transformed data to a file
# with open("../dataset/covtype/data_transformer.pkl", "wb") as fp:
#     pickle.dump(model._transformer, fp)
# with open("../dataset/covtype/transformed_data.pkl", "wb") as fp:
#     pickle.dump(covtype_trans, fp)

In [3]:
# Load the cached data transformer (`dt`) and pre-transformed training data
# (`covtype_trans`). Both names are required by the training cell, so this cell
# must run on a fresh kernel. If the cache files are missing they are rebuilt
# once from `covtype_train_df` and written back to disk.
transformer_path = "../dataset/covtype/data_transformer.pkl"
transformed_path = "../dataset/covtype/transformed_data.pkl"

if os.path.exists(transformer_path) and os.path.exists(transformed_path):
    # NOTE: pickle.load executes arbitrary code; only load cache files that
    # were produced locally by this notebook.
    with open(transformer_path, "rb") as fp:
        dt = pickle.load(fp)
    with open(transformed_path, "rb") as fp:
        covtype_trans = pickle.load(fp)
else:
    # Cache miss: transform the raw training data and persist both artefacts.
    transform_model = CTGANV2()
    covtype_trans = transform_model.transform(
        covtype_train_df, discrete_columns=covtype_discrete_columns
    )
    dt = transform_model._transformer
    with open(transformer_path, "wb") as fp:
        pickle.dump(dt, fp)
    with open(transformed_path, "wb") as fp:
        pickle.dump(covtype_trans, fp)

In [4]:
# Train one AE-GAN per hyper-parameter configuration and save each model as
# ../models/covtype_ae_gan_<idx>.pkl.
# NOTE(review): `covtype_trans` and `dt` must already be defined in the kernel
# before this cell runs; otherwise `model.fit` raises NameError.
idx = 7  # number given to the first model trained by this cell
n_runs = 2  # number of configurations trained by this cell
default_epochs = 100  # value passed as `epochs` unless a configuration sets its own

# Earlier sweep (learning rates / batch sizes / epochs), kept for reference:
# param_dict = {
#     "ae_type": [AutoEncoderType.VARIATIONAL for i in range(n_runs)],
#     "ae_dim": [(256, 128) for i in range(n_runs)],
#     "embedding_dim": [128 for i in range(n_runs)],
#     "generator_dim": [(256, 256) for i in range(n_runs)],
#     "discriminator_steps": [5 for i in range(n_runs)],
#     "pac": [8 for i in range(n_runs)],
#     "autoencoder_lr": [1e-3, 1e-3],
#     "generator_lr": [2e-4, 2e-4],
#     "discriminator_lr": [2e-4, 2e-4],
#     "batch_size": [512, 256],
#     "ae_batch_size": [512, 256],
#     "epochs": [300, 150],
#     "ae_epochs": [100, 50],
# }
param_dict = {
    "ae_type": [AutoEncoderType.VARIATIONAL for i in range(n_runs)],
    "ae_dim": [(256, 128, 64) for i in range(n_runs)],
    "embedding_dim": [128 for i in range(n_runs)],
    "generator_dim": [(256, 256), (256, 256, 256, 256)],
    "discriminator_dim": [(256, 256), (256, 256, 256, 256)]
}

# Each hyper-parameter needs exactly one value per run. Checking up front avoids
# an IndexError halfway through a long sweep and catches silently ignored values.
for key, values in param_dict.items():
    if len(values) != n_runs:
        raise ValueError(
            f"param_dict[{key!r}] has {len(values)} values, expected n_runs={n_runs}"
        )

for run in range(n_runs):
    print(f"Starting with model number: {idx}")

    # Build the keyword arguments for this run. Merging into a dict lets a
    # per-run "epochs" override the default instead of raising
    # "got multiple values for keyword argument 'epochs'".
    run_kwargs = {"epochs": default_epochs}
    run_kwargs.update({key: values[run] for key, values in param_dict.items()})

    model = CTGANV2(**run_kwargs)

    model.fit(covtype_trans, discrete_columns=covtype_discrete_columns, dt=dt, is_pre_transformed=True, target_index=covtype_train_df.shape[1]-1)

    model.save(
        f"../models/covtype_ae_gan_{idx}.pkl"
    )

    idx += 1

Starting with model number: 7
Training AE


100%|██████████| 100/100 [00:12<00:00,  8.21it/s]


Training AE-GAN


100%|██████████| 100/100 [06:08<00:00,  3.69s/it]


Starting with model number: 8
Training AE


100%|██████████| 100/100 [00:12<00:00,  8.25it/s]


Training AE-GAN


100%|██████████| 100/100 [06:44<00:00,  4.04s/it]


In [6]:
# model = CTGANV2(epochs=2)

# model.fit(covtype_trans, discrete_columns=covtype_discrete_columns, dt=dt, is_pre_transformed=True)

# model.save(f"../models/covtype_ae_gan_1.pkl")

In [7]:
# model = CTGANV2.load("../models/covtype_ae_gan_1.pkl")

In [8]:
# Sample fake train and validation data
# covtype_fake_df = model.sample(covtype_train_df.shape[0] + covtype_valid_df.shape[0])
# covtype_fake_X, covtype_fake_y = get_covtype_Xy(covtype_fake_df)
# covtype_fake_train_X, covtype_fake_valid_X, covtype_fake_train_y, covtype_fake_valid_y = train_test_split(
#     covtype_fake_X, covtype_fake_y, test_size=covtype_valid_df.shape[0], random_state=1, shuffle=True, stratify=covtype_fake_y.argmax(-1)
# )

In [3]:
# Split the real validation frame into features and labels; the evaluation loop
# passes both to `train_model`.
(
    covtype_valid_X,
    covtype_valid_y,
) = get_covtype_Xy(covtype_valid_df)

In [10]:
# Check ML efficacy
# test_score = train_model(
#     covtype_fake_train_X,
#     covtype_fake_train_y,
#     covtype_fake_valid_X,
#     covtype_fake_valid_y,
#     covtype_valid_X,
#     covtype_valid_y,
#     input_dim=54,
#     output_dim=7,
#     batch_size=256,
#     num_epochs=100,
#     model_type="classification",
# )
# print(f"Test score: {test_score}")

In [None]:
# Score every saved AE-GAN model: sample synthetic data from it, train a
# classifier on that data several times, and keep the best score returned by
# `train_model` (called with scorer_type="f1_macro" and the real validation
# split as its last two positional arguments).
n_models = 48
n_repeats = 5  # classifier trainings per model; only the best score is kept

scores = []  # best score of each model that was found, in model-number order
scores_by_model = {}  # model number -> best score; stays correct when files are skipped

for model_num in range(1, n_models + 1):
    print(f"Processing {model_num} model")
    model_file = f"../models/covtype_ae_gan_{model_num}.pkl"
    if not os.path.exists(model_file):
        print(f"Skipping {model_num} model")
        continue
    model = CTGANV2.load(model_file)
    # Sample fake train and validation data
    covtype_fake_df = model.sample(covtype_train_df.shape[0] + covtype_valid_df.shape[0])
    covtype_fake_X, covtype_fake_y = get_covtype_Xy(covtype_fake_df)
    # NOTE(review): unlike the commented-out single-model cell, this split does
    # not pass `stratify=`; confirm that is intentional.
    covtype_fake_train_X, covtype_fake_valid_X, covtype_fake_train_y, covtype_fake_valid_y = train_test_split(
        covtype_fake_X, covtype_fake_y, test_size=covtype_valid_df.shape[0], random_state=1, shuffle=True
    )

    best_test_score = -float("inf")
    # `_` instead of `i`, so the repeat counter no longer shadows the model index.
    for _ in range(n_repeats):
        test_score = train_model(
            covtype_fake_train_X,
            covtype_fake_train_y,
            covtype_fake_valid_X,
            covtype_fake_valid_y,
            covtype_valid_X,
            covtype_valid_y,
            input_dim=54,
            output_dim=7,
            batch_size=256,
            num_epochs=50,
            model_type="classification",
            show_print_training_score=False,
            verbose=False,
            scorer_type="f1_macro"
        )
        if test_score > best_test_score:
            best_test_score = test_score
    print(f"Test score: {best_test_score}")
    scores.append(best_test_score)
    scores_by_model[model_num] = best_test_score