In [1]:
from ctgan.synthesizers.ae_gan import CTGANV2, AutoEncoderType
from aegan_utils import load_news, get_news_Xy, train_model
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import numpy as np
import pickle
import os

## news dataset

In [2]:
# Unpack the four values returned by load_news for the news dataset directory:
# the train / validation / test splits and the discrete column names.
(
    news_train_df,
    news_valid_df,
    news_test_df,
    news_discrete_columns,
) = load_news(path="../dataset/news/")

### R2 score on real news validation dataset

### Vanilla AE

- AE dimension
    - (256, 128)
        - -0.09
    - (256, 128, 64)
        - -0.09
    - (256, 128, 64, 32)
        - -0.04
    - Best: (256, 128, 64, 32)

.

- G hidden dimension
    - 64
        - -0.24
    - 128
        - -0.07
    - 256
        - -1.36
    - Best: 128

.

- G and D layers
  - 256 * 2
    - -0.01
  - 256 * 4
    - -0.1
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - -0.32
  - 1, 8
    - -0.34
  - 1, 16
    - -0.12
  - 5, 4
    - -0.04
  - 5, 8
    - -0.20
  - 5, 16
    - -0.02
  - Best: D steps = 5, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - -0.19
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - -0.46
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - -0.12
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - -0.17
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - -0.50
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - NA
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - NA
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - NA
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
      - -0.02
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - -0.08
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - -0.04
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - 1.50 (TODO: verify — R2 cannot exceed 1, so this is probably -1.50)
  - Best: AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100

.

- Best
  - AE dimension
    - (256, 128, 64, 32)
  - G hidden dimension
    - 128
  - G and D layers
    - 256 * 2
  - D steps
    - 5
  - PAC
    - 16
  - AE LR
    - 1e-3
  - GAN LR
    - 1e-5
  - Batch size
    - 512
  - GAN epochs
    - 300
  - AE epochs
    - 100

### DENOISING AE

- AE dimension
    - (256, 128)
        - -5.71
    - (256, 128, 64)
        - -0.28
    - (256, 128, 64, 32)
        - -1.26
    - Best: (256, 128, 64)

.

- G hidden dimension
    - 64
      - -2.06
    - 128
      - -0.16
    - 256
      - -2.40
    - Best: 128

.

- G and D layers
  - 256 * 2
    - -0.50
  - 256 * 4
    - -0.06
  - Best: 256 * 4

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - -0.04
  - 1, 8
    - -0.18
  - 1, 16
    - -0.01
  - 5, 4
    - -0.51
  - 5, 8
    - -0.06
  - 5, 16
    - -0.14
  - Best: D steps = 1, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - Best: ?

### ENTITY AE

- AE dimension
    - (256, 128)
        - -0.03
    - (256, 128, 64)
        - -0.02
    - (256, 128, 64, 32)
        - -0.26
    - Best: (256, 128, 64)

.

- G hidden dimension
    - 64
        - -0.35
    - 128
        - -0.05
    - 256
        - -0.19
    - Best: 128

.

- G and D layers
  - 256 * 2
    - -0.39
  - 256 * 4
    - -0.34
  - Best: 256 * 4

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - -0.19
  - 1, 8
    - -0.67
  - 1, 16
    - -0.01
  - 5, 4
    - -0.03
  - 5, 8
    - -0.10
  - 5, 16
    - -0.52
  - Best: D steps = 1, PAC value = 16

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - Best: ?

### VARIATIONAL AE

- AE dimension
    - (256, 128)
        - -0.13
    - (256, 128, 64)
        - -0.29
    - (256, 128, 64, 32)
        - -0.17
    - Best: (256, 128)

.

- G hidden dimension
    - 64
        - -0.19
    - 128
        - -0.05
    - 256
        - -0.21
    - Best: 128

.

- G and D layers
  - 256 * 2
    - -0.10
  - 256 * 4
    - -0.15
  - Best: 256 * 2

.

- D steps (1, 5) and PAC value (4, 8, 16)
  - 1, 4
    - -0.14
  - 1, 8
    - -0.30
  - 1, 16
    - -0.14
  - 5, 4
    - -0.22
  - 5, 8
    - -0.08
  - 5, 16
    - -0.12
  - Best: D steps = 5, PAC value = 8

.

- LR, batch size, epochs
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 1e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
    - ?
  - AE LR: 2e-4, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 2e-4, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 1e-5, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 512, GAN epochs: 300, AE epochs: 100
      - ?
  - AE LR: 1e-3, GAN LR: 2e-4, batch size: 256, GAN epochs: 150, AE epochs: 50
      - ?
  - Best: ?

In [3]:
# synthesizer = "ae_gan_vanilla"

In [4]:
# model = CTGANV2(
#     ae_type=AutoEncoderType.VANILLA,
#     ae_dim=(256, 128),
# )

In [5]:
# news_trans = model.transform(news_train_df, discrete_columns=news_discrete_columns)

In [6]:
# Save data transformer and transformed data to a file
# with open("../dataset/news/data_transformer.pkl", "wb") as fp:
#     pickle.dump(model._transformer, fp)
# with open("../dataset/news/transformed_data.pkl", "wb") as fp:
#     pickle.dump(news_trans, fp)

In [7]:
# Restore the cached artefacts from disk rather than re-running the transform:
#   dt         <- ../dataset/news/data_transformer.pkl
#   news_trans <- ../dataset/news/transformed_data.pkl
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open("../dataset/news/data_transformer.pkl", mode="rb") as transformer_file:
    dt = pickle.load(transformer_file)

with open("../dataset/news/transformed_data.pkl", mode="rb") as transformed_file:
    news_trans = pickle.load(transformed_file)

In [8]:
# Hyper-parameter sweep: build, fit and save one CTGANV2 model per run.
# Each model is written to ../models/news_ae_gan_{idx}.pkl, with `idx`
# incremented after every save so successive runs get consecutive file numbers.
idx = 35  # number used in the filename of the first model saved by this cell
n_runs = 2

# One entry per CTGANV2 keyword argument; position i in each list is the value
# used for run i. Settings shared by all runs are repeated n_runs times, while
# the per-run settings are spelled out and must contain exactly n_runs values.
param_dict = {
    "ae_type": [AutoEncoderType.VARIATIONAL] * n_runs,
    "ae_dim": [(256, 128)] * n_runs,
    "embedding_dim": [128] * n_runs,
    "generator_dim": [(256, 256)] * n_runs,
    "discriminator_steps": [5] * n_runs,
    "pac": [8] * n_runs,
    "autoencoder_lr": [1e-3, 1e-3],
    "generator_lr": [2e-4, 2e-4],
    "discriminator_lr": [2e-4, 2e-4],
    "batch_size": [512, 256],
    "ae_batch_size": [512, 256],
    "epochs": [300, 150],
    "ae_epochs": [100, 50],
}

# Fail before any training starts if a hand-written list is out of step with
# n_runs; otherwise the loop would raise IndexError part-way through the sweep
# or silently ignore the extra configurations.
mismatched = {
    key: len(values) for key, values in param_dict.items() if len(values) != n_runs
}
if mismatched:
    raise ValueError(
        f"Every param_dict entry needs exactly {n_runs} values; got lengths {mismatched}"
    )

for i in range(n_runs):
    # Pick the i-th value of every hyper-parameter for this run.
    inner_param_dict = {key: values[i] for key, values in param_dict.items()}

    model = CTGANV2(**inner_param_dict)

    # news_trans / dt are the already-transformed data and the transformer
    # object loaded from the pickle files, hence is_pre_transformed=True.
    model.fit(news_trans, discrete_columns=news_discrete_columns, dt=dt, is_pre_transformed=True)

    model.save(f"../models/news_ae_gan_{idx}.pkl")

    idx += 1

TypeError: ctgan.synthesizers.ae_gan.CTGANV2() got multiple values for keyword argument 'epochs'

In [None]:
# model.fit(news_trans, discrete_columns=news_discrete_columns, dt=dt, is_pre_transformed=True, epochs=1)

In [None]:
# now = datetime.datetime.now()
# current_time = now.strftime("%d-%m-%Y-%H-%M-%S")
# model.save(
#     f"../models/news_{synthesizer}.pkl"
# )

In [None]:
# model = CTGANV2.load("../models/news_ae_gan_vanilla_3.pkl")

In [None]:
# Sample fake train and validation data
# news_fake_df = model.sample(news_train_df.shape[0] + news_valid_df.shape[0])
# news_fake_X, news_fake_y = get_news_Xy(news_fake_df)
# news_fake_train_X, news_fake_valid_X, news_fake_train_y, news_fake_valid_y = train_test_split(
#     news_fake_X, news_fake_y, test_size=news_valid_df.shape[0], random_state=1, shuffle=True
# )

In [None]:
# Get real validation data
# news_valid_X, news_valid_y = get_news_Xy(news_valid_df)

In [None]:
# Check ML efficacy
# test_score = train_model(
#     news_fake_train_X,
#     news_fake_train_y,
#     news_fake_valid_X,
#     news_fake_valid_y,
#     news_valid_X,
#     news_valid_y,
#     input_dim=59,
#     output_dim=1,
#     batch_size=256,
#     num_epochs=100,
#     model_type="regression",
#     show_print_training_score=False,
#     scorer_type="r2",
#     verbose=False
# )
# print(f"Test score: {test_score}")

In [None]:
# news_valid_X, news_valid_y = get_news_Xy(news_valid_df)

# n_models = 18

# scores = []

# for i in range(n_models):
#     print(f"Processing {i+1} model")
#     model_file = f"../models/news_ae_gan_{i+1}.pkl"
#     if not os.path.exists(model_file):
#         print(f"Skipping {i+1} model")
#         continue
#     model = CTGANV2.load(model_file)
#     # Sample fake train and validation data
#     news_fake_df = model.sample(news_train_df.shape[0] + news_valid_df.shape[0])
#     news_fake_X, news_fake_y = get_news_Xy(news_fake_df)
#     news_fake_train_X, news_fake_valid_X, news_fake_train_y, news_fake_valid_y = train_test_split(
#         news_fake_X, news_fake_y, test_size=news_valid_df.shape[0], random_state=1, shuffle=True
#     )

#     best_test_score = -float("inf")
#     for i in range(5):
#         test_score = train_model(
#             news_fake_train_X,
#             news_fake_train_y,
#             news_fake_valid_X,
#             news_fake_valid_y,
#             news_valid_X,
#             news_valid_y,
#             input_dim=59,
#             output_dim=1,
#             batch_size=256,
#             num_epochs=100,
#             model_type="regression",
#             show_print_training_score=False,
#             scorer_type="r2",
#             verbose=False
#         )
#         if test_score > best_test_score:
#             best_test_score = test_score
#     print(f"Test score: {best_test_score}")
#     scores.append(best_test_score)