from utils import write_transform_to_file
# One-time setup for the 'adult' dataset. A later cell calls
# load_transform_from_file('adult'), which presumably reads back what this
# call writes (confirm in utils), so this cell can be skipped on re-runs.
write_transform_to_file('adult') # needs to be run only once per dataset

#### Load the data transformer and the transformed data

In [1]:
import json
import os

from ctgan.synthesizers.ae_gan import CTGANV2
from benchmark import train_model
from utils import load_transform_from_file, read_target_idx, sample_fake_adult


# Autoencoder variants that every search stage iterates over.
AE_TYPES = ['vanilla', 'denoising', 'vae', 'ee']

# Candidate values per hyper-parameter. The "(n)" tag on each entry is the
# number of the search stage (notebook section) that tunes it.
HP = dict(
    ae_dim=[(256, 128), (256, 128, 64), (256, 128, 64, 32)],  # (1)
    embedding_dim=[64, 128, 256],  # (2)
    gan_layers=[(256, 256), (256, 256, 256, 256)],  # (3)
    discriminator_steps=[1, 5],  # (4)
    pac=[5, 10, 15],  # (4)
    gan_lr=[1e-5, 2e-4],  # (5)
    gan_batch_epoch=[(256, 150), (512, 300)],  # (5)
    ae_lr=[1e-4, 2e-4, 1e-3],  # (5)
    ae_batch_epoch=[(256, 40), (512, 100)],  # (5)
)

# Resume from previously saved scores when the results file exists; otherwise
# start with an empty table: one entry per AE type, every hyper-parameter
# mapped to None until its stage has been run.
if not os.path.exists("adult_opt_hp.json"):
    hp_vals = {ae_type: {name: None for name in HP} for ae_type in AE_TYPES}
else:
    with open("adult_opt_hp.json", 'r') as f:
        hp_vals = json.loads(f.read())

# Load the data transformer (dt) and the transformed data (td), then the
# index of the target column, all for the 'adult' dataset.
dt, td = load_transform_from_file('adult')
target_idx = read_target_idx('adult')

Data transformer loaded
Transformed data loaded


#### (1) Find the optimal AE dimension

In [2]:
# Stage (1): for every AE type, train (or load a cached) AE-GAN for each
# candidate `ae_dim`, benchmark it, and record the best score per candidate
# in hp_vals[ae_type]['ae_dim'].
hp = 'ae_dim'

for ae_type in AE_TYPES:
    hp_scores = dict()
    for v in HP[hp]:
        # Tuple -> "256_128"-style key; used both in the checkpoint file name
        # and as the key under which the score is stored in the results JSON.
        k = '_'.join(str(i) for i in v)
        model_path = f"../../models/ae_gan_{ae_type}_{hp}_{k}.pth"

        if not os.path.exists(model_path):
            # No cached checkpoint: train the AE_GAN and save it.
            print(f"Training CTGAN: {ae_type} - {hp} - {v}")
            ae_gan = CTGANV2(ae_type=ae_type, ae_dim=v)
            ae_gan.fit(td['train'], dt=dt, is_transformed=True, target_index=target_idx)
            print("Training Complete!")
            ae_gan.save(model_path)
        else:
            print(f"Loading CTGAN: {ae_type} - {hp} - {v}")
            ae_gan = CTGANV2().load(model_path)

        # Sample fake data from the fitted model.
        fake_data, real_data = sample_fake_adult(dt, td, ae_gan)

        print("Benchmarking on MLP100")
        # Benchmark three times and keep the best score.
        # NOTE(review): fake_data[0..3] / real_data[0..1] are presumably the
        # fake train/validation splits and the real test split -- confirm
        # against utils.sample_fake_adult and benchmark.train_model.
        #
        # Each score is converted with .item() as soon as it is returned and
        # compared against a float, so the stored value is always a plain
        # Python number. Previously `.item()` was called on `best_score`
        # after the loop, which raised AttributeError whenever no run scored
        # above the initial int 0.
        best_score = 0.0
        for _ in range(3):
            test_score = train_model(
                fake_data[0],
                fake_data[1],
                fake_data[2],
                fake_data[3],
                real_data[0],
                real_data[1],
                batch_size=256,
                num_epochs=100,
                model_type="classification",
                verbose=False,
                scorer_type="accuracy",
            ).item()
            if test_score > best_score:
                best_score = test_score
        print("Benchmark Complete!")
        hp_scores[k] = best_score

    # Persist after each AE type so a long run can be resumed.
    hp_vals[ae_type][hp] = hp_scores
    with open("adult_opt_hp.json", 'w') as f:
        f.write(json.dumps(hp_vals))

Loading CTGAN: vanilla - ae_dim - (256, 128)
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8885135054588318


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.9006449580192566


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.9055589437484741
Benchmark Complete!
<class 'float'>
Loading CTGAN: vanilla - ae_dim - (256, 128, 64)
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Loading CTGAN: vanilla - ae_dim - (256, 128, 64, 32)
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Training CTGAN: denoising - ae_dim - (256, 128)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8991093635559082


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.896652340888977


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.9018734693527222
Benchmark Complete!
<class 'float'>
Training CTGAN: denoising - ae_dim - (256, 128, 64)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.898034393787384


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.9077088236808777


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8992629051208496
Benchmark Complete!
<class 'float'>
Training CTGAN: denoising - ae_dim - (256, 128, 64, 32)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Training CTGAN: vae - ae_dim - (256, 128)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Training CTGAN: vae - ae_dim - (256, 128, 64)
Training Complete!
Benchmarking on MLP100


  fake_df[col_name] = 0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Training CTGAN: vae - ae_dim - (256, 128, 64, 32)
Training Complete!
Benchmarking on MLP100


  fake_df[col_name] = 0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
Training CTGAN: ee - ae_dim - (256, 128)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8929668068885803


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8657862544059753


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.8650184273719788
Benchmark Complete!
<class 'float'>
Training CTGAN: ee - ae_dim - (256, 128, 64)
Training Complete!
Benchmarking on MLP100


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.999539315700531


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.999539315700531


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 0.999539315700531
Benchmark Complete!
<class 'float'>
Training CTGAN: ee - ae_dim - (256, 128, 64, 32)
Training Complete!
Benchmarking on MLP100


  fake_df[col_name] = 0
  fake_df[col_name] = 0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0


  0%|          | 0/100 [00:00<?, ?it/s]

Best validation score: 1.0
Benchmark Complete!
<class 'float'>
{
    "vanilla": {
        "ae_dim": {
            "256_128": 0.7751842737197876,
            "256_128_64": 0.7592137455940247,
            "256_128_64_32": 0.7598280310630798
        },
        "embedding_dim": null,
        "gan_layers": null,
        "discriminator_steps": null,
        "pac": null,
        "gan_lr": null,
        "gan_batch_epoch": null,
        "ae_lr": null,
        "ae_batch_epoch": null
    },
    "denoising": {
        "ae_dim": {
            "256_128": 0.7714987993240356,
            "256_128_64": 0.30850738286972046,
            "256_128_64_32": 0.7624385952949524
        },
        "embedding_dim": null,
        "gan_layers": null,
        "discriminator_steps": null,
        "pac": null,
        "gan_lr": null,
        "gan_batch_epoch": null,
        "ae_lr": null,
        "ae_batch_epoch": null
    },
    "vae": {
        "ae_dim": {
            "256_128": 0.7592137455940247,
            "256

#### (2) Find the optimal G latent dimension

In [None]:
# Stage (2): with the best `ae_dim` from stage (1) fixed, train (or load a
# cached) AE-GAN for each candidate `embedding_dim`, benchmark it, and record
# the best score per candidate in hp_vals[ae_type]['embedding_dim'].
hp = 'embedding_dim'

for ae_type in AE_TYPES:
    hp_scores = dict()
    # Pick the highest-scoring stage-(1) key (e.g. "256_128_64") and parse it
    # back into a tuple of ints.
    ae_dim = tuple(map(int,
                       max(hp_vals[ae_type]['ae_dim'].items(),
                           key=lambda x: x[1]
                           )[0].split('_')))
    for v in HP[hp]:
        # NOTE(review): the checkpoint name does not encode the chosen
        # `ae_dim`, so a cached model trained with a different `ae_dim` would
        # be reused silently. Delete stale checkpoints if stage (1) changes.
        model_path = f"../../models/ae_gan_{ae_type}_{hp}_{v}.pth"

        if not os.path.exists(model_path):
            # No cached checkpoint: train the AE_GAN and save it.
            print(f"Training CTGAN: {ae_type} - {hp} - {v}")
            ae_gan = CTGANV2(ae_type=ae_type, ae_dim=ae_dim, embedding_dim=v)
            ae_gan.fit(td['train'], dt=dt, is_transformed=True, target_index=target_idx)
            print("Training Complete!")
            ae_gan.save(model_path)
        else:
            print(f"Loading CTGAN: {ae_type} - {hp} - {v}")
            ae_gan = CTGANV2().load(model_path)

        # Sample fake data from the fitted model.
        fake_data, real_data = sample_fake_adult(dt, td, ae_gan)

        print("Benchmarking on MLP100")
        # Benchmark three times and keep the best score.
        # Each score is converted with .item() as soon as it is returned and
        # compared against a float; previously `.item()` was called on
        # `best_score` after the loop, which raised AttributeError whenever
        # no run scored above the initial int 0.
        best_score = 0.0
        for _ in range(3):
            test_score = train_model(
                fake_data[0],
                fake_data[1],
                fake_data[2],
                fake_data[3],
                real_data[0],
                real_data[1],
                batch_size=256,
                num_epochs=100,
                model_type="classification",
                verbose=False,
                scorer_type="accuracy",
            ).item()
            if test_score > best_score:
                best_score = test_score
        print("Benchmark Complete!")
        # Keys are ints here but become strings once written to JSON.
        hp_scores[v] = best_score

    # Persist after each AE type so a long run can be resumed.
    hp_vals[ae_type][hp] = hp_scores
    with open("adult_opt_hp.json", 'w') as f:
        f.write(json.dumps(hp_vals))

#### (3) Find the optimal number of G/D layers

In [None]:
# Stage (3): with the best `ae_dim` (stage 1) and `embedding_dim` (stage 2)
# fixed, train (or load a cached) AE-GAN for each candidate generator /
# discriminator layer layout, benchmark it, and record the best score per
# candidate in hp_vals[ae_type]['gan_layers'].
hp = 'gan_layers'

for ae_type in AE_TYPES:
    hp_scores = dict()
    # Pick the highest-scoring stage-(1) key (e.g. "256_128_64") and parse it
    # back into a tuple of ints.
    ae_dim = tuple(map(int,
                       max(hp_vals[ae_type]['ae_dim'].items(),
                           key=lambda x: x[1]
                           )[0].split('_')))
    # max() over .items() returns a (key, score) pair; take the key before
    # converting. The key is an int in-session and a str after a JSON round
    # trip, and int() accepts both.
    embedding_dim = int(max(hp_vals[ae_type]['embedding_dim'].items(),
                            key=lambda x: x[1])[0])
    for v in HP[hp]:
        # Tuple -> "256_256"-style key; used both in the checkpoint file name
        # and as the key under which the score is stored in the results JSON.
        k = '_'.join(str(i) for i in v)
        # NOTE(review): the checkpoint name does not encode the chosen
        # `ae_dim` / `embedding_dim`, so a cached model trained with other
        # values would be reused silently.
        model_path = f"../../models/ae_gan_{ae_type}_{hp}_{k}.pth"

        if not os.path.exists(model_path):
            # No cached checkpoint: train the AE_GAN and save it.
            print(f"Training CTGAN: {ae_type} - {hp} - {v}")
            # Use the `ae_dim` selected above. The previous literal
            # `(256_128_64_32)` was a single int with digit separators, not
            # a tuple of layer sizes.
            ae_gan = CTGANV2(ae_type=ae_type, ae_dim=ae_dim,
                             embedding_dim=embedding_dim, generator_dim=v, discriminator_dim=v)
            ae_gan.fit(td['train'], dt=dt, is_transformed=True, target_index=target_idx)
            print("Training Complete!")
            ae_gan.save(model_path)
        else:
            print(f"Loading CTGAN: {ae_type} - {hp} - {v}")
            ae_gan = CTGANV2().load(model_path)

        # Sample fake data from the fitted model.
        fake_data, real_data = sample_fake_adult(dt, td, ae_gan)

        print("Benchmarking on MLP100")
        # Benchmark three times and keep the best score.
        # Each score is converted with .item() as soon as it is returned and
        # compared against a float; previously `.item()` was called on
        # `best_score` after the loop, which raised AttributeError whenever
        # no run scored above the initial int 0.
        best_score = 0.0
        for _ in range(3):
            test_score = train_model(
                fake_data[0],
                fake_data[1],
                fake_data[2],
                fake_data[3],
                real_data[0],
                real_data[1],
                batch_size=256,
                num_epochs=100,
                model_type="classification",
                verbose=False,
                scorer_type="accuracy",
            ).item()
            if test_score > best_score:
                best_score = test_score
        print("Benchmark Complete!")
        hp_scores[k] = best_score

    # Persist after each AE type so a long run can be resumed.
    hp_vals[ae_type][hp] = hp_scores
    with open("adult_opt_hp.json", 'w') as f:
        f.write(json.dumps(hp_vals))

#### (4) Find the optimal number of D steps and pac

#### (5) Find the optimal GAN LR, GAN BatchSize-Epoch, AE LR, AE BatchSize-Epoch