In [1]:
import os
# Remember the directory the kernel started in, so that re-running this cell
# does not climb two more levels each time (a bare relative chdir is not
# idempotent and silently breaks every relative path used further down).
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, '..', '..'))

In [2]:
from platform import python_version
# Record the interpreter version this notebook is executed with.
print(python_version())

3.7.9


In [3]:
import numpy as np
import pandas as pd
import json
import sys
import pickle

import matplotlib.pyplot as plt

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [4]:
# ---- Run configuration ------------------------------------------------------
dataset = 'adult'
n_models = 5  # number of numbered copies ('_01', '_02', ...) expected per model
n_data = 5    # number of synthetic datasets drawn from each copy

# Candidate model list for the chosen dataset.
if 'adult' in dataset:
    models = ['TGAN', 'CTGAN', 'TVAE', 'CTABGAN', 'WGGP_WI', 'WGAN_WI', 'LINEAR']
else:
    models = ['TGAN', 'CTGAN', 'CTABGAN', 'TVAE']
    models += ['{}_{}'.format(loss, variant)
               for loss in ['WGAN', 'SGAN', 'WGGP']
               for variant in ['WI', 'OR', 'WO']]

# Manual override: this run only samples from the LINEAR model, whatever the
# list built above contains.
models = ['LINEAR']

# Expand every model name into its numbered copies, in alphabetical order.
if n_models > 1:
    models = sorted(base + '_{:0>2d}'.format(copy_idx + 1)
                    for copy_idx in range(n_models)
                    for base in models)

In [5]:
# Load the dataset's data.csv. Further down only its length is used, to decide
# how many rows each synthetic sample should contain.
df = pd.read_csv('../data/' + dataset + '/data.csv')

In [6]:
# Trained models are read from <input_folder>/<model name>/ ...
input_folder = '../output/' + dataset + '/'
# ... and the sampled synthetic CSV files are written into <output_folder>.
output_folder = '../synth_data/' + dataset + '/'

In [7]:
def is_a_DATGAN(name):
    """Tell whether `name` designates a DATGAN-style model.

    A name is treated as DATGAN-style when it contains none of the baseline
    model names ('TGAN', 'CTGAN', 'TVAE', 'CTABGAN').

    Parameters
    ----------
    name : str
        Model name (or any string) to test.

    Returns
    -------
    bool
        False if any baseline name is a substring of `name`, True otherwise.
    """
    baselines = ('TGAN', 'CTGAN', 'TVAE', 'CTABGAN')
    return not any(tag in name for tag in baselines)
    
def round_some_vars(samples, dataset):
    """Round selected columns of `samples` with ``np.round``, in place.

    Which columns are rounded depends on the first keyword found in `dataset`
    (checked in this order):

    * 'Chicago' -> age
    * 'LPMC'    -> age, distance
    * 'adult'   -> age, capital-gain, capital-loss, hours-per-week

    Any other dataset name leaves `samples` untouched.

    Parameters
    ----------
    samples : table-like object holding the columns listed above
        (in this notebook, whatever the models' sampling methods return).
    dataset : str
        Dataset name used to pick the columns.

    Returns
    -------
    The same `samples` object that was passed in.
    """
    # Columns updated through attribute access vs. through item access. The
    # split mirrors how each column is addressed (names containing '-' cannot
    # be attributes).
    if 'Chicago' in dataset:
        via_attribute, via_item = ('age',), ()
    elif 'LPMC' in dataset:
        via_attribute, via_item = ('age', 'distance'), ()
    elif 'adult' in dataset:
        via_attribute, via_item = (), ('age', 'capital-gain', 'capital-loss', 'hours-per-week')
    else:
        via_attribute, via_item = (), ()

    for column in via_attribute:
        setattr(samples, column, np.round(getattr(samples, column)))

    for column in via_item:
        samples[column] = np.round(samples[column])

    return samples

In [8]:
def _synth_csv_path(model_name, k, samp_type=None):
    """Return the CSV path for the k-th (0-based) sample drawn from a model.

    The file name is ``<model_name>[_<samp_type>][_<NN>].csv`` inside the
    notebook-level ``output_folder``. The two-digit counter ``NN`` (k + 1) is
    only added when the notebook-level ``n_data`` differs from 1.
    """
    parts = [model_name]
    if samp_type is not None:
        parts.append(samp_type)
    if n_data != 1:
        parts.append('{:0>2d}'.format(k + 1))
    return output_folder + '_'.join(parts) + '.csv'


def _load_model(m, name):
    """Load the trained model stored under ``input_folder + m``.

    Parameters
    ----------
    m : str
        Full model identifier; also the sub-folder name (e.g. 'LINEAR_01').
    name : str
        Model family, i.e. the part of `m` before the first underscore.

    Returns
    -------
    The loaded model object. Its type depends on the library that trained it.

    Notes
    -----
    The libraries are imported lazily so that only the one needed for the
    requested family has to be importable.
    """
    pickle_path = input_folder + m + '/trained.pickle'

    # 'CTGAN' must be tested before 'TGAN': the latter is a substring of the former.
    if 'CTGAN' in name:
        from ctgan import CTGANSynthesizer
        # Use a throw-away instance instead of rebinding the name `ctgan`,
        # which would shadow the module of the same name.
        return CTGANSynthesizer().load(pickle_path)

    if 'TGAN' in name:
        from tgan.model import TGANModel
        return TGANModel.load(pickle_path)

    if 'TVAE' in name:
        from ctgan import TVAESynthesizer
        return TVAESynthesizer().load(pickle_path)

    if 'CTABGAN' in name:
        # The imported name is not used directly; presumably the import is
        # needed so that pickle can resolve the class - TODO confirm.
        from CTABGAN.model.ctabgan import CTABGAN  # noqa: F401
        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # load model files produced by your own training runs.
        with open(pickle_path, 'rb') as handle:
            return pickle.load(handle)

    # Everything else is a DATGAN variant; the family name selects the class,
    # with DATWGAN as the fallback.
    if any(x in name for x in ('WGANGP', 'WGGP')):
        from modules.datgan import DATWGANGP as LIB
    elif 'SGAN' in name:
        from modules.datgan import DATSGAN as LIB
    else:
        from modules.datgan import DATWGAN as LIB

    return LIB.load(input_folder + m + '/', 'trained')


# Sampling types requested from DATGAN-style models. They depend only on the
# dataset, so they are computed once instead of inside the loops.
if any(x in dataset for x in ['Chicago', 'LPMC']):
    samp_types = ['NO', 'BO', 'OD', 'OC']
else:
    samp_types = ['NO']

# Make sure the destination exists; to_csv does not create missing folders.
os.makedirs(output_folder, exist_ok=True)

# Every synthetic dataset gets as many rows as the reference data.
n_rows = len(df)

for i, m in enumerate(models):

    name = m.split('_')[0]

    print("Sampling synthetic data from model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

    model = _load_model(m, name)

    for k in range(n_data):

        if is_a_DATGAN(name):
            for samp_type in samp_types:
                samples = model.sample(n_rows, samp_type)
                samples = round_some_vars(samples, dataset)

                # LINEAR files carry no sampling-type tag in their name.
                # NOTE(review): with more than one sampling type this means
                # each type overwrites the previous file - confirm LINEAR is
                # only ever used with a single sampling type.
                tag = None if 'LINEAR' in m else samp_type
                samples.to_csv(_synth_csv_path(m, k, tag), index=False)

        elif 'CTABGAN' in name:
            # CTABGAN output is written as generated, without rounding.
            samples = model.generate_samples()
            samples.to_csv(_synth_csv_path(m, k), index=False)

        else:
            samples = model.sample(n_rows)
            samples = round_some_vars(samples, dataset)
            samples.to_csv(_synth_csv_path(m, k), index=False)

print("\033[1mFINISHED!\033[0m")

Sampling synthetic data from model [1mLINEAR_01[0m (1/5)














Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
[32m[0103 15:06:13 @DATSGANModel.py:213][0m [91mCreating cell for age (in-edges: 0; ancestors: 0)

Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/age/FC input: [500, 100]

Instructions for updating:
Please use `layer.__call__` method instead.
[32m[0103 15:06:13 @registry.py:134][0m gen/LSTM/age/FC output: [500, 50]
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/age/FC_val input: [500, 50]
[32m[0103 15:06:13 @registry.py:134][0m gen/LSTM/age/FC_val output: [500, 6]
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/age/FC_prob input: [500, 50]
[32m[0103 15:06:13 @registry.py:134][0m g

[32m[0103 15:06:13 @DATSGANModel.py:213][0m [91mCreating cell for occupation (in-edges: 1; ancestors: 5)
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/occupation/FC input: [500, 100]
[32m[0103 15:06:13 @registry.py:134][0m gen/LSTM/occupation/FC output: [500, 50]
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/occupation/FC_prob input: [500, 50]
[32m[0103 15:06:13 @registry.py:134][0m gen/LSTM/occupation/FC_prob output: [500, 14]
[32m[0103 15:06:13 @registry.py:126][0m gen/LSTM/occupation/FC_input input: [500, 14]
[32m[0103 15:06:14 @registry.py:134][0m gen/LSTM/occupation/FC_input output: [500, 100]
[32m[0103 15:06:14 @DATSGANModel.py:213][0m [91mCreating cell for relationship (in-edges: 1; ancestors: 6)
[32m[0103 15:06:14 @registry.py:126][0m gen/LSTM/relationship/FC input: [500, 100]
[32m[0103 15:06:14 @registry.py:134][0m gen/LSTM/relationship/FC output: [500, 50]
[32m[0103 15:06:14 @registry.py:126][0m gen/LSTM/relationship/FC_prob input: [500, 50]
[32

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[32m[0103 15:06:15 @sessinit.py:114][0m Restoring checkpoint from ../output/adult/LINEAR_01/model\model-134999 ...
INFO:tensorflow:Restoring parameters from ../output/adult/LINEAR_01/model\model-134999
[32m[0103 15:06:15 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,105.77it/s]


[32m[0103 15:06:17 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,166.26it/s]


[32m[0103 15:06:17 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,167.71it/s]


[32m[0103 15:06:18 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,167.71it/s]


[32m[0103 15:06:19 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,164.66it/s]


Sampling synthetic data from model [1mLINEAR_02[0m (2/5)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for age (in-edges: 0; ancestors: 0)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for workclass (in-edges: 1; ancestors: 1)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for education (in-edges: 1; ancestors: 2)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for educational-num (in-edges: 1; ancestors: 3)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for marital-status (in-edges: 1; ancestors: 4)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for occupation (in-edges: 1; ancestors: 5)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for relationship (in-edges: 1; ancestors: 6)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for race (in-edges: 1; ancestors: 7)
[32m[0103 15:06:25 @DATSGANModel.py:213][0m [91mCreating cell for gender (in-e

|                                                                                         |90/?[00:00<00:00,122.90it/s]


[32m[0103 15:06:28 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,167.24it/s]


[32m[0103 15:06:29 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,157.78it/s]


[32m[0103 15:06:30 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,155.62it/s]


[32m[0103 15:06:30 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,162.34it/s]


Sampling synthetic data from model [1mLINEAR_03[0m (3/5)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for age (in-edges: 0; ancestors: 0)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for workclass (in-edges: 1; ancestors: 1)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for education (in-edges: 1; ancestors: 2)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for educational-num (in-edges: 1; ancestors: 3)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for marital-status (in-edges: 1; ancestors: 4)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for occupation (in-edges: 1; ancestors: 5)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for relationship (in-edges: 1; ancestors: 6)
[32m[0103 15:06:37 @DATSGANModel.py:213][0m [91mCreating cell for race (in-edges: 1; ancestors: 7)
[32m[0103 15:06:38 @DATSGANModel.py:213][0m [91mCreating cell for gender (in-e

|                                                                                          |90/?[00:01<00:00,85.23it/s]


[32m[0103 15:06:41 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,120.33it/s]


[32m[0103 15:06:43 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,122.85it/s]


[32m[0103 15:06:44 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,122.52it/s]


[32m[0103 15:06:45 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,119.53it/s]


Sampling synthetic data from model [1mLINEAR_04[0m (4/5)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for age (in-edges: 0; ancestors: 0)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for workclass (in-edges: 1; ancestors: 1)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for education (in-edges: 1; ancestors: 2)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for educational-num (in-edges: 1; ancestors: 3)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for marital-status (in-edges: 1; ancestors: 4)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for occupation (in-edges: 1; ancestors: 5)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for relationship (in-edges: 1; ancestors: 6)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for race (in-edges: 1; ancestors: 7)
[32m[0103 15:06:53 @DATSGANModel.py:213][0m [91mCreating cell for gender (in-e

|                                                                                          |90/?[00:01<00:00,84.32it/s]


[32m[0103 15:06:57 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,119.69it/s]


[32m[0103 15:06:58 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,117.22it/s]


[32m[0103 15:07:00 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,117.98it/s]


[32m[0103 15:07:01 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,116.09it/s]


Sampling synthetic data from model [1mLINEAR_05[0m (5/5)
[32m[0103 15:07:08 @DATSGANModel.py:213][0m [91mCreating cell for age (in-edges: 0; ancestors: 0)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for workclass (in-edges: 1; ancestors: 1)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for education (in-edges: 1; ancestors: 2)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for educational-num (in-edges: 1; ancestors: 3)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for marital-status (in-edges: 1; ancestors: 4)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for occupation (in-edges: 1; ancestors: 5)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for relationship (in-edges: 1; ancestors: 6)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for race (in-edges: 1; ancestors: 7)
[32m[0103 15:07:09 @DATSGANModel.py:213][0m [91mCreating cell for gender (in-e

|                                                                                          |90/?[00:01<00:00,85.75it/s]


[32m[0103 15:07:13 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,121.70it/s]


[32m[0103 15:07:14 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,118.60it/s]


[32m[0103 15:07:15 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,127.78it/s]


[32m[0103 15:07:16 @DATSGAN.py:267][0m Loading Preprocessor!


|                                                                                         |90/?[00:00<00:00,123.77it/s]


[1mFINISHED![0m
