In [1]:
import os
# Move the working directory two levels up; the relative '../data/...' and
# '../output/...' paths used further down are resolved against the result.
# NOTE(review): this is a relative chdir, so re-running this cell without a
# kernel restart moves two more levels up each time.
os.chdir('../..')

In [2]:
from platform import python_version
# Print the interpreter version the notebook is executed with.
print(python_version())

3.7.9


In [3]:
import chevron
import sys
import os
import copy
import logging
import tensorflow as tf
import torch
import matplotlib.pyplot as plt
import pickle

from distutils.dir_util import copy_tree

import pandas as pd
import networkx as nx

import time

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [4]:
# Snapshot of the module names present in sys.modules at this point; it is
# passed to delete_loaded_modules() before each training run so that modules
# imported afterwards can be dropped again.
init_modules = copy.deepcopy(list(sys.modules.keys()))
# Only let TensorFlow log messages of level ERROR (uses the tf.logging API).
tf.logging.set_verbosity(tf.logging.ERROR)

In [5]:
class redirect_output(object):
    """Context manager redirecting ``sys.stdout`` and ``sys.stderr`` to a file.

    The target file is opened in write mode (truncating it) when the ``with``
    block is entered, and closed when the block is left. The streams that were
    active on entry are restored on exit, whether or not an exception occurred;
    exceptions raised inside the block are propagated.

    Parameters
    ----------
    out : str
        Path of the file receiving everything written to stdout/stderr.

    Attributes
    ----------
    log : file object or None
        The open log file while inside the ``with`` block, ``None`` before entry.
    old_stdout, old_stderr :
        The streams that get restored on exit.
    """

    def __init__(self, out=''):
        self.out = out
        # The file is opened lazily in __enter__ so that an instance which is
        # created but never entered neither truncates the file nor leaks a handle.
        self.log = None

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

    def __enter__(self):
        # Open first: if this fails the standard streams are left untouched.
        self.log = open(self.out, 'w')

        # Remember the streams active *now*, not the ones seen at construction.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        sys.stdout = self.log
        sys.stderr = self.log

        # Returning self makes `with redirect_output(path) as r:` usable.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the streams before closing so that a failing close() cannot
        # leave stdout/stderr pointing at a dead file.
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.log.close()

In [6]:
def delete_loaded_modules(init_modules):
    """Remove from ``sys.modules`` every module whose name is not in `init_modules`.

    Parameters
    ----------
    init_modules : iterable of str
        Names of the modules that must stay registered.
    """
    # A set gives constant-time membership tests (the caller passes a list).
    keep = set(init_modules)

    # list(...) snapshots the keys so entries can be dropped while looping.
    for module_name in list(sys.modules):
        if module_name not in keep:
            # pop() tolerates an entry that disappeared in the meantime.
            sys.modules.pop(module_name, None)

In [7]:
def time_to_str(elapsed):
    """Format a duration in seconds as a human-readable string.

    The seconds part is always present (zero-padded to two digits). The
    minutes part is added when the duration has a non-zero minutes or hours
    component, and the hours part only when hours are non-zero.
    """
    hrs, leftover = divmod(elapsed, 3600)
    mins, secs = divmod(leftover, 60)

    pieces = []
    if hrs > 0:
        pieces.append("{:0>1} hours ".format(int(hrs)))
    if hrs > 0 or mins > 0:
        pieces.append("{:0>1} minutes and ".format(int(mins)))
    pieces.append("{:0>2} seconds".format(int(secs)))

    return "".join(pieces)

In [8]:
def is_a_DATGAN(name):
    """Return True unless `name` contains one of the non-DATGAN model tags.

    The tags checked are 'TGAN', 'CTGAN', 'TVAE' and 'CTABGAN' (substring match).
    """
    other_model_tags = ('TGAN', 'CTGAN', 'TVAE', 'CTABGAN')
    for tag in other_model_tags:
        if tag in name:
            return False
    return True
    
def check_already_trained(dataset, name):
    """Tell whether a saved model exists for `name` on `dataset`.

    Looks for either 'trained.tar.gz' or 'trained.pickle' inside
    '../output/<dataset>/<name>/' (relative to the current working directory).
    """
    artefact_templates = (
        '../output/{}/{}/trained.tar.gz',
        '../output/{}/{}/trained.pickle',
    )
    return any(os.path.isfile(template.format(dataset, name)) for template in artefact_templates)

In [9]:
def dag(dataset):
    """Return the hand-specified directed graph linking the columns of `dataset`.

    The dataset is recognised by substring ('Chicago', 'LPMC' or 'adult',
    checked in that order). For any other dataset name an empty graph is
    returned.
    """
    chicago_edges = [
        ("age", "license"),
        ("age", "education_level"),
        ("gender", "work_status"),
        ("education_level", "work_status"),
        ("education_level", "hh_income"),
        ("work_status", "hh_income"),
        ("hh_income", "hh_descr"),
        ("hh_income", "hh_size"),
        ("hh_size", "hh_vehicles"),
        ("hh_size", "hh_bikes"),
        ("work_status", "trip_purpose"),
        ("trip_purpose", "departure_time"),
        ("trip_purpose", "distance"),
        ("travel_dow", "choice"),
        ("distance", "choice"),
        ("departure_time", "choice"),
        ("hh_vehicles", "choice"),
        ("hh_bikes", "choice"),
        ("license", "choice"),
        # Links removed when doing trans red
        ("education_level", "hh_size"),
        ("work_status", "hh_descr"),
        ("work_status", "hh_size"),
        ("hh_income", "hh_bikes"),
        ("hh_income", "hh_vehicles"),
        ("trip_purpose", "choice"),
    ]

    lpmc_edges = [
        ("travel_year", "travel_month"),
        ("travel_date", "day_of_week"),
        ("travel_month", "travel_date"),
        ("travel_month", "driving_traffic_percent"),
        ("travel_month", "day_of_week"),
        ("travel_month", "travel_mode"),
        # Same edge as the second entry of this list; listed twice in the
        # original specification and kept as-is.
        ("travel_date", "day_of_week"),
        ("day_of_week", "driving_traffic_percent"),
        ("day_of_week", "cost_driving_con_charge"),
        ("day_of_week", "purpose"),
        ("day_of_week", "start_time_linear"),
        ("day_of_week", "travel_mode"),
        ("purpose", "distance"),
        ("purpose", "start_time_linear"),
        ("purpose", "travel_mode"),
        ("start_time_linear", "driving_traffic_percent"),
        ("start_time_linear", "cost_driving_con_charge"),
        ("start_time_linear", "travel_mode"),
        ("car_ownership", "fueltype"),
        ("car_ownership", "driving_license"),
        ("car_ownership", "travel_mode"),
        ("fueltype", "cost_driving_con_charge"),
        ("fueltype", "cost_driving_fuel"),
        ("female", "driving_license"),
        ("female", "travel_mode"),
        ("age", "bus_scale"),
        ("age", "driving_license"),
        ("age", "faretype"),
        ("age", "travel_mode"),
        ("driving_license", "travel_mode"),
        ("faretype", "cost_transit"),
        ("faretype", "bus_scale"),
        ("faretype", "travel_mode"),
        ("bus_scale", "cost_transit"),
        ("distance", "cost_driving_fuel"),
        ("distance", "dur_driving"),
        ("distance", "dur_walking"),
        ("distance", "dur_cycling"),
        ("distance", "dur_pt_access"),
        ("distance", "dur_pt_rail"),
        ("distance", "dur_pt_bus"),
        ("distance", "dur_pt_int"),
        ("distance", "pt_n_interchanges"),
        ("distance", "travel_mode"),
        ("pt_n_interchanges", "dur_pt_rail"),
        ("pt_n_interchanges", "dur_pt_bus"),
        ("pt_n_interchanges", "dur_pt_int"),
        ("pt_n_interchanges", "cost_transit"),
        ("driving_traffic_percent", "cost_driving_con_charge"),
        ("driving_traffic_percent", "travel_mode"),
        ("cost_driving_fuel", "cost_driving_con_charge"),
        ("cost_driving_fuel", "travel_mode"),
        ("cost_driving_con_charge", "travel_mode"),
        ("dur_driving", "travel_mode"),
        ("dur_walking", "travel_mode"),
        ("dur_cycling", "travel_mode"),
        ("dur_pt_access", "travel_mode"),
        ("dur_pt_rail", "cost_transit"),
        ("dur_pt_rail", "travel_mode"),
        ("dur_pt_bus", "cost_transit"),
        ("dur_pt_bus", "travel_mode"),
        ("dur_pt_int", "travel_mode"),
        ("cost_transit", "travel_mode"),
    ]

    adult_edges = [
        ("age", "marital-status"),
        ("age", "education"),
        ("age", "income"),
        ("age", "occupation"),
        ("gender", "marital-status"),
        ("gender", "education"),
        ("gender", "income"),
        ("native-country", "race"),
        ("native-country", "marital-status"),
        ("native-country", "education"),
        ("marital-status", "relationship"),
        ("relationship", "occupation"),
        ("race", "education"),
        ("race", "income"),
        ("race", "occupation"),
        ("education", "income"),
        ("education", "educational-num"),
        ("education", "occupation"),
        ("educational-num", "income"),
        ("occupation", "hours-per-week"),
        ("occupation", "workclass"),
        ("occupation", "capital-gain"),
        ("occupation", "capital-loss"),
        ("occupation", "income"),
        ("workclass", "hours-per-week"),
        ("workclass", "capital-gain"),
        ("workclass", "capital-loss"),
        ("hours-per-week", "income"),
        ("capital-gain", "income"),
        ("capital-loss", "income"),
    ]

    # Checked in this order; the first keyword found in the dataset name wins.
    edges_by_keyword = (
        ('Chicago', chicago_edges),
        ('LPMC', lpmc_edges),
        ('adult', adult_edges),
    )

    # personalised graph
    graph = nx.DiGraph()
    for keyword, edges in edges_by_keyword:
        if keyword in dataset:
            graph.add_edges_from(edges)
            break

    return graph

In [13]:
def train_DATGAN(dataset, loss, name, nt='WI'):
    """Train a DATGAN model on `dataset` and save it under '../output/<dataset>/<name>/'.

    Parameters
    ----------
    dataset : str
        Dataset name; must contain 'Chicago', 'LPMC', 'adult' or 'covertype'.
        The data is read from '../data/<dataset>/data.csv'.
    loss : str
        'WGAN', 'SGAN' or 'WGGP'; selects DATWGAN, DATSGAN or DATWGANGP from
        `modules.datgan`.
    name : str
        Name of the output sub-folder. If it contains 'LINEAR' (or the dataset
        is covertype) a chain graph following the column order is used instead
        of the graph returned by `dag`.
    nt : str
        Forwarded to the model as `noisy_training`.

    Raises
    ------
    ValueError
        If `dataset` or `loss` is not supported. Raised before any file is
        read or any model code is imported.
    """
    if 'Chicago' in dataset:
        continuous_columns = ["distance", "age", "departure_time"]
    elif 'LPMC' in dataset:
        continuous_columns = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
                              'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit',
                              'cost_driving_fuel', 'driving_traffic_percent']
    elif 'adult' in dataset:
        continuous_columns = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    elif 'covertype' in dataset:
        continuous_columns = ['Elevation', 'Aspect', 'Slope',
                              'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
                              'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
                              'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError("Unsupported dataset '{}': expected a name containing "
                         "'Chicago', 'LPMC', 'adult' or 'covertype'.".format(dataset))

    supported_losses = ('WGAN', 'SGAN', 'WGGP')
    if loss not in supported_losses:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("Unsupported loss '{}': expected one of {}.".format(loss, supported_losses))

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)

    # Imported lazily: the training loop purges freshly imported modules
    # between runs, so the model code is re-imported for every training.
    if loss == 'WGAN':
        from modules.datgan import DATWGAN as LIB
    elif loss == 'SGAN':
        from modules.datgan import DATSGAN as LIB
    else:
        from modules.datgan import DATWGANGP as LIB

    output_folder = '../output/{}/{}/'.format(dataset, name)

    datgan = LIB(continuous_columns, max_epoch=1000, batch_size=500,
                 output=output_folder, gpu=0, noisy_training=nt)

    if 'LINEAR' in name or 'covertype' in dataset:
        # Chain graph: each column points to the next one in file order.
        graph = nx.DiGraph()
        graph.add_edges_from(list(zip(df.columns[:-1], df.columns[1:])))
    else:
        graph = dag(dataset)

    datgan.fit(df, graph)

    datgan.save('trained', force=True)
    
def train_TGAN(dataset, name):
    """Train a TGAN model on `dataset` and save it as '../output/<dataset>/<name>/trained.pickle'.

    Parameters
    ----------
    dataset : str
        Dataset name; must contain 'Chicago', 'LPMC', 'adult' or 'covertype'.
        The data is read from '../data/<dataset>/data.csv'.
    name : str
        Name of the output sub-folder.

    Raises
    ------
    ValueError
        If `dataset` is not supported. Raised before any file is read.
    """
    # Integer identifiers of the continuous columns handed to TGANModel
    # (presumably positions in data.csv -- TODO confirm against the data files).
    if 'Chicago' in dataset:
        continuous_columns = [3, 10, 14]
    elif 'LPMC' in dataset:
        continuous_columns = [9, 10, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26]
    elif 'adult' in dataset:
        continuous_columns = [0, 9, 10, 11]
    elif 'covertype' in dataset:
        continuous_columns = list(range(10))
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError("Unsupported dataset '{}': expected a name containing "
                         "'Chicago', 'LPMC', 'adult' or 'covertype'.".format(dataset))

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)

    # Imported lazily: the training loop purges freshly imported modules
    # between runs, so the model code is re-imported for every training.
    from tgan.model import TGANModel

    output_folder = '../output/{}/{}/'.format(dataset, name)

    bs = 500
    # One epoch covers the data once; at least one step for tiny datasets.
    steps_per_epoch = max(len(df) // bs, 1)

    tgan = TGANModel(continuous_columns, max_epoch=1000, steps_per_epoch=steps_per_epoch,
                     batch_size=bs, output=output_folder, gpu=0)

    tgan.fit(df)

    tgan.save(output_folder + 'trained.pickle', force=True)
    
# Continuous columns of the covertype dataset; every other column of that
# dataset is treated as discrete.
_COVERTYPE_CONTINUOUS = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points'
]

# Discrete columns per dataset keyword, shared by the CTGAN, TVAE and CTABGAN trainers.
_DISCRETE_COLUMNS = {
    'Chicago': [
        'choice',
        'travel_dow',
        'trip_purpose',
        'hh_vehicles',
        'hh_size',
        'hh_bikes',
        'hh_descr',
        'hh_income',
        'gender',
        'license',
        'education_level',
        'work_status'
    ],
    'LPMC': [
        'travel_mode',
        'purpose',
        'fueltype',
        'faretype',
        'bus_scale',
        'travel_year',
        'travel_month',
        'travel_date',
        'day_of_week',
        'female',
        'driving_license',
        'car_ownership',
        'pt_n_interchanges',
        'cost_driving_con_charge'
    ],
    'adult': [
        'race',
        'workclass',
        'income',
        'marital-status',
        'relationship',
        'gender',
        'education',
        'native-country',
        'occupation',
        'educational-num'
    ],
}

# Dataset-specific CTABGAN arguments (everything else is common to all datasets).
_CTABGAN_SETTINGS = {
    'Chicago': {
        'mixed_columns': {},
        'integer_columns': ['age'],
        'problem_type': {'Classification': 'choice'},
    },
    'LPMC': {
        'mixed_columns': {'dur_pt_rail': [0.0], 'dur_pt_bus': [0.0], 'dur_pt_int': [0.0], 'cost_transit': [0.0, 1.5]},
        'integer_columns': ['age', 'distance'],
        'problem_type': {'Classification': 'travel_mode'},
    },
    'adult': {
        'mixed_columns': {'capital-loss': [0.0], 'capital-gain': [0.0]},
        'integer_columns': ['age', 'capital-gain', 'capital-loss', 'hours-per-week'],
        'problem_type': {'Classification': 'income'},
    },
    'covertype': {
        'mixed_columns': {},
        'integer_columns': _COVERTYPE_CONTINUOUS,
        'problem_type': {'Classification': 'Cover_Type'},
    },
}


def _dataset_key(dataset):
    """Return the first supported keyword contained in `dataset`, or raise ValueError."""
    for key in ('Chicago', 'LPMC', 'adult', 'covertype'):
        if key in dataset:
            return key
    raise ValueError("Unsupported dataset '{}': expected a name containing "
                     "'Chicago', 'LPMC', 'adult' or 'covertype'.".format(dataset))


def _discrete_columns(key, df):
    """Return a fresh list of the discrete column names for dataset keyword `key`."""
    if key == 'covertype':
        # Keep the DataFrame's column order so the result is deterministic
        # (a set difference would yield an arbitrary order).
        return [col for col in df.columns if col not in _COVERTYPE_CONTINUOUS]
    return list(_DISCRETE_COLUMNS[key])


def train_CTGAN(dataset, name):
    """Train a CTGAN model on `dataset` and save it as '../output/<dataset>/<name>/trained.pickle'.

    Parameters
    ----------
    dataset : str
        Dataset name; must contain 'Chicago', 'LPMC', 'adult' or 'covertype'.
        The data is read from '../data/<dataset>/data.csv'.
    name : str
        Name of the output sub-folder (created if missing).

    Raises
    ------
    ValueError
        If `dataset` is not supported. Raised before any file is read.
    """
    key = _dataset_key(dataset)

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)
    discrete_columns = _discrete_columns(key, df)

    # Imported lazily: the training loop purges freshly imported modules between runs.
    from ctgan import CTGANSynthesizer

    output_folder = '../output/{}/{}/'.format(dataset, name)
    # exist_ok: a previous run that failed after creating the folder must not block a retry.
    os.makedirs(output_folder, exist_ok=True)

    ctgan = CTGANSynthesizer(verbose=True, cuda=True, batch_size=500)

    ctgan.fit(df, discrete_columns, epochs=1000)

    ctgan.save(output_folder + 'trained.pickle')


def train_TVAE(dataset, name):
    """Train a TVAE model on `dataset` and save it as '../output/<dataset>/<name>/trained.pickle'.

    Parameters
    ----------
    dataset : str
        Dataset name; must contain 'Chicago', 'LPMC', 'adult' or 'covertype'.
        The data is read from '../data/<dataset>/data.csv'.
    name : str
        Name of the output sub-folder (created if missing).

    Raises
    ------
    ValueError
        If `dataset` is not supported. Raised before any file is read.
    """
    key = _dataset_key(dataset)

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)
    discrete_columns = _discrete_columns(key, df)

    # Imported lazily: the training loop purges freshly imported modules between runs.
    from ctgan import TVAESynthesizer

    output_folder = '../output/{}/{}/'.format(dataset, name)
    # exist_ok: a previous run that failed after creating the folder must not block a retry.
    os.makedirs(output_folder, exist_ok=True)

    tvae = TVAESynthesizer(epochs=1000, cuda=True, batch_size=500)

    tvae.fit(df, discrete_columns)

    tvae.save(output_folder + 'trained.pickle')


def train_CTABGAN(dataset, name):
    """Train a CTABGAN model on `dataset` and pickle it to '../output/<dataset>/<name>/trained.pickle'.

    CTABGAN receives the CSV path itself ('../data/<dataset>/data.csv'); the
    DataFrame read here is only used to derive the covertype discrete columns.

    Parameters
    ----------
    dataset : str
        Dataset name; must contain 'Chicago', 'LPMC', 'adult' or 'covertype'.
    name : str
        Name of the output sub-folder (created if missing).

    Raises
    ------
    ValueError
        If `dataset` is not supported. Raised before any file is read.
    """
    key = _dataset_key(dataset)

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)
    discrete_columns = _discrete_columns(key, df)

    # Imported lazily: the training loop purges freshly imported modules between runs.
    from CTABGAN.model.ctabgan import CTABGAN

    output_folder = '../output/{}/{}/'.format(dataset, name)
    # exist_ok: a previous run that failed after creating the folder must not block a retry.
    os.makedirs(output_folder, exist_ok=True)

    # Deep copy so the synthesizer can never mutate the shared settings table.
    settings = copy.deepcopy(_CTABGAN_SETTINGS[key])

    synthesizer = CTABGAN(raw_csv_path='../data/{}/data.csv'.format(dataset),
                          test_ratio=0.2,
                          categorical_columns=discrete_columns,
                          log_columns=[],
                          mixed_columns=settings['mixed_columns'],
                          integer_columns=settings['integer_columns'],
                          problem_type=settings['problem_type'],
                          epochs=1000)

    synthesizer.fit()

    with open(output_folder + 'trained.pickle', 'wb') as handle:
        pickle.dump(synthesizer, handle)

In [11]:
# Dataset to train on and number of repetitions of every model.
dataset = 'adult'
n_models = 5

# Base list of model names for the chosen dataset.
if 'adult' in dataset:
    models = ['TGAN', 'CTGAN', 'TVAE', 'CTABGAN', 'WGGP_WI', 'WGAN_WI', 'LINEAR']
elif 'covertype' in dataset:
    models = ['TGAN', 'CTGAN', 'TVAE', 'CTABGAN']
else:
    # Other datasets additionally get every <loss>_<variant> combination.
    models = ['TGAN', 'CTGAN', 'TVAE', 'CTABGAN'] + [
        '{}_{}'.format(loss_tag, variant_tag)
        for loss_tag in ['WGAN', 'WGGP', 'SGAN']
        for variant_tag in ['WI', 'OR', 'WO']
    ]

reuse_data = False

# With several repetitions, suffix each model name with a two-digit run
# number ('_01', '_02', ...) and order the resulting names alphabetically.
if n_models > 1:
    models = sorted(
        model_name + '_{:0>2d}'.format(run_index + 1)
        for run_index in range(n_models)
        for model_name in models
    )

In [17]:
# Train every model of `models` on `dataset`, skipping those for which a saved
# model already exists. Output produced during training goes to 'training.log'.
for i, m in enumerate(models):
    
    if check_already_trained(dataset, m):
        print("Model \033[1m{}\033[0m ({}/{}) has already been trained.".format(m, i+1, len(models)))
        
    else:
    
        print("\rTraining model \033[1m{}\033[0m ({}/{}) ... ".format(m, i+1, len(models)), end="")
        
        # Cannot delete tensorflow modules sadly =(
        # so the default TF graph is reset instead.
        tf.reset_default_graph()

        # Drop every module imported since the `init_modules` snapshot so the
        # trainers below re-import their model code.
        delete_loaded_modules(init_modules)

        # Optionally copy the 'data' folder of the 'WGAN_WI' run into this
        # model's output folder (only for DATGAN models).
        if reuse_data and is_a_DATGAN(m):
            copy_tree('../output/{}/{}/data'.format(dataset, 'WGAN_WI'), '../output/{}/{}/data'.format(dataset, m))

        start_time = time.time()

        # The order of the tests matters: 'TGAN' is a substring of 'CTGAN',
        # so 'CTGAN' has to be checked first.
        # redirect_output opens 'training.log' in write mode, so each model
        # overwrites the log of the previous one.
        if 'CTGAN' in m:
            with redirect_output('training.log'):
                train_CTGAN(dataset, m)
        elif 'TGAN' in m:
            with redirect_output('training.log'):
                train_TGAN(dataset, m)
        elif 'TVAE' in m:
            with redirect_output('training.log'):
                train_TVAE(dataset, m) 
        elif 'CTABGAN' in m:
            with redirect_output('training.log'):
                train_CTABGAN(dataset, m)   
        elif is_a_DATGAN(m):
            with redirect_output('training.log'):
                if 'LINEAR' in m:
                    # LINEAR models use the 'WGAN' loss and the default `nt`.
                    train_DATGAN(dataset, 'WGAN', m)
                else:
                    # Model names look like '<loss>_<nt>[_<run>]'.
                    train_DATGAN(dataset, m.split('_')[0], m, m.split('_')[1])

        elapsed = time.time() - start_time

        time_taken = time_to_str(elapsed)

        print("Done in {}.".format(time_taken))

        # Close and remove the handlers attached to the 'tensorpack' logger
        # (presumably so log files of the finished run are released -- TODO confirm).
        for handler in logging.getLogger('tensorpack').handlers:
            handler.close()

        logging.getLogger('tensorpack').handlers = []

print("FINISHED!")

Model [1mCTABGAN_01[0m (1/35) has already been trained.
Model [1mCTABGAN_02[0m (2/35) has already been trained.
Model [1mCTABGAN_03[0m (3/35) has already been trained.
Model [1mCTABGAN_04[0m (4/35) has already been trained.
Model [1mCTABGAN_05[0m (5/35) has already been trained.
Model [1mCTGAN_01[0m (6/35) has already been trained.
Model [1mCTGAN_02[0m (7/35) has already been trained.
Model [1mCTGAN_03[0m (8/35) has already been trained.
Model [1mCTGAN_04[0m (9/35) has already been trained.
Model [1mCTGAN_05[0m (10/35) has already been trained.
Training model [1mLINEAR_01[0m (11/35) ... Done in 41 minutes and 25 seconds.
Training model [1mLINEAR_02[0m (12/35) ... Done in 41 minutes and 15 seconds.
Training model [1mLINEAR_03[0m (13/35) ... Done in 41 minutes and 03 seconds.
Training model [1mLINEAR_04[0m (14/35) ... Done in 41 minutes and 32 seconds.
Training model [1mLINEAR_05[0m (15/35) ... Done in 41 minutes and 19 seconds.
Model [1mTGAN_01[0m (16/35)