In [1]:
import os
# Move the working directory two levels up; presumably so that the relative
# '../data/...' and '../output/...' paths used further down resolve -- TODO confirm.
# NOTE(review): a relative chdir is not idempotent -- re-running this cell in the
# same kernel moves up another two levels.
os.chdir('../..')

In [2]:
from platform import python_version
# Print the interpreter version so it is recorded in the cell output.
print(python_version())

3.7.9


In [3]:
import chevron
import sys
import os
import copy
import logging
import tensorflow as tf
import torch
import matplotlib.pyplot as plt

from distutils.dir_util import copy_tree

import pandas as pd
import networkx as nx

import time

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [4]:
# Snapshot of the names of all modules loaded at this point. The training loop
# passes it to delete_loaded_modules() to drop everything imported afterwards.
init_modules = copy.deepcopy(list(sys.modules.keys()))
# Only let TensorFlow log messages at ERROR level.
# NOTE(review): tf.logging looks like the TensorFlow 1.x API -- confirm the pinned TF version.
tf.logging.set_verbosity(tf.logging.ERROR)

In [5]:
class redirect_output(object):
    """Context manager redirecting ``sys.stdout`` and ``sys.stderr`` to a file.

    Parameters
    ----------
    out : str
        Path of the log file. It is opened in ``'w'`` mode when the ``with``
        block is entered, so an existing file is overwritten.

    Attributes
    ----------
    log : file object or None
        The open log file while inside the ``with`` block (closed afterwards);
        ``None`` before the block is entered.

    Usage::

        with redirect_output('training.log') as redirect:
            print('goes to the file')
    """

    def __init__(self, out=''):
        self.out = out
        # The file is opened lazily in __enter__ so that merely constructing the
        # manager neither truncates the file nor leaks an open handle.
        self.log = None

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

    def __enter__(self):
        # Open first: if this fails, the standard streams are left untouched.
        self.log = open(self.out, 'w')

        # Capture the streams that are active *now*, so __exit__ restores exactly
        # what was in place when the block was entered.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        sys.stdout = self.log
        sys.stderr = self.log
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the streams before closing the file so nothing can be written
        # to a closed handle. Returning None lets exceptions propagate.
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.log.close()

In [6]:
def delete_loaded_modules(init_modules):
    """Remove from ``sys.modules`` every module that is not in ``init_modules``.

    Parameters
    ----------
    init_modules : iterable of str
        Names of the modules to keep (typically a snapshot of
        ``sys.modules`` keys taken earlier).
    """
    keep = set(init_modules)  # constant-time membership tests

    # Iterate over a snapshot of the keys: sys.modules must not change size
    # while it is being iterated.
    for module_name in list(sys.modules):
        if module_name not in keep:
            sys.modules.pop(module_name, None)

In [7]:
def time_to_str(elapsed):
    """Format a duration given in seconds as a human-readable string.

    The result always ends with ``"SS seconds"`` (zero-padded to two digits).
    A ``"M minutes and "`` part is prepended when minutes or hours are
    positive, and an ``"H hours "`` part when hours are positive.
    """
    hrs, leftover = divmod(elapsed, 3600)
    mins, secs = divmod(leftover, 60)

    pieces = []
    if hrs > 0:
        pieces.append("{:0>1} hours ".format(int(hrs)))
    if mins > 0 or hrs > 0:
        pieces.append("{:0>1} minutes and ".format(int(mins)))
    pieces.append("{:0>2} seconds".format(int(secs)))

    return "".join(pieces)

In [8]:
def dag(dataset):
    """Build the hand-specified directed graph of variable dependencies for a dataset.

    Parameters
    ----------
    dataset : str
        ``'Chicago'`` or ``'LPMC'``.

    Returns
    -------
    networkx.DiGraph
        Graph whose nodes are column names and whose edges are
        ``(source, target)`` pairs. For any other ``dataset`` value the
        graph is returned empty.
    """

    # personalised graph
    graph = nx.DiGraph()

    # Use == (equality), not `is` (identity): `is` only matches when both strings
    # happen to be the same interned object, and emits a SyntaxWarning on Python 3.8+.
    if dataset == 'Chicago':
        graph.add_edges_from([
            ("age", "license"),
            ("age", "education_level"),
            ("gender", "work_status"),
            ("education_level", "work_status"),
            ("education_level", "hh_income"),
            ("work_status", "hh_income"),
            ("hh_income", "hh_descr"),
            ("hh_income", "hh_size"),
            ("hh_size", "hh_vehicles"),
            ("hh_size", "hh_bikes"),
            ("work_status", "trip_purpose"),
            ("trip_purpose", "departure_time"),
            ("trip_purpose", "distance"),
            ("travel_dow", "choice"),
            ("distance", "choice"),
            ("departure_time", "choice"),
            ("hh_vehicles", "choice"),
            ("hh_bikes", "choice"),
            ("license", "choice"),
            # Non-essential links
            ("education_level", "hh_size"),
            ("work_status", "hh_descr"),
            ("work_status", "hh_size"),
            ("hh_income", "hh_bikes"),
            ("hh_income", "hh_vehicles"),
            ("trip_purpose", "choice")
        ])
    elif dataset == 'LPMC':
        graph.add_edges_from([
            ("travel_year", "survey_year"),
            ("travel_date", "day_of_week"),
            ("day_of_week", "purpose"),
            ("purpose", "start_time_linear"),
            ("purpose", "cost_driving_con_charge"),
            ("purpose", "distance"),
            ("day_of_week", "driving_traffic_percent"),
            ("day_of_week", "cost_driving_con_charge"),
            ("start_time_linear", "driving_traffic_percent"),
            ("start_time_linear", "cost_driving_con_charge"),
            ("driving_traffic_percent", "cost_driving_con_charge"),
            ("female", "driving_license"),
            ("age", "bus_scale"),
            ("age", "car_ownership"),
            ("age", "driving_license"),
            ("age", "faretype"),
            ("driving_license", "car_ownership"),
            ("car_ownership", "fueltype"),
            ("fueltype", "cost_driving_con_charge"),
            ("fueltype", "cost_driving_fuel"),
            ("distance", "cost_driving_fuel"),
            ("distance", "dur_driving"),
            ("distance", "dur_walking"),
            ("distance", "dur_cycling"),
            ("distance", "dur_pt_access"),
            ("distance", "dur_pt_rail"),
            ("distance", "dur_pt_bus"),
            ("distance", "dur_pt_int"),
            ("dur_pt_bus", "cost_transit"),
            ("dur_pt_rail", "cost_transit"),
            ("pt_n_interchanges", "dur_pt_int"),
            ("pt_n_interchanges", "cost_transit"),
            ("faretype", "cost_transit"),
            ("bus_scale", "cost_transit"),
            ("car_ownership", "travel_mode"),
            ("age", "travel_mode"),
            ("cost_driving_con_charge", "travel_mode"),
            ("driving_traffic_percent", "travel_mode"),
            ("female", "travel_mode"),
            ("purpose", "travel_mode"),
            ("cost_transit", "travel_mode"),
            ("cost_driving_fuel", "travel_mode"),
            ("dur_driving", "travel_mode"),
            ("dur_walking", "travel_mode"),
            ("dur_cycling", "travel_mode"),
            ("dur_pt_access", "travel_mode"),
            ("dur_pt_rail", "travel_mode"),
            ("dur_pt_bus", "travel_mode"),
            ("dur_pt_int", "travel_mode")
        ])
        # travel_month has no edges, so it must be added explicitly to be part of the graph.
        graph.add_node("travel_month")

    return graph

In [9]:
def train_DATGAN(dataset, loss, name, nt='WI'):
    """Train a DATGAN model on ``dataset`` and save it under ``../output/<dataset>/<name>/``.

    Parameters
    ----------
    dataset : str
        ``'Chicago'`` or ``'LPMC'``; selects the CSV file, the continuous
        columns and the DAG passed to ``fit``.
    loss : str
        ``'WGAN'``, ``'SGAN'`` or ``'WGGP'``. Any other value falls back to
        the WGAN class and learning rate.
    name : str
        Name of the output sub-folder for this model.
    nt : str, optional
        Forwarded to the model as ``noisy_training``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)

    # Use == (equality), not `is` (identity): identity only matches by accident
    # of string interning and emits a SyntaxWarning on Python 3.8+.
    if dataset == 'Chicago':
        continuous_columns = ["distance", "age", "departure_time"]
    elif dataset == 'LPMC':
        continuous_columns = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
                              'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit',
                              'cost_driving_fuel', 'driving_traffic_percent']
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("Unknown dataset: {!r}".format(dataset))

    # The model class is imported inside the function; presumably so that it is
    # re-imported after the notebook purges sys.modules between runs -- TODO confirm.
    if loss == 'SGAN':
        from modules.datgan import DATSGAN as LIB
        lr = 1e-3
    elif loss == 'WGGP':
        from modules.datgan import DATWGANGP as LIB
        lr = 1e-4
    else:
        # 'WGAN' and any unrecognised loss name use the WGAN settings.
        from modules.datgan import DATWGAN as LIB
        lr = 2e-4

    output_folder = '../output/{}/{}/'.format(dataset, name)

    datgan = LIB(continuous_columns, max_epoch=1000, batch_size=500, output=output_folder, gpu=0,
                 learning_rate=lr, noisy_training=nt)

    datgan.fit(df, dag(dataset))

    datgan.save('trained', force=True)
    
def train_TGAN(dataset, name):
    """Train a TGAN model on ``dataset`` and save it to ``../output/<dataset>/<name>/trained.pickle``.

    Parameters
    ----------
    dataset : str
        ``'Chicago'`` or ``'LPMC'``; selects the CSV file and the positional
        indices of the continuous columns.
    name : str
        Name of the output sub-folder for this model.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)

    # Use == (equality), not `is` (identity): identity only matches by accident
    # of string interning and emits a SyntaxWarning on Python 3.8+.
    # Continuous columns are given as positional indices.
    if dataset == 'Chicago':
        continuous_columns = [3, 10, 14]
    elif dataset == 'LPMC':
        continuous_columns = [10, 11, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 27]
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("Unknown dataset: {!r}".format(dataset))

    from tgan.model import TGANModel

    output_folder = '../output/{}/{}/'.format(dataset, name)

    bs = 500
    # One pass over the data per epoch, with at least one step for tiny datasets.
    steps_per_epoch = max(len(df) // bs, 1)

    tgan = TGANModel(continuous_columns, max_epoch=1000, steps_per_epoch=steps_per_epoch,
                     batch_size=bs, output=output_folder, gpu=0)

    tgan.fit(df)

    tgan.save(output_folder + 'trained.pickle', force=True)
    
def train_CTGAN(dataset, name):
    """Train a CTGAN model on ``dataset`` and save it to ``../output/<dataset>/<name>/trained.pickle``.

    Parameters
    ----------
    dataset : str
        ``'Chicago'`` or ``'LPMC'``; selects the CSV file and the list of
        discrete column names.
    name : str
        Name of the output sub-folder for this model.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """

    df = pd.read_csv('../data/{}/data.csv'.format(dataset), index_col=False)

    # Use == (equality), not `is` (identity): identity only matches by accident
    # of string interning and emits a SyntaxWarning on Python 3.8+.
    if dataset == 'Chicago':
        discrete_columns = [
            'choice',
            'travel_dow',
            'trip_purpose',
            'hh_vehicles',
            'hh_size',
            'hh_bikes',
            'hh_descr',
            'hh_income',
            'gender',
            'license',
            'education_level',
            'work_status'
        ]
    elif dataset == 'LPMC':
        discrete_columns = [
            'travel_mode',
            'purpose',
            'fueltype',
            'faretype',
            'bus_scale',
            'survey_year',
            'travel_year',
            'travel_month',
            'travel_date',
            'day_of_week',
            'female',
            'driving_license',
            'car_ownership',
            'pt_n_interchanges',
            'cost_driving_con_charge'
        ]
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("Unknown dataset: {!r}".format(dataset))

    from ctgan import CTGANSynthesizer

    output_folder = '../output/{}/{}/'.format(dataset, name)
    # exist_ok=True: the folder may be left over from an interrupted run; without
    # it a retry would fail with FileExistsError before any training happens.
    os.makedirs(output_folder, exist_ok=True)

    ctgan = CTGANSynthesizer(verbose=True, cuda=True, batch_size=500)

    ctgan.fit(df, discrete_columns, epochs=1000)

    ctgan.save(output_folder + 'trained.pickle')
    
def is_a_DATGAN(name):
    """Return True unless ``name`` contains ``'TGAN'`` or ``'CTGAN'``."""
    baseline_tags = ('TGAN', 'CTGAN')
    return all(tag not in name for tag in baseline_tags)
    
def check_already_trained(dataset, name):
    """Tell whether a saved model file exists for ``name`` under ``../output/<dataset>/``.

    Returns True when either ``trained.tar.gz`` or ``trained.pickle`` is a
    regular file in the model's output folder, False otherwise.
    """
    candidate_templates = (
        '../output/{}/{}/trained.tar.gz',
        '../output/{}/{}/trained.pickle',
    )
    return any(os.path.isfile(template.format(dataset, name)) for template in candidate_templates)

In [10]:
# Experiment configuration.
dataset = 'Chicago'
n_models = 5
reuse_data = False

# Base model names: the two baselines plus every DATGAN loss / noisy-training combination.
base_models = ['TGAN', 'CTGAN']
base_models.extend(
    '{}_{}'.format(loss_name, noise_mode)
    for loss_name in ['WGAN', 'SGAN', 'WGGP']
    for noise_mode in ['WI', 'OR', 'WO']
)

# One numbered replica (suffix _01 ... _NN) per base model, in alphabetical order.
models = sorted(
    base_name + '_{:0>2d}'.format(run_index + 1)
    for run_index in range(n_models)
    for base_name in base_models
)

In [11]:
# Train every model in `models` that does not already have a saved model file.
for i, m in enumerate(models):
    
    # Skip models whose saved file is already present in the output folder.
    if check_already_trained(dataset, m):
        print("Model \033[1m{}\033[0m ({}/{}) has already been trained.".format(m, i+1, len(models)))
        
    else:
    
        print("\rTraining model \033[1m{}\033[0m ({}/{}) ... ".format(m, i+1, len(models)), end="")
        
        # Start from a clean TensorFlow graph, then drop every module imported since the
        # `init_modules` snapshot was taken.
        # Cannot delete tensorflow modules sadly =(
        tf.reset_default_graph()

        delete_loaded_modules(init_modules)

        # Optionally copy the 'data' folder of the 'WGAN_WI' model into this model's folder.
        # NOTE(review): the names built in this notebook carry a numeric suffix
        # (e.g. 'WGAN_WI_01'); confirm that '../output/<dataset>/WGAN_WI/data' exists.
        if reuse_data and is_a_DATGAN(m):
            copy_tree('../output/{}/{}/data'.format(dataset, 'WGAN_WI'), '../output/{}/{}/data'.format(dataset, m))

        start_time = time.time()

        # 'CTGAN' must be tested before 'TGAN' because 'TGAN' is a substring of 'CTGAN'.
        # stdout/stderr of each training run go to 'training.log'.
        if 'CTGAN' in m:
            with redirect_output('training.log'):
                train_CTGAN(dataset, m)
        elif 'TGAN' in m:
            with redirect_output('training.log'):
                train_TGAN(dataset, m)
        elif is_a_DATGAN(m):
            # Model names look like '<loss>_<noisy-training>_<run>': the first two
            # fields are passed as the `loss` and `nt` arguments.
            with redirect_output('training.log'):
                train_DATGAN(dataset, m.split('_')[0], m, m.split('_')[1])

        elapsed = time.time() - start_time

        time_taken = time_to_str(elapsed)

        print("Done in {}.".format(time_taken))

        # Close and detach all handlers of the 'tensorpack' logger; presumably to
        # release its log files before the next model is trained -- TODO confirm.
        for handler in logging.getLogger('tensorpack').handlers:
            handler.close()

        logging.getLogger('tensorpack').handlers = []

print("FINISHED!")

Model [1mCTGAN_01[0m (1/55) has already been trained.
Model [1mCTGAN_02[0m (2/55) has already been trained.
Model [1mCTGAN_03[0m (3/55) has already been trained.
Model [1mCTGAN_04[0m (4/55) has already been trained.
Model [1mCTGAN_05[0m (5/55) has already been trained.
Model [1mSGAN_OR_01[0m (6/55) has already been trained.
Model [1mSGAN_OR_02[0m (7/55) has already been trained.
Model [1mSGAN_OR_03[0m (8/55) has already been trained.
Model [1mSGAN_OR_04[0m (9/55) has already been trained.
Model [1mSGAN_OR_05[0m (10/55) has already been trained.
Model [1mSGAN_WI_01[0m (11/55) has already been trained.
Model [1mSGAN_WI_02[0m (12/55) has already been trained.
Model [1mSGAN_WI_03[0m (13/55) has already been trained.
Model [1mSGAN_WI_04[0m (14/55) has already been trained.
Model [1mSGAN_WI_05[0m (15/55) has already been trained.
Model [1mSGAN_WO_01[0m (16/55) has already been trained.
Model [1mSGAN_WO_02[0m (17/55) has already been trained.
Model [1mSGAN_W