In [1]:
import os

# Move the working directory two levels up, but only relative to where the
# kernel started. A bare `os.chdir('../..')` climbs two MORE levels every time
# this cell is re-executed, which silently breaks the relative paths used by
# the cells below.
if 'NOTEBOOK_START_DIR' not in globals():
    # First execution in this kernel: remember the starting directory.
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, '../..'))

In [2]:
import numpy as np
import pandas as pd

import ctgan
from ctgan import CTGANSynthesizer

# Notebook-only setup (IPython magics; these lines are not valid plain Python):
#   %matplotlib inline      -> draw matplotlib figures inside the notebook
#   %reload_ext autoreload  -> (re)load the autoreload extension
#   %autoreload 2           -> reload edited modules before running each cell
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
# Name of the dataset to synthesise. Later cells use it to build the input CSV
# path, to pick the discrete-column list (branches exist for 'Chicago' and
# 'LPMC'), and to build the output locations.
dataset = 'LPMC'

In [4]:
# Read the original records for the selected dataset. `index_col=False` tells
# pandas not to use the first CSV column as the index.
df = pd.read_csv(
    '../data/{}/data.csv'.format(dataset),
    index_col=False,
)

In [5]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,travel_year,travel_month,travel_date,day_of_week,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_n_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent
0,drive,HBO,Petrol_Car,full,1.0,1,2012,4,1,7,...,0.134444,0.0,0.016667,0.0,0,0.052222,1.5,0.14,0.0,0.111702
1,drive,HBO,Petrol_Car,full,1.0,1,2012,4,1,7,...,0.109444,0.0,0.055556,0.0,0,0.059444,1.5,0.15,0.0,0.11215
2,drive,HBO,Petrol_Car,full,1.0,1,2012,4,1,7,...,0.203056,0.0,0.210278,0.0,0,0.236667,1.5,0.79,0.0,0.203052
3,drive,HBO,Petrol_Car,full,1.0,1,2012,4,1,7,...,0.205556,0.0,0.258611,0.0,0,0.233333,1.5,0.78,0.0,0.160714
4,drive,HBO,Petrol_Car,dis,1.0,1,2012,4,1,7,...,0.203056,0.0,0.189444,0.0,0,0.229167,1.5,0.78,0.0,0.130909


In [6]:
# Columns handed to `fit` as the discrete (categorical) columns; every other
# column of `df` is left out of this list. The list depends on the dataset.
#
# Strings are compared with `==`. The `is` operator tests object identity and
# only appears to work for string literals because of interpreter interning
# (Python 3.8+ emits a SyntaxWarning for it).
if dataset == 'Chicago':
    discrete_columns = [
        'choice',
        'travel_dow',
        'trip_purpose',
        'hh_vehicles',
        'hh_size',
        'hh_bikes',
        'hh_descr',
        'hh_income',
        'gender',
        'license',
        'education_level',
        'work_status'
    ]
elif dataset == 'LPMC':
    discrete_columns = [
        'travel_mode',
        'purpose',
        'fueltype',
        'faretype',
        'bus_scale',
        'survey_year',
        'travel_year',
        'travel_month',
        'travel_date',
        'day_of_week',
        'female',
        'driving_license',
        'car_ownership',
        'pt_n_interchanges',
        'cost_driving_con_charge'
    ]
else:
    # Fail here rather than leaving `discrete_columns` undefined, or stale
    # from an earlier run of this cell with a different dataset.
    raise ValueError(
        'No discrete column list defined for dataset {!r}'.format(dataset)
    )

In [7]:
# Folder holding this dataset's CTGAN artefacts. The trailing slash matters:
# later cells append a file name by plain string concatenation.
output_folder = ''.join(['../output/', dataset, '/CTGAN/'])

# Train

In [9]:
# Build an untrained synthesizer.
# NOTE: this rebinds the name `ctgan`, which until this cell referred to the
# module imported at the top of the notebook.
ctgan = CTGANSynthesizer(
    batch_size=200,
    cuda=True,
    verbose=True,
)

In [11]:
%%time
# Train the synthesizer on the real data, passing `discrete_columns` as the
# list of discrete columns. `%%time` has to remain the first line of the cell
# to be recognised as a cell magic, so comments go below it.
ctgan.fit(df, discrete_columns, epochs=5)



Epoch 1, Loss G: -1.1299,Loss D: -0.1475
Epoch 2, Loss G: -1.7544,Loss D: -0.1426
Epoch 3, Loss G: -1.5050,Loss D: -0.7093
Epoch 4, Loss G: -1.1806,Loss D:  0.1803
Epoch 5, Loss G: -0.8280,Loss D: -0.4476
Wall time: 3min 46s


In [13]:
# Persist the trained synthesizer; the "Load & Sample" section reads this same
# file back so sampling can be repeated without retraining.
ctgan.save(output_folder + 'trained.pkl')

In [12]:
# Draw as many synthetic rows as there are rows in the original data.
samples = ctgan.sample(df.shape[0])

In [None]:
# Round the generated `age` values to whole numbers for the Chicago dataset.
# `==` compares string contents; `is` only compares object identity and is not
# a reliable way to test a string value.
if dataset == 'Chicago':
    # Assign through `[]` so the column is always the target of the write,
    # never a plain attribute on the DataFrame object.
    samples['age'] = np.round(samples['age'])

In [16]:
# Destination CSV for this dataset's CTGAN samples; the DataFrame index is not
# written to the file.
output_synth = ''.join(['../synth_data/', dataset, '/CTGAN.csv'])
samples.to_csv(
    output_synth,
    index=False,
)

# Load & Sample

In [8]:
# Reload the synthesizer from the file written by the "Train" section.
# NOTE(review): `ctgan` is rebound to a fresh, untrained instance here; the
# object used for sampling below is `model`, the value returned by `load`.
# NOTE(review): the '.pkl' file name suggests pickle-style deserialisation —
# only load files produced by a trusted run; confirm against the CTGAN docs.
ctgan = CTGANSynthesizer()
model = ctgan.load(output_folder + 'trained.pkl')

In [9]:
# Destination CSV for the samples drawn from the reloaded model.
output_synth = ''.join(['../synth_data/', dataset, '/CTGAN.csv'])

In [10]:
# Draw as many synthetic rows from the reloaded model as the original data has.
samples = model.sample(df.shape[0])

In [11]:
# Display the freshly sampled frame (before any post-processing).
samples

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,travel_year,travel_month,travel_date,day_of_week,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_n_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent
0,drive,HBE,Diesel_Car,full,0.0,1,2013,2,10,4,...,0.353446,-0.002789,-0.001942,0.432463,4,0.082981,0.014894,0.627597,0.0,0.315386
1,pt,HBE,Petrol_Car,full,1.0,1,2014,5,7,2,...,0.130895,0.406518,0.161757,0.105037,2,0.176589,5.325004,0.357698,10.5,0.350559
2,pt,HBE,Petrol_Car,full,1.0,1,2014,4,27,1,...,0.110946,0.271397,-0.004728,-0.000223,1,0.409215,3.920611,0.293013,0.0,0.717953
3,walk,HBO,Average_Car,free,1.0,1,2014,12,22,6,...,0.340151,-0.001795,0.099688,0.000115,0,0.616288,6.337212,0.270679,0.0,0.199271
4,pt,HBE,Petrol_Car,full,1.0,2,2015,3,26,2,...,0.135442,-0.002098,0.079470,0.059338,1,0.193803,0.029249,0.307566,0.0,0.200688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81081,pt,HBE,Petrol_Car,full,1.0,2,2012,5,16,3,...,0.206813,0.016143,-0.005335,0.003809,1,0.301628,7.383692,0.236376,0.0,0.595222
81082,drive,HBO,Average_Car,16+,1.0,3,2013,4,8,4,...,0.094609,0.000911,0.055642,-0.000123,0,0.098227,1.483080,0.300675,10.5,0.328507
81083,pt,HBO,Petrol_Car,full,1.0,1,2013,3,2,1,...,0.353700,0.343675,0.250927,0.065706,2,0.115635,5.503579,0.511108,0.0,0.212260
81084,pt,HBO,Average_Car,dis,1.0,3,2013,11,8,2,...,0.182997,-0.000008,-0.006378,-0.002461,0,0.117478,0.011907,1.142688,0.0,0.169982


In [12]:
# Round the generated `age` values to whole numbers. The rest of this section
# works for any dataset, so only touch the column when the sampled frame
# actually has one instead of failing on attribute access.
if 'age' in samples.columns:
    # Assign through `[]` so the column is always the target of the write.
    samples['age'] = np.round(samples['age'])

In [13]:
# Display the samples again after the `age` rounding step.
# NOTE(review): the saved output under this cell shows different columns from
# the earlier `samples` display — it looks like a leftover from a run with
# another dataset; re-run the section to refresh it.
samples

Unnamed: 0,choice,travel_dow,trip_purpose,distance,hh_vehicles,hh_size,hh_bikes,hh_descr,hh_income,gender,age,license,education_level,work_status,departure_time
0,pt,2,HOME_OTHER,3.235251,2,4,5,2,5,0,2,0.0,2,FTE,8.645555
1,drive,4,HOME_WORK,5.375124,2,2,0,1,6,0,39,1.0,6,FTE,10.993568
2,pt,6,ESCORT_TRANSFER,5.950628,3,5,1,1,7,0,10,1.0,1,Retired,9.475652
3,drive,4,SHOPPING,-0.200294,2,4,3,3,7,1,0,1.0,2,FTE,13.563149
4,drive,3,ESCORT_TRANSFER,0.362917,3,4,0,1,7,0,58,1.0,5,Retired,5.971349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87941,drive,5,SHOPPING,22.145845,2,1,0,1,2,0,51,1.0,3,FTE,14.062827
87942,drive,2,SHOPPING,3.594238,1,3,3,1,7,1,70,1.0,3,PTE,14.343722
87943,drive,4,SHOPPING,2.434216,4,2,0,1,7,1,61,1.0,3,FTE,14.002752
87944,passenger,3,HOME_OTHER,5.527949,3,1,2,1,1,0,46,1.0,2,FTE,11.450231


In [14]:
# Write the post-processed samples to `output_synth`, omitting the index.
samples.to_csv(output_synth, index=False)