In [3]:
from datgan import DATGAN
import datgan

import numpy as np
import pandas as pd
import networkx as nx

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [4]:
# Load the LPMC trips table. index_col=False tells pandas not to use the
# first column of the file as the DataFrame index.
df = pd.read_csv(
    '../../data/LPMC/trips.csv',
    index_col=False,
)

In [5]:
# Preview the first rows of the loaded trips table.
df.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,travel_year,travel_month,travel_date,day_of_week,start_time_linear,...,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent,hh_vehicles,hh_borough,hh_income,hh_people,dur_pt_int
0,drive,HBO,Diesel_Car,full,1.0,2012,8,7,2,20.0,...,0.208611,1.5,0.57,0.0,0.098535,1,Bexley,35-50k,2,0.0
1,drive,HBW,Diesel_Car,full,1.0,2013,2,8,5,15.0,...,0.471944,3.0,1.62,0.0,0.354915,1,Harrow,5-10k,3,0.133333
2,pt,HBO,Average_Car,full,0.5,2014,10,8,3,14.0,...,0.238333,0.75,0.62,0.0,0.212121,0,Lambeth,50-75k,4,0.0
3,pt,HBE,Average_Car,dis,0.5,2014,3,10,1,10.5,...,0.308889,0.75,0.6,10.5,0.684353,0,Hackney,10-15k,2,0.0
4,walk,HBW,Petrol_Car,full,1.0,2013,1,24,4,16.833333,...,0.0775,1.5,0.19,0.0,0.046595,2,Lambeth,50-75k,5,0.0


In [6]:
# Per-column metadata handed to DATGAN.fit for the continuous variables.
#
# Keys used per entry (semantics are DATGAN's -- confirm against its docs):
#   'type'           : 'continuous' for every entry in this literal
#   'bounds'         : [lower, upper] range of the variable
#   'discrete'       : True for variables that only take integer-like values
#                      (presumably rounds the generated values -- TODO confirm)
#   'enforce_bounds' : optional; presumably forces samples to stay in 'bounds'
#   'apply_func'     : optional transform; log(x + 1) is used on the
#                      right-skewed distance / duration / cost columns
#
# np.inf is used for open upper bounds: the `np.infty` alias was removed in
# NumPy 2.0 and raises AttributeError there.
data_info = {
    'start_time_linear': {
        'type': 'continuous',
        'bounds': [0.0, 23.999],
        'discrete': False,
    },
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True
    },
    'distance': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': True,
        'apply_func': (lambda x: np.log(x + 1))
    },
    'dur_walking': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x + 1))
    },
    'dur_cycling': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x + 1))
    },
    'dur_pt_access': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x + 1))
    },
    'dur_pt_rail': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_bus': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_int': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_driving': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x + 1))
    },
    'cost_transit': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'cost_driving_fuel': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x + 1))
    },
    # NOTE(review): the upper bound is left open and 'enforce_bounds' is not
    # set here, although the previewed values look like fractions -- confirm.
    'driving_traffic_percent': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': False,
    },
}

# Every column of `df` that has no entry in `data_info` yet is declared
# categorical; existing entries are left untouched.
for column in df.columns:
    data_info.setdefault(column, {'type': 'categorical'})

In [7]:
# Hand-specified directed graph over the columns of the trips table, passed
# to DATGAN.fit below. Each tuple is an edge (source, target).
# The edge list keeps its original order; only the second, redundant listing
# of ("travel_date", "day_of_week") was dropped. A DiGraph stores an edge
# once, so the resulting graph is unchanged.
graph = nx.DiGraph()

graph.add_edges_from([
    # calendar variables
    ("travel_year", "travel_month"),
    ("travel_date", "day_of_week"),
    ("travel_month", "travel_date"),
    ("travel_month", "driving_traffic_percent"),
    ("travel_month", "day_of_week"),
    ("travel_month", "travel_mode"),
    ("day_of_week", "driving_traffic_percent"),
    ("day_of_week", "cost_driving_con_charge"),
    ("day_of_week", "purpose"),
    ("day_of_week", "start_time_linear"),
    ("day_of_week", "travel_mode"),
    # trip purpose and start time
    ("purpose", "distance"),
    ("purpose", "start_time_linear"),
    ("purpose", "travel_mode"),
    ("start_time_linear", "driving_traffic_percent"),
    ("start_time_linear", "cost_driving_con_charge"),
    ("start_time_linear", "travel_mode"),
    # vehicles, fuel and personal attributes
    ("hh_vehicles", "fueltype"),
    ("hh_vehicles", "driving_license"),
    ("hh_vehicles", "travel_mode"),
    ("fueltype", "cost_driving_con_charge"),
    ("fueltype", "cost_driving_fuel"),
    ("female", "driving_license"),
    ("female", "travel_mode"),
    ("age", "bus_scale"),
    ("age", "driving_license"),
    ("age", "faretype"),
    ("age", "travel_mode"),
    ("driving_license", "travel_mode"),
    ("faretype", "cost_transit"),
    ("faretype", "bus_scale"),
    ("faretype", "travel_mode"),
    ("bus_scale", "cost_transit"),
    # distance drives the durations and the fuel cost
    ("distance", "cost_driving_fuel"),
    ("distance", "dur_driving"),
    ("distance", "dur_walking"),
    ("distance", "dur_cycling"),
    ("distance", "dur_pt_access"),
    ("distance", "dur_pt_rail"),
    ("distance", "dur_pt_bus"),
    ("distance", "dur_pt_int"),
    ("distance", "pt_n_interchanges"),
    ("distance", "travel_mode"),
    ("pt_n_interchanges", "dur_pt_rail"),
    ("pt_n_interchanges", "dur_pt_bus"),
    ("pt_n_interchanges", "dur_pt_int"),
    ("pt_n_interchanges", "cost_transit"),
    # costs and durations feeding the mode choice
    ("driving_traffic_percent", "cost_driving_con_charge"),
    ("driving_traffic_percent", "travel_mode"),
    ("cost_driving_fuel", "cost_driving_con_charge"),
    ("cost_driving_fuel", "travel_mode"),
    ("cost_driving_con_charge", "travel_mode"),
    ("dur_driving", "travel_mode"),
    ("dur_walking", "travel_mode"),
    ("dur_cycling", "travel_mode"),
    ("dur_pt_access", "travel_mode"),
    ("dur_pt_rail", "cost_transit"),
    ("dur_pt_rail", "travel_mode"),
    ("dur_pt_bus", "cost_transit"),
    ("dur_pt_bus", "travel_mode"),
    ("dur_pt_int", "travel_mode"),
    ("cost_transit", "travel_mode"),
    # household attributes
    ("hh_borough", "hh_income"),
    ("hh_borough", "travel_mode"),
    ("hh_income", "hh_vehicles"),
    ("hh_income", "age"),
    ("hh_income", "hh_people"),
    ("hh_people", "age"),
    ("hh_people", "female")
])


In [8]:
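# Optional, left disabled: ask DATGAN for feedback on the graph (presumably a
# check of the DAG against the data -- confirm in the DATGAN docs).
# NOTE: this only works while `datgan` still refers to the imported module;
# the cells below rebind `datgan` to a DATGAN instance.
#datgan.advise(df, graph)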

# Vanilla DATGAN

In [None]:
# Run identifier for the vanilla model; it is inserted into the output folder path.
name = 'LPMC'

In [8]:
# Folder handed to DATGAN as its `output` location for this run.
output_folder = f'../output/{name}/'

In [17]:
# Vanilla DATGAN (no conditional inputs), configured with the 'WGGP' loss.
# NOTE(review): this rebinds `datgan`, which the import cell bound to the
# `datgan` module; module-level calls such as datgan.advise(...) are no longer
# reachable under that name after this cell runs.
# batch_size=1101 divides the 17616 rows reported by the sampling progress bar
# evenly (16 * 1101) -- presumably chosen to avoid a partial batch; confirm.
datgan = DATGAN(
    output=output_folder,
    loss_function='WGGP',
    batch_size=1101,
    num_epochs=1000,
)

In [10]:
# Train on the trips table with the column metadata and the hand-built graph.
# The logged output shows the data being preprocessed during this call.
datgan.fit(
    df,
    data_info,
    graph,
    preprocessed_data_path='../output/encoded_LPMC',
)

Preprocessing the data!
Encoding categorical variable "travel_mode"...
Encoding categorical variable "purpose"...
Encoding categorical variable "fueltype"...
Encoding categorical variable "faretype"...
Encoding categorical variable "bus_scale"...
Encoding categorical variable "travel_year"...
Encoding categorical variable "travel_month"...
Encoding categorical variable "travel_date"...
Encoding categorical variable "day_of_week"...
Encoding continuous variable "start_time_linear"...
Encoding continuous variable "age"...
Encoding categorical variable "female"...
Encoding categorical variable "driving_license"...
Encoding continuous variable "distance"...
Encoding continuous variable "dur_walking"...
Encoding continuous variable "dur_cycling"...
Encoding continuous variable "dur_pt_access"...
Encoding continuous variable "dur_pt_rail"...
Encoding continuous variable "dur_pt_bus"...
Encoding categorical variable "pt_n_interchanges"...
Encoding continuous variable "dur_driving"...
Encoding

Training DATGAN: 100%|██████████| 1000/1000 [1:06:32<00:00,  3.99s/it]

DATGAN has finished training (29/04/2022 12:52:57) - Training time: 01 hour, 06 minutes, and 33 seconds





In [11]:
# Draw as many synthetic rows as the original table has. The vanilla model
# takes no conditional `inputs`. The result is written without the index column.
samp = datgan.sample(len(df))
samp.to_csv(
    '../../data/synthetic/LPMC.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 17616/17616 [00:22<00:00, 783.18it/s]


# DATGAN with conditional inputs

In [None]:
# Run identifier for the conditional model; it is inserted into the output folder path.
name = 'LPMC_cond'

In [10]:
# Folder handed to DATGAN as its `output` location for this run.
output_folder = f'../output/{name}/'

In [13]:
# DATGAN with 'age', 'female' and 'hh_borough' declared as conditional inputs;
# the remaining settings match the vanilla run.
# NOTE(review): this rebinds `datgan` again, so the previously trained
# instance (and the imported `datgan` module) is no longer reachable under
# this name.
datgan = DATGAN(
    output=output_folder,
    loss_function='WGGP',
    conditional_inputs=['age', 'female', 'hh_borough'],
    batch_size=1101,
    num_epochs=1000,
)

In [16]:
# Train the conditional model. The logged output shows the preprocessed data
# being loaded during this call rather than re-encoded.
datgan.fit(
    df,
    data_info,
    graph,
    preprocessed_data_path='../output/encoded_LPMC',
)

Preprocessed data have been loaded!
Start training DATGAN with the WGGP loss (29/04/2022 13:44:05).


Training DATGAN: 100%|██████████| 1000/1000 [1:05:53<00:00,  3.95s/it]

DATGAN has finished training (29/04/2022 14:49:58) - Training time: 01 hour, 05 minutes, and 53 seconds





In [17]:
# Sample one synthetic row per original row, conditioning on the original
# table's values of the model's conditional input columns.
samp = datgan.sample(
    len(df),
    inputs=df[datgan.conditional_inputs],
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 17616/17616 [00:22<00:00, 786.86it/s]


In [21]:
# External table used as conditional inputs for the next sampling cell
# (100000 rows according to the sampling progress bar).
small = pd.read_csv('../../data/nomis/100k.csv')

In [24]:
# One synthetic row per row of `small`, conditioned on that table.
# NOTE(review): the whole frame is passed as `inputs`, unlike the earlier cell
# that selects df[datgan.conditional_inputs]; confirm `small` holds exactly
# the conditional input columns.
samp = datgan.sample(
    len(small),
    inputs=small,
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond_100k.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 100000/100000 [02:04<00:00, 805.94it/s]


In [25]:
# External table used as conditional inputs for the next sampling cell
# (1000000 rows according to the sampling progress bar).
large = pd.read_csv('../../data/nomis/1M.csv')

In [26]:
# One synthetic row per row of `large`, conditioned on that table.
# NOTE(review): the whole frame is passed as `inputs`, unlike the earlier cell
# that selects df[datgan.conditional_inputs]; confirm `large` holds exactly
# the conditional input columns.
samp = datgan.sample(
    len(large),
    inputs=large,
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond_1M.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 1000000/1000000 [21:48<00:00, 764.49it/s]
