In [1]:
import os

import networkx as nx
import numpy as np
import pandas as pd

import datgan
from datgan import DATGAN

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
# Load the LPMC trips table; index_col=False keeps pandas from using the first CSV column as the index
df = pd.read_csv('../../data/LPMC/trips.csv', index_col=False)

In [3]:
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,travel_year,travel_month,travel_date,day_of_week,start_time_linear,...,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent,hh_vehicles,hh_borough,hh_income,hh_people,dur_pt_int
0,drive,HBO,Diesel_Car,full,1.0,2012,8,7,2,20.0,...,0.208611,1.5,0.57,0.0,0.098535,1,Bexley,35-50k,2,0.0
1,drive,HBW,Diesel_Car,full,1.0,2013,2,8,5,15.0,...,0.471944,3.0,1.62,0.0,0.354915,1,Harrow,5-10k,3,0.133333
2,pt,HBO,Average_Car,full,0.5,2014,10,8,3,14.0,...,0.238333,0.75,0.62,0.0,0.212121,0,Lambeth,50-75k,4,0.0
3,pt,HBE,Average_Car,dis,0.5,2014,3,10,1,10.5,...,0.308889,0.75,0.6,10.5,0.684353,0,Hackney,10-15k,2,0.0
4,walk,HBW,Petrol_Car,full,1.0,2013,1,24,4,16.833333,...,0.0775,1.5,0.19,0.0,0.046595,2,Lambeth,50-75k,5,0.0


In [4]:
# First, define the specificities of continuous variables.
# Each entry is the per-column description later passed to DATGAN.fit().
# NOTE(review): the key meanings below are assumptions to confirm against the
# DATGAN documentation:
#   - 'bounds': [lower, upper] range allowed for the variable,
#   - 'enforce_bounds': presumably keeps generated values inside the bounds,
#   - 'discrete': presumably marks variables whose values must be integers,
#   - 'apply_func': transformation applied to the column (here log(x + 1)).
# Open upper bounds use np.inf: the np.infty alias was removed in NumPy 2.0
# and raises AttributeError there.
data_info = {
    'start_time_linear': {
        'type': 'continuous',
        'bounds': [0.0, 23.999],
        'discrete': False,
    },
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True,
    },
    'distance': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': True,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'dur_walking': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'dur_cycling': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'dur_pt_access': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'dur_pt_rail': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_bus': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_int': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_driving': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'cost_transit': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'cost_driving_fuel': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    'driving_traffic_percent': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': False,
    },
}

# Every column without an explicit description above is declared categorical;
# setdefault leaves the continuous entries untouched.
for col in df.columns:
    data_info.setdefault(col, {'type': 'categorical'})

In [5]:
# Personalised graph, written as a parent -> children mapping.
# Parents, and the children of each parent, are listed in the order in which
# the directed edges (parent, child) are inserted into the DiGraph.
children_of = {
    "travel_year": ["travel_month"],
    "travel_date": ["day_of_week"],
    "travel_month": [
        "travel_date",
        "driving_traffic_percent",
        "day_of_week",
        "travel_mode",
    ],
    "day_of_week": [
        "driving_traffic_percent",
        "cost_driving_con_charge",
        "purpose",
        "start_time_linear",
        "travel_mode",
    ],
    "purpose": ["distance", "start_time_linear", "travel_mode"],
    "start_time_linear": [
        "driving_traffic_percent",
        "cost_driving_con_charge",
        "travel_mode",
    ],
    "hh_vehicles": ["fueltype", "driving_license", "travel_mode"],
    "fueltype": ["cost_driving_con_charge", "cost_driving_fuel"],
    "female": ["driving_license", "travel_mode"],
    "age": ["bus_scale", "driving_license", "faretype", "travel_mode"],
    "driving_license": ["travel_mode"],
    "faretype": ["cost_transit", "bus_scale", "travel_mode"],
    "bus_scale": ["cost_transit"],
    "distance": [
        "cost_driving_fuel",
        "dur_driving",
        "dur_walking",
        "dur_cycling",
        "dur_pt_access",
        "dur_pt_rail",
        "dur_pt_bus",
        "dur_pt_int",
        "pt_n_interchanges",
        "travel_mode",
    ],
    "pt_n_interchanges": [
        "dur_pt_rail",
        "dur_pt_bus",
        "dur_pt_int",
        "cost_transit",
    ],
    "driving_traffic_percent": ["cost_driving_con_charge", "travel_mode"],
    "cost_driving_fuel": ["cost_driving_con_charge", "travel_mode"],
    "cost_driving_con_charge": ["travel_mode"],
    "dur_driving": ["travel_mode"],
    "dur_walking": ["travel_mode"],
    "dur_cycling": ["travel_mode"],
    "dur_pt_access": ["travel_mode"],
    "dur_pt_rail": ["cost_transit", "travel_mode"],
    "dur_pt_bus": ["cost_transit", "travel_mode"],
    "dur_pt_int": ["travel_mode"],
    "cost_transit": ["travel_mode"],
    "hh_borough": ["hh_income", "travel_mode"],
    "hh_income": ["hh_vehicles", "age", "hh_people"],
    "hh_people": ["age", "female"],
}

graph = nx.DiGraph()
graph.add_edges_from(
    (parent, child)
    for parent, children in children_of.items()
    for child in children
)


In [7]:
# Name of this run; it is interpolated into the output folder path
name = 'LPMC_cond'

In [8]:
# Run-specific folder, handed to DATGAN as its `output` argument
output_folder = '../output/{}/'.format(name)

In [9]:
# Instantiate the model with 'age', 'female' and 'hh_borough' declared as
# conditional inputs.
# NOTE(review): this assignment rebinds the name `datgan`, shadowing the
# `datgan` module imported in the first cell; consider renaming the instance.
datgan = DATGAN(output=output_folder,
                loss_function='WGGP',
                conditional_inputs=['age', 'female', 'hh_borough'],
                batch_size=1101,
                num_epochs=1000)

In [None]:
# Train on the trips table using the variable descriptions and the graph defined above.
# The recorded output shows previously preprocessed data being loaded; presumably
# it is read from preprocessed_data_path -- TODO confirm against the DATGAN docs.
datgan.fit(df, data_info, graph, preprocessed_data_path='../output/encoded_LPMC')

Preprocessed data have been loaded!
Start training DATGAN with the WGGP loss (04/05/2022 13:47:40).


Training DATGAN:  51%|█████     | 512/1000 [33:41<31:21,  3.86s/it]  

In [11]:
# Generate five synthetic datasets of len(df) rows each, passing the observed
# values of the model's conditional inputs, and write each one to its own CSV.
test_dir = '../../data/synthetic/test/'
# Create the target folder up front so the sampling work is not lost to a
# missing-directory error when the CSV is written.
os.makedirs(test_dir, exist_ok=True)

for i in range(5):
    samp = datgan.sample(len(df), inputs=df[datgan.conditional_inputs], randomize=False)
    samp.to_csv('{}ciDATGAN_{:02d}.csv'.format(test_dir, i+1), index=False)

Sampling from DATGAN: 100%|██████████| 16904/16904 [00:23<00:00, 717.82it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:21<00:00, 797.91it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:21<00:00, 773.76it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:20<00:00, 821.44it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:21<00:00, 769.14it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:20<00:00, 823.22it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:23<00:00, 727.25it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:20<00:00, 826.43it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:21<00:00, 772.81it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:20<00:00, 835.58it/s]


## All the boroughs

In [10]:
# Generate one synthetic dataset per borough found in the trips data.
# For each borough, the matching file in ../../data/nomis/ is passed as the
# conditional inputs and one synthetic row is requested per row of that file.
boroughs = df.hh_borough.unique()  # computed once instead of on every iteration
borough_dir = '../../data/synthetic/ciDATGAN/'
# Create the target folder before the multi-minute sampling runs so that
# writing a result cannot fail on a missing directory.
os.makedirs(borough_dir, exist_ok=True)

for count, r in enumerate(boroughs, start=1):

    tmp = pd.read_csv('../../data/nomis/{}.csv'.format(r))

    print('Sampling for {} ({}/{})'.format(r, count, len(boroughs)))

    samp = datgan.sample(len(tmp), inputs=tmp, randomize=False)

    samp.to_csv('{}{}.csv'.format(borough_dir, r), index=False)

Sampling for Bexley (1/33)


Sampling from DATGAN: 100%|██████████| 231997/231997 [04:48<00:00, 803.48it/s]


Sampling for Harrow (2/33)


Sampling from DATGAN: 100%|██████████| 239056/239056 [04:50<00:00, 823.59it/s]


Sampling for Lambeth (3/33)


Sampling from DATGAN: 100%|██████████| 303086/303086 [06:09<00:00, 820.08it/s]


Sampling for Hackney (4/33)


Sampling from DATGAN: 100%|██████████| 246270/246270 [04:57<00:00, 828.92it/s]


Sampling for Bromley (5/33)


Sampling from DATGAN: 100%|██████████| 309392/309392 [06:11<00:00, 833.25it/s]


Sampling for Haringey (6/33)


Sampling from DATGAN: 100%|██████████| 254926/254926 [05:05<00:00, 834.86it/s]


Sampling for Hounslow (7/33)


Sampling from DATGAN: 100%|██████████| 253957/253957 [05:03<00:00, 836.39it/s]


Sampling for Tower Hamlets (8/33)


Sampling from DATGAN: 100%|██████████| 254096/254096 [05:04<00:00, 834.39it/s]


Sampling for Richmond upon Thames (9/33)


Sampling from DATGAN: 100%|██████████| 186990/186990 [03:49<00:00, 816.13it/s]


Sampling for Camden (10/33)


Sampling from DATGAN: 100%|██████████| 220338/220338 [04:33<00:00, 805.62it/s]


Sampling for Merton (11/33)


Sampling from DATGAN: 100%|██████████| 199693/199693 [04:04<00:00, 817.17it/s]


Sampling for Kingston upon Thames (12/33)


Sampling from DATGAN: 100%|██████████| 160060/160060 [03:13<00:00, 826.27it/s]


Sampling for Brent (13/33)


Sampling from DATGAN: 100%|██████████| 311215/311215 [06:20<00:00, 818.29it/s]


Sampling for Hillingdon (14/33)


Sampling from DATGAN: 100%|██████████| 273936/273936 [05:32<00:00, 823.46it/s]


Sampling for Croydon (15/33)


Sampling from DATGAN: 100%|██████████| 363378/363378 [07:22<00:00, 821.93it/s]


Sampling for Barnet (16/33)


Sampling from DATGAN: 100%|██████████| 356386/356386 [07:12<00:00, 823.55it/s]


Sampling for Havering (17/33)


Sampling from DATGAN: 100%|██████████| 237232/237232 [04:44<00:00, 833.09it/s]


Sampling for Enfield (18/33)


Sampling from DATGAN: 100%|██████████| 312466/312466 [06:14<00:00, 834.65it/s]


Sampling for Southwark (19/33)


Sampling from DATGAN: 100%|██████████| 288283/288283 [05:48<00:00, 827.44it/s]


Sampling for Hammersmith & Fulham (20/33)


Sampling from DATGAN: 100%|██████████| 182493/182493 [03:41<00:00, 823.83it/s]


Sampling for Wandsworth (21/33)


Sampling from DATGAN: 100%|██████████| 306995/306995 [06:13<00:00, 822.09it/s]


Sampling for Newham (22/33)


Sampling from DATGAN: 100%|██████████| 307984/307984 [06:13<00:00, 824.80it/s]


Sampling for Barking and Dagenham (23/33)


Sampling from DATGAN: 100%|██████████| 185911/185911 [03:42<00:00, 837.01it/s]


Sampling for Ealing (24/33)


Sampling from DATGAN: 100%|██████████| 338449/338449 [06:50<00:00, 824.78it/s]


Sampling for Kensington and Chelsea (25/33)


Sampling from DATGAN: 100%|██████████| 158649/158649 [03:12<00:00, 823.08it/s]


Sampling for Redbridge (26/33)


Sampling from DATGAN: 100%|██████████| 278970/278970 [05:38<00:00, 823.32it/s]


Sampling for Waltham Forest (27/33)


Sampling from DATGAN: 100%|██████████| 258249/258249 [05:12<00:00, 826.05it/s]


Sampling for Islington (28/33)


Sampling from DATGAN: 100%|██████████| 206125/206125 [04:08<00:00, 829.28it/s]


Sampling for Sutton (29/33)


Sampling from DATGAN: 100%|██████████| 190146/190146 [03:50<00:00, 825.50it/s]


Sampling for Greenwich (30/33)


Sampling from DATGAN: 100%|██████████| 254557/254557 [05:13<00:00, 811.69it/s]


Sampling for City of London (31/33)


Sampling from DATGAN: 100%|██████████| 7375/7375 [00:11<00:00, 639.58it/s]


Sampling for Lewisham (32/33)


Sampling from DATGAN: 100%|██████████| 275885/275885 [05:35<00:00, 822.63it/s]


Sampling for Westminster (33/33)


Sampling from DATGAN: 100%|██████████| 219396/219396 [04:28<00:00, 818.02it/s]
