In [1]:
from datgan import DATGAN
import datgan

import numpy as np
import pandas as pd
import networkx as nx

# Notebook-only IPython magics (these lines are not valid in a plain .py script).
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to imported source files are picked up without restarting.
%reload_ext autoreload
%autoreload 2

In [2]:
# Load the LPMC trip records. index_col=False tells pandas not to use the
# first column of the file as the row index.
df = pd.read_csv(
    '../../data/LPMC/trips.csv',
    index_col=False,
)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,travel_year,travel_month,travel_date,day_of_week,start_time_linear,...,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent,hh_vehicles,hh_borough,hh_income,hh_people,dur_pt_int
0,drive,HBO,Diesel_Car,full,1.0,2012,8,7,2,20.0,...,0.208611,1.5,0.57,0.0,0.098535,1,Bexley,35-50k,2,0.0
1,drive,HBW,Diesel_Car,full,1.0,2013,2,8,5,15.0,...,0.471944,3.0,1.62,0.0,0.354915,1,Harrow,5-10k,3,0.133333
2,pt,HBO,Average_Car,full,0.5,2014,10,8,3,14.0,...,0.238333,0.75,0.62,0.0,0.212121,0,Lambeth,50-75k,4,0.0
3,pt,HBE,Average_Car,dis,0.5,2014,3,10,1,10.5,...,0.308889,0.75,0.6,10.5,0.684353,0,Hackney,10-15k,2,0.0
4,walk,HBW,Petrol_Car,full,1.0,2013,1,24,4,16.833333,...,0.0775,1.5,0.19,0.0,0.046595,2,Lambeth,50-75k,5,0.0


In [4]:
# Metadata for the continuous columns, keyed by column name. Every column that
# is not listed here is registered as categorical by the loop that follows.
#
# Keys used per entry (semantics as understood from the DATGAN documentation;
# confirm against the installed version):
#   'type'           - 'continuous' for every entry in this literal
#   'bounds'         - [lower, upper] range of the variable
#   'discrete'       - presumably whether sampled values are rounded to integers
#   'enforce_bounds' - (optional) presumably whether samples are forced into 'bounds'
#   'apply_func'     - (optional) transform applied to the column; here always log(x + 1)
#
# Open upper bounds use np.inf: the `np.infty` alias was removed in NumPy 2.0
# and raises AttributeError there, while np.inf works on every NumPy version.
data_info = {
    'start_time_linear': {
        'type': 'continuous',
        'bounds': [0.0, 23.999],
        'discrete': False,
    },
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True
    },
    'distance': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': True,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_walking': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_cycling': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_pt_access': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_pt_rail': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_bus': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_pt_int': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'dur_driving': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'cost_transit': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
    },
    'cost_driving_fuel': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'driving_traffic_percent': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': False,
    },
}

# Every column without an explicit continuous entry is treated as categorical.
# setdefault leaves existing entries untouched and builds a fresh dict per new column.
for column in df.columns:
    data_info.setdefault(column, {'type': 'categorical'})

In [5]:
# Hand-specified directed graph over the column names of df. Each tuple is a
# directed edge (source, target). The graph is passed to datgan.fit together
# with df and data_info.
#
# The edge ("travel_date", "day_of_week") used to be listed twice. Only the
# first listing is kept, so node and edge insertion order are unchanged and
# the resulting graph is identical.
graph = nx.DiGraph()

graph.add_edges_from([
    ("travel_year", "travel_month"),
    ("travel_date", "day_of_week"),
    ("travel_month", "travel_date"),
    ("travel_month", "driving_traffic_percent"),
    ("travel_month", "day_of_week"),
    ("travel_month", "travel_mode"),
    ("day_of_week", "driving_traffic_percent"),
    ("day_of_week", "cost_driving_con_charge"),
    ("day_of_week", "purpose"),
    ("day_of_week", "start_time_linear"),
    ("day_of_week", "travel_mode"),
    ("purpose", "distance"),
    ("purpose", "start_time_linear"),
    ("purpose", "travel_mode"),
    ("start_time_linear", "driving_traffic_percent"),
    ("start_time_linear", "cost_driving_con_charge"),
    ("start_time_linear", "travel_mode"),
    ("hh_vehicles", "fueltype"),
    ("hh_vehicles", "driving_license"),
    ("hh_vehicles", "travel_mode"),
    ("fueltype", "cost_driving_con_charge"),
    ("fueltype", "cost_driving_fuel"),
    ("female", "driving_license"),
    ("female", "travel_mode"),
    ("age", "bus_scale"),
    ("age", "driving_license"),
    ("age", "faretype"),
    ("age", "travel_mode"),
    ("driving_license", "travel_mode"),
    ("faretype", "cost_transit"),
    ("faretype", "bus_scale"),
    ("faretype", "travel_mode"),
    ("bus_scale", "cost_transit"),
    ("distance", "cost_driving_fuel"),
    ("distance", "dur_driving"),
    ("distance", "dur_walking"),
    ("distance", "dur_cycling"),
    ("distance", "dur_pt_access"),
    ("distance", "dur_pt_rail"),
    ("distance", "dur_pt_bus"),
    ("distance", "dur_pt_int"),
    ("distance", "pt_n_interchanges"),
    ("distance", "travel_mode"),
    ("pt_n_interchanges", "dur_pt_rail"),
    ("pt_n_interchanges", "dur_pt_bus"),
    ("pt_n_interchanges", "dur_pt_int"),
    ("pt_n_interchanges", "cost_transit"),
    ("driving_traffic_percent", "cost_driving_con_charge"),
    ("driving_traffic_percent", "travel_mode"),
    ("cost_driving_fuel", "cost_driving_con_charge"),
    ("cost_driving_fuel", "travel_mode"),
    ("cost_driving_con_charge", "travel_mode"),
    ("dur_driving", "travel_mode"),
    ("dur_walking", "travel_mode"),
    ("dur_cycling", "travel_mode"),
    ("dur_pt_access", "travel_mode"),
    ("dur_pt_rail", "cost_transit"),
    ("dur_pt_rail", "travel_mode"),
    ("dur_pt_bus", "cost_transit"),
    ("dur_pt_bus", "travel_mode"),
    ("dur_pt_int", "travel_mode"),
    ("cost_transit", "travel_mode"),
    ("hh_borough", "hh_income"),
    ("hh_borough", "travel_mode"),
    ("hh_income", "hh_vehicles"),
    ("hh_income", "age"),
    ("hh_income", "hh_people"),
    ("hh_people", "age"),
    ("hh_people", "female")
])


In [6]:
# Optional check of the graph against the data, left disabled.
# NOTE(review): this relies on `datgan` still being the imported module; a later
# cell rebinds `datgan` to a DATGAN instance, so run this before that cell.
#datgan.advise(df, graph)

# Vanilla DATGAN

In [None]:
# Run identifier; it is interpolated into the output folder path in the next cell.
name = 'LPMC'

In [8]:
# Folder handed to DATGAN as its `output` argument for this run.
output_folder = f'../output/{name}/'

In [17]:
# Unconditional ("vanilla") model using the WGGP loss.
# NOTE(review): this rebinds `datgan`, shadowing the `datgan` module imported at
# the top of the notebook, so module-level helpers such as datgan.advise are no
# longer reachable through that name after this cell.
# NOTE(review): 1101 * 16 = 17616, the row count reported when sampling len(df),
# so the batch size was presumably chosen to divide the dataset evenly - confirm.
datgan = DATGAN(
    output=output_folder,
    loss_function='WGGP',
    num_epochs=1000,
    batch_size=1101,
)

In [10]:
# Train on the raw data with the column metadata and the graph defined earlier.
# preprocessed_data_path is where the encoded data lives; it is presumably
# reused on later runs - confirm against the DATGAN documentation.
datgan.fit(
    df,
    data_info,
    graph,
    preprocessed_data_path='../output/encoded_LPMC',
)

Preprocessing the data!
Encoding categorical variable "travel_mode"...
Encoding categorical variable "purpose"...
Encoding categorical variable "fueltype"...
Encoding categorical variable "faretype"...
Encoding categorical variable "bus_scale"...
Encoding categorical variable "travel_year"...
Encoding categorical variable "travel_month"...
Encoding categorical variable "travel_date"...
Encoding categorical variable "day_of_week"...
Encoding continuous variable "start_time_linear"...
Encoding continuous variable "age"...
Encoding categorical variable "female"...
Encoding categorical variable "driving_license"...
Encoding continuous variable "distance"...
Encoding continuous variable "dur_walking"...
Encoding continuous variable "dur_cycling"...
Encoding continuous variable "dur_pt_access"...
Encoding continuous variable "dur_pt_rail"...
Encoding continuous variable "dur_pt_bus"...
Encoding categorical variable "pt_n_interchanges"...
Encoding continuous variable "dur_driving"...
Encoding

Training DATGAN: 100%|██████████| 1000/1000 [1:06:32<00:00,  3.99s/it]

DATGAN has finished training (29/04/2022 12:52:57) - Training time: 01 hour, 06 minutes, and 33 seconds





In [11]:
# Draw as many synthetic rows as the original dataset has (the vanilla model
# takes no conditional inputs) and write them to disk without the index.
samp = datgan.sample(len(df))
samp.to_csv(
    '../../data/synthetic/LPMC.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 17616/17616 [00:22<00:00, 783.18it/s]


# DATGAN with conditional inputs

In [7]:
# Run identifier for the conditional model; it is interpolated into the output folder path.
name = 'LPMC_cond'

In [8]:
# Folder handed to DATGAN as its `output` argument for this run.
output_folder = f'../output/{name}/'

In [9]:
# Conditional model: 'age', 'female' and 'hh_borough' are supplied as inputs
# at sampling time instead of being generated.
# NOTE(review): this rebinds `datgan`, shadowing the `datgan` module imported at
# the top of the notebook.
datgan = DATGAN(
    output=output_folder,
    conditional_inputs=['age', 'female', 'hh_borough'],
    loss_function='WGGP',
    num_epochs=1000,
    batch_size=1101,
)

In [10]:
# Fit the conditional model on the same data, metadata and graph.
# NOTE(review): the saved output of this cell shows a checkpoint being restored
# at epoch 1000 with no training iterations, so a re-run in a clean output
# folder would train from scratch - confirm this is intended.
datgan.fit(
    df,
    data_info,
    graph,
    preprocessed_data_path='../output/encoded_LPMC',
)

Preprocessed data have been loaded!
Start training DATGAN with the WGGP loss (30/04/2022 17:52:08).
Restored models from epoch 1000.


Training DATGAN: 0it [00:00, ?it/s]

DATGAN has finished training (30/04/2022 17:52:09) - Training time: 00 second





In [17]:
# Sample one synthetic row per original row, conditioning on the original
# values of the model's conditional input columns.
samp = datgan.sample(
    len(df),
    inputs=df[datgan.conditional_inputs],
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 17616/17616 [00:22<00:00, 786.86it/s]


In [21]:
# Conditional inputs for the 100k run.
# NOTE(review): presumably holds the model's conditional input columns
# (age, female, hh_borough) - verify the schema of this file.
small = pd.read_csv('../../data/nomis/100k.csv')

In [24]:
# One synthetic row per row of the 100k conditional-input table.
samp = datgan.sample(
    len(small),
    inputs=small,
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond_100k.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 100000/100000 [02:04<00:00, 805.94it/s]


In [25]:
# Conditional inputs for the 1M run.
# NOTE(review): presumably holds the model's conditional input columns
# (age, female, hh_borough) - verify the schema of this file.
large = pd.read_csv('../../data/nomis/1M.csv')

In [26]:
# One synthetic row per row of the 1M conditional-input table.
samp = datgan.sample(
    len(large),
    inputs=large,
)
samp.to_csv(
    '../../data/synthetic/LPMC_cond_1M.csv',
    index=False,
)

Sampling from DATGAN: 100%|██████████| 1000000/1000000 [21:48<00:00, 764.49it/s]


## All the boroughs

In [11]:
# Generate one synthetic population per borough present in the training data.
# For each borough the conditional inputs are read from
# '../../data/nomis/<borough>.csv' and the result is written to
# '../../data/synthetic/LPMC_cond_<borough>.csv'.
boroughs = df.hh_borough.unique()  # computed once instead of on every iteration
n_boroughs = len(boroughs)

# enumerate(start=1) replaces the hand-maintained counter; the printed progress
# lines are unchanged.
for count, borough in enumerate(boroughs, start=1):

    borough_inputs = pd.read_csv('../../data/nomis/{}.csv'.format(borough))

    print('Sampling for {} ({}/{})'.format(borough, count, n_boroughs))

    # randomize=False presumably keeps the generated rows aligned with the
    # input rows - confirm against the DATGAN.sample documentation.
    samp = datgan.sample(len(borough_inputs), inputs=borough_inputs, randomize=False)

    samp.to_csv('../../data/synthetic/LPMC_cond_{}.csv'.format(borough), index=False)

Sampling for Bexley (1/49)


Sampling from DATGAN: 100%|██████████| 231997/231997 [04:56<00:00, 782.63it/s]


Sampling for Harrow (2/49)


Sampling from DATGAN: 100%|██████████| 239056/239056 [05:01<00:00, 793.06it/s]


Sampling for Lambeth (3/49)


Sampling from DATGAN: 100%|██████████| 303086/303086 [06:20<00:00, 796.50it/s]


Sampling for Hackney (4/49)


Sampling from DATGAN: 100%|██████████| 246270/246270 [05:17<00:00, 774.82it/s]


Sampling for Bromley (5/49)


Sampling from DATGAN: 100%|██████████| 309392/309392 [06:34<00:00, 783.70it/s]


Sampling for Haringey (6/49)


Sampling from DATGAN: 100%|██████████| 254926/254926 [05:23<00:00, 787.55it/s]


Sampling for Hounslow (7/49)


Sampling from DATGAN: 100%|██████████| 253957/253957 [05:22<00:00, 787.37it/s]


Sampling for Tower Hamlets (8/49)


Sampling from DATGAN: 100%|██████████| 254096/254096 [05:23<00:00, 784.42it/s]


Sampling for Richmond upon Thames (9/49)


Sampling from DATGAN: 100%|██████████| 186990/186990 [03:56<00:00, 789.79it/s]


Sampling for Camden (10/49)


Sampling from DATGAN: 100%|██████████| 220338/220338 [04:39<00:00, 787.20it/s]


Sampling for Merton (11/49)


Sampling from DATGAN: 100%|██████████| 199693/199693 [04:12<00:00, 790.70it/s]


Sampling for Kingston upon Thames (12/49)


Sampling from DATGAN: 100%|██████████| 160060/160060 [03:20<00:00, 797.91it/s]


Sampling for Brent (13/49)


Sampling from DATGAN: 100%|██████████| 311215/311215 [06:32<00:00, 793.00it/s]


Sampling for Hillingdon (14/49)


Sampling from DATGAN: 100%|██████████| 273936/273936 [05:45<00:00, 793.49it/s]


Sampling for Croydon (15/49)


Sampling from DATGAN: 100%|██████████| 363378/363378 [07:49<00:00, 773.84it/s]


Sampling for Barnet (16/49)


Sampling from DATGAN: 100%|██████████| 356386/356386 [07:38<00:00, 777.13it/s]


Sampling for Havering (17/49)


Sampling from DATGAN: 100%|██████████| 237232/237232 [05:01<00:00, 785.71it/s]


Sampling for Enfield (18/49)


Sampling from DATGAN: 100%|██████████| 312466/312466 [06:39<00:00, 781.29it/s]


Sampling for Southwark (19/49)


Sampling from DATGAN: 100%|██████████| 288283/288283 [06:09<00:00, 780.91it/s]


Sampling for Hammersmith & Fulham (20/49)


Sampling from DATGAN: 100%|██████████| 182493/182493 [03:54<00:00, 779.23it/s]


Sampling for Dartford (21/49)


Sampling from DATGAN: 100%|██████████| 97365/97365 [02:05<00:00, 776.32it/s]


Sampling for Wandsworth (22/49)


Sampling from DATGAN: 100%|██████████| 306995/306995 [06:29<00:00, 789.09it/s]


Sampling for Newham (23/49)


Sampling from DATGAN: 100%|██████████| 307984/307984 [06:30<00:00, 787.77it/s]


Sampling for Barking and Dagenham (24/49)


Sampling from DATGAN: 100%|██████████| 185911/185911 [03:56<00:00, 785.41it/s]


Sampling for Ealing (25/49)


Sampling from DATGAN: 100%|██████████| 338449/338449 [07:10<00:00, 786.58it/s]


Sampling for Kensington and Chelsea (26/49)


Sampling from DATGAN: 100%|██████████| 158649/158649 [03:20<00:00, 789.62it/s]


Sampling for Redbridge (27/49)


Sampling from DATGAN: 100%|██████████| 278970/278970 [05:53<00:00, 789.43it/s]


Sampling for Waltham Forest (28/49)


Sampling from DATGAN: 100%|██████████| 258249/258249 [05:24<00:00, 794.70it/s]


Sampling for Islington (29/49)


Sampling from DATGAN: 100%|██████████| 206125/206125 [04:19<00:00, 792.97it/s]


Sampling for Sutton (30/49)


Sampling from DATGAN: 100%|██████████| 190146/190146 [04:00<00:00, 790.84it/s]


Sampling for Greenwich (31/49)


Sampling from DATGAN: 100%|██████████| 254557/254557 [05:18<00:00, 798.76it/s]


Sampling for City of London (32/49)


Sampling from DATGAN: 100%|██████████| 7375/7375 [00:11<00:00, 629.71it/s]


Sampling for Lewisham (33/49)


Sampling from DATGAN: 100%|██████████| 275885/275885 [05:49<00:00, 789.76it/s]


Sampling for Epping Forest (34/49)


Sampling from DATGAN: 100%|██████████| 124659/124659 [02:39<00:00, 781.12it/s]


Sampling for Westminster (35/49)


Sampling from DATGAN: 100%|██████████| 219396/219396 [04:33<00:00, 803.32it/s]


Sampling for Tandridge (36/49)


Sampling from DATGAN: 100%|██████████| 82998/82998 [01:44<00:00, 791.20it/s]


Sampling for Hertsmere (37/49)


Sampling from DATGAN: 100%|██████████| 100031/100031 [02:05<00:00, 794.70it/s]


Sampling for Three Rivers (38/49)


Sampling from DATGAN: 100%|██████████| 87317/87317 [01:49<00:00, 795.20it/s]


Sampling for Epsom and Ewell (39/49)


Sampling from DATGAN: 100%|██████████| 75102/75102 [01:34<00:00, 795.08it/s]


Sampling for Elmbridge (40/49)


Sampling from DATGAN: 100%|██████████| 130875/130875 [02:46<00:00, 783.84it/s]


Sampling for Spelthorne (41/49)


Sampling from DATGAN: 100%|██████████| 95598/95598 [02:00<00:00, 790.29it/s]


Sampling for Sevenoaks (42/49)


Sampling from DATGAN: 100%|██████████| 114893/114893 [02:24<00:00, 796.86it/s]


Sampling for South Bucks (43/49)


Sampling from DATGAN: 100%|██████████| 66867/66867 [01:25<00:00, 783.22it/s]


Sampling for Reigate and Banstead (44/49)


Sampling from DATGAN: 100%|██████████| 137835/137835 [02:53<00:00, 795.45it/s]


Sampling for Watford (45/49)


Sampling from DATGAN: 100%|██████████| 90301/90301 [01:54<00:00, 790.53it/s]


Sampling for Thurrock (46/49)


Sampling from DATGAN: 100%|██████████| 157705/157705 [03:18<00:00, 794.63it/s]


Sampling for Runnymede (47/49)


Sampling from DATGAN: 100%|██████████| 80510/80510 [01:42<00:00, 784.80it/s]


Sampling for Mole Valley (48/49)


Sampling from DATGAN: 100%|██████████| 85375/85375 [01:48<00:00, 788.62it/s]


Sampling for Woking (49/49)


Sampling from DATGAN: 100%|██████████| 99198/99198 [02:04<00:00, 795.61it/s]
