In [1]:
from datgan import DATGAN
import datgan

import numpy as np
import pandas as pd
import networkx as nx

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import tensorflow as tf

#tf.config.run_functions_eagerly(True)

In [2]:
# Load the LPMC trips table. index_col=False tells pandas not to use the
# first column of the file as the DataFrame index.
trips_csv = '../../data/LPMC/trips.csv'
df = pd.read_csv(trips_csv, index_col=False)

In [3]:
# Preview the first rows of the loaded trips table.
df.head()

Unnamed: 0,travel_mode,purpose,faretype,day_of_week,start_time_linear,age,female,driving_license,distance,dur_walking,dur_cycling,dur_driving,driving_traffic_percent,hh_vehicles,hh_income,hh_people,dur_pt,hh_region
0,walk,HBO,full,6,15.833333,30,0,1,2145,0.553056,0.1575,0.158333,0.473684,0,35-50k,2,0.436389,Central London
1,pt,HBO,full,7,10.0,50,1,1,1789,0.473333,0.160556,0.135,0.547325,0,15-20k,5,0.271111,Central London
2,pt,HBW,full,5,17.0,55,1,1,10036,2.411667,0.761389,0.638056,0.543317,2,75-100k,2,0.830833,South London
3,drive,HBO,full,6,16.883333,51,1,1,1531,0.423889,0.168611,0.110556,0.268844,1,>100k,2,0.295556,East London
4,pt,HBW,full,3,7.5,39,1,1,1124,0.275833,0.123611,0.081667,0.156463,1,>100k,4,0.124722,South London


In [4]:
# First, define the specificities of continuous variables.
#
# Each entry maps a column name to its description:
#   - 'type':           'continuous' for every entry listed here
#   - 'bounds':         [lower, upper]; np.inf means no finite upper bound
#   - 'discrete':       presumably marks integer-valued columns -- TODO confirm
#                       against the DATGAN documentation
#   - 'enforce_bounds': presumably forces samples to stay inside 'bounds'
#                       -- TODO confirm against the DATGAN documentation
#   - 'apply_func':     transformation of the raw values; log(x + 1) is used
#                       for the distance and the four duration columns
#
# `np.inf` is used instead of the `np.infty` alias, which was removed in
# NumPy 2.0.
data_info = {
    'start_time_linear': {
        'type': 'continuous',
        'bounds': [0.0, 23.999],
        'discrete': False,
    },
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True
    },
    'distance': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': True,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_walking': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_cycling': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_pt': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_driving': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    # NOTE(review): the previewed rows show values below 1 for this column;
    # confirm whether an unbounded upper limit is intended.
    'driving_traffic_percent': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': False,
    },
}

# Every column that was not described above is registered as categorical;
# setdefault leaves the already-defined continuous entries untouched.
for column in df.columns:
    data_info.setdefault(column, {'type': 'categorical'})

In [5]:
# List the columns of the trips table (these names are the keys of
# data_info and the node names used in the graph cell).
df.columns

Index(['travel_mode', 'purpose', 'faretype', 'day_of_week',
       'start_time_linear', 'age', 'female', 'driving_license', 'distance',
       'dur_walking', 'dur_cycling', 'dur_driving', 'driving_traffic_percent',
       'hh_vehicles', 'hh_income', 'hh_people', 'dur_pt', 'hh_region'],
      dtype='object')

In [7]:
# personalised graph
# Directed edges (source, target) between the columns of the trips table.
# The edges are kept in their original order so that node insertion order in
# the DiGraph is unchanged.
edges = [
    # household region
    ('hh_region', 'hh_people'),
    ('hh_region', 'distance'),
    ('hh_region', 'hh_income'),
    ('hh_region', 'travel_mode'),
    # household composition and resources
    ('hh_income', 'hh_vehicles'),
    ('hh_people', 'hh_vehicles'),
    # individual attributes
    ('age', 'hh_people'),
    ('age', 'faretype'),
    ('age', 'driving_license'),
    ('age', 'purpose'),
    ('age', 'travel_mode'),
    ('female', 'driving_license'),
    ('female', 'hh_people'),
    ('driving_license', 'travel_mode'),
    ('hh_vehicles', 'driving_license'),
    ('hh_vehicles', 'travel_mode'),
    ('faretype', 'travel_mode'),
    # trip timing and purpose
    ('day_of_week', 'purpose'),
    ('day_of_week', 'start_time_linear'),
    ('day_of_week', 'driving_traffic_percent'),
    ('purpose', 'start_time_linear'),
    ('purpose', 'travel_mode'),
    ('purpose', 'distance'),
    ('start_time_linear', 'driving_traffic_percent'),
    ('driving_traffic_percent', 'dur_driving'),
    # trip distance and durations
    ('distance', 'driving_traffic_percent'),
    ('distance', 'dur_walking'),
    ('distance', 'dur_cycling'),
    ('distance', 'dur_pt'),
    ('distance', 'dur_driving'),
    ('distance', 'travel_mode'),
]

graph = nx.DiGraph()
graph.add_edges_from(edges)

In [8]:
# Model name; it is inserted into the output folder path in the next cell.
name = 'ciDATGAN'

In [9]:
# Folder passed to DATGAN as its `output` argument, one sub-folder per model name.
output_folder = '../output/{}/'.format(name)

In [10]:
# Build the DATGAN model with conditional inputs: the columns listed below
# are supplied by the caller at sampling time (see the `inputs=` argument of
# the sampling cells).
# NOTE(review): binding the instance to `datgan` shadows the `datgan` module
# imported in the first cell; the module is no longer reachable by that name
# once this cell has run.
conditional_columns = ['age', 'female', 'hh_region']

datgan = DATGAN(
    output=output_folder,
    loss_function='WGGP',
    conditional_inputs=conditional_columns,
    batch_size=1101,
    num_epochs=1000,
)

In [11]:
# Fit the model on the trips table with the variable descriptions and the graph
# defined above. In the recorded run the preprocessed data were loaded from
# `preprocessed_data_path` and the models were restored from epoch 1000, so no
# additional training iterations were performed.
datgan.fit(df, data_info, graph, preprocessed_data_path='../output/encoded_LPMC')

Preprocessed data have been loaded!
Start training DATGAN with the WGGP loss (12/05/2022 16:18:08).
Restored models from epoch 1000.


Training DATGAN: 0it [00:00, ?it/s]

DATGAN has finished training (12/05/2022 16:18:08) - Training time: 00 second





In [12]:
# Draw five synthetic datasets, each with as many rows as the original table,
# conditioning on the conditional-input columns of the original data.
# Files are numbered 01..05.
for run in range(5):
    conditioning = df[datgan.conditional_inputs]
    samp = datgan.sample(len(df), inputs=conditioning)

    target_csv = '../../data/synthetic/test/ciDATGAN_{:02d}.csv'.format(run + 1)
    samp.to_csv(target_csv, index=False)

Sampling from DATGAN: 100%|██████████| 16904/16904 [00:33<00:00, 499.15it/s] 
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:41<00:00, 404.94it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:16<00:00, 1002.95it/s]
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:17<00:00, 970.27it/s] 
Sampling from DATGAN: 100%|██████████| 16904/16904 [00:17<00:00, 963.79it/s] 


# Sampling for all the regions

In [12]:
# Generate one synthetic dataset per region: the conditional inputs are read
# from ../../data/nomis/<region>.csv and one synthetic row is requested per
# row of that file.
# NOTE(review): the [:2] slice limits this run to the first two regions --
# remove it to process every region.
regions = df.hh_region.unique()[:2]

# enumerate replaces the manual counter; the progress total now matches the
# regions that are actually iterated instead of the full list of regions.
for count, r in enumerate(regions, start=1):

    tmp = pd.read_csv('../../data/nomis/{}.csv'.format(r))

    print('Sampling for {} ({}/{})'.format(r, count, len(regions)))

    samp = datgan.sample(len(tmp), inputs=tmp, randomize=False)

    samp.to_csv('../../data/synthetic/regions/ciDATGAN_ALL/{}.csv'.format(r), index=False)

Sampling for Central London (1/5)


Sampling from DATGAN: 100%|██████████| 1314313/1314313 [22:57<00:00, 954.18it/s] 


Sampling for South London (2/5)


Sampling from DATGAN: 100%|██████████| 1421383/1421383 [25:15<00:00, 937.78it/s] 


# Sampling for all the boroughs

In [11]:
count = 1
for r in df.hh_borough.unique():

    tmp = pd.read_csv('../../data/nomis/{}.csv'.format(r))

    print('Sampling for {} ({}/{})'.format(r, count, len(df.hh_borough.unique())))

    samp = datgan.sample(len(tmp), inputs=tmp, randomize=False)

    samp.to_csv('../../data/synthetic/ciDATGAN/{}.csv'.format(r), index=False)

    count += 1

Sampling for Southwark (1/33)


Sampling from DATGAN: 100%|██████████| 288283/288283 [04:52<00:00, 985.98it/s] 


Sampling for Westminster (2/33)


Sampling from DATGAN: 100%|██████████| 219396/219396 [03:35<00:00, 1017.09it/s]


Sampling for Merton (3/33)


Sampling from DATGAN: 100%|██████████| 199693/199693 [03:18<00:00, 1004.75it/s]


Sampling for Lewisham (4/33)


Sampling from DATGAN: 100%|██████████| 275885/275885 [04:34<00:00, 1005.91it/s]


Sampling for Bromley (5/33)


Sampling from DATGAN: 100%|██████████| 309392/309392 [05:13<00:00, 987.80it/s] 


Sampling for Tower Hamlets (6/33)


Sampling from DATGAN: 100%|██████████| 254096/254096 [04:15<00:00, 994.94it/s] 


Sampling for Lambeth (7/33)


Sampling from DATGAN: 100%|██████████| 303086/303086 [05:05<00:00, 990.52it/s] 


Sampling for Hackney (8/33)


Sampling from DATGAN: 100%|██████████| 246270/246270 [04:07<00:00, 997.01it/s] 


Sampling for Wandsworth (9/33)


Sampling from DATGAN: 100%|██████████| 306995/306995 [05:08<00:00, 995.08it/s] 


Sampling for Enfield (10/33)


Sampling from DATGAN: 100%|██████████| 312466/312466 [05:14<00:00, 992.15it/s] 


Sampling for Bexley (11/33)


Sampling from DATGAN: 100%|██████████| 231997/231997 [03:50<00:00, 1006.49it/s]


Sampling for City of London (12/33)


Sampling from DATGAN: 100%|██████████| 7375/7375 [00:08<00:00, 852.62it/s] 


Sampling for Greenwich (13/33)


Sampling from DATGAN: 100%|██████████| 254557/254557 [04:13<00:00, 1002.87it/s]


Sampling for Hillingdon (14/33)


Sampling from DATGAN: 100%|██████████| 273936/273936 [04:31<00:00, 1008.31it/s]


Sampling for Brent (15/33)


Sampling from DATGAN: 100%|██████████| 311215/311215 [05:10<00:00, 1002.89it/s]


Sampling for Havering (16/33)


Sampling from DATGAN: 100%|██████████| 237232/237232 [03:55<00:00, 1009.39it/s]


Sampling for Croydon (17/33)


Sampling from DATGAN: 100%|██████████| 363378/363378 [06:07<00:00, 988.33it/s] 


Sampling for Hounslow (18/33)


Sampling from DATGAN: 100%|██████████| 253957/253957 [04:15<00:00, 993.09it/s] 


Sampling for Harrow (19/33)


Sampling from DATGAN: 100%|██████████| 239056/239056 [03:59<00:00, 996.18it/s] 


Sampling for Richmond upon Thames (20/33)


Sampling from DATGAN: 100%|██████████| 186990/186990 [03:07<00:00, 995.50it/s] 


Sampling for Hammersmith & Fulham (21/33)


Sampling from DATGAN: 100%|██████████| 182493/182493 [03:04<00:00, 987.49it/s] 


Sampling for Islington (22/33)


Sampling from DATGAN: 100%|██████████| 206125/206125 [03:28<00:00, 990.52it/s] 


Sampling for Sutton (23/33)


Sampling from DATGAN: 100%|██████████| 190146/190146 [03:13<00:00, 984.24it/s] 


Sampling for Haringey (24/33)


Sampling from DATGAN: 100%|██████████| 254926/254926 [04:17<00:00, 989.42it/s] 


Sampling for Kensington and Chelsea (25/33)


Sampling from DATGAN: 100%|██████████| 158649/158649 [02:39<00:00, 992.76it/s] 


Sampling for Barnet (26/33)


Sampling from DATGAN: 100%|██████████| 356386/356386 [05:59<00:00, 991.82it/s] 


Sampling for Camden (27/33)


Sampling from DATGAN: 100%|██████████| 220338/220338 [03:38<00:00, 1007.22it/s]


Sampling for Barking and Dagenham (28/33)


Sampling from DATGAN: 100%|██████████| 185911/185911 [03:05<00:00, 1000.23it/s]


Sampling for Ealing (29/33)


Sampling from DATGAN: 100%|██████████| 338449/338449 [05:38<00:00, 999.03it/s] 


Sampling for Waltham Forest (30/33)


Sampling from DATGAN: 100%|██████████| 258249/258249 [04:16<00:00, 1006.43it/s]


Sampling for Newham (31/33)


Sampling from DATGAN: 100%|██████████| 307984/307984 [05:08<00:00, 998.28it/s] 


Sampling for Kingston upon Thames (32/33)


Sampling from DATGAN: 100%|██████████| 160060/160060 [02:39<00:00, 1003.74it/s]


Sampling for Redbridge (33/33)


Sampling from DATGAN: 100%|██████████| 278970/278970 [04:43<00:00, 984.34it/s] 
