In [1]:
from datgan import DATGAN

import numpy as np
import pandas as pd
import networkx as nx
import tensorflow as tf

# Run the model eagerly
#tf.config.run_functions_eagerly(True)

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Load the original data

In [2]:
# Read the original CMAP dataset; index_col=False tells pandas not to use
# the first column of the file as the index.
df = pd.read_csv(
    './data/CMAP.csv',
    index_col=False,
)

In [3]:
# Preview the first five rows to check the columns and their values.
df.head(n=5)

Unnamed: 0,choice,travel_dow,trip_purpose,distance,hh_vehicles,hh_size,hh_bikes,hh_descr,hh_income,gender,age,license,education_level,work_status,departure_time
0,drive,7,HOME_OTHER,3.93477,2,3,3,detached,6,0,30,1,4,PTE,20.166667
1,drive,2,SHOPPING,0.31557,3,3,3,detached,7,0,54,1,5,FTE,17.5
2,drive,2,SHOPPING,0.28349,1,1,0,detached,3,0,80,1,3,PTE,9.333333
3,drive,2,OTHER,0.69417,2,2,0,detached,5,1,42,1,5,FTE,13.783333
4,passenger,1,SHOPPING,4.30666,2,2,1,detached,4,0,32,0,3,Unemployed,11.566667


# Provide information on the data

We need to provide the data type of each column and give a bit more information about the continuous columns.

In our case, `distance` follows an exponential distribution, so we transform it with a logarithm (`log(x + 1)`) to make the distribution easier for the generator to learn.

In [4]:
# Per-column metadata handed to DATGAN. Continuous columns are described
# explicitly here; every remaining column is registered as categorical below.
data_info = {
    # Trip distance: log(x + 1) is applied because the raw values follow an
    # exponential-like distribution that is hard for the generator to learn.
    'distance': {
        'type': 'continuous',
        # np.inf is the canonical spelling; the np.infty alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        'bounds': [0.0, np.inf],
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1)),
    },
    # Age in years, bounded to [0, 100].
    # NOTE(review): 'discrete': True presumably marks integer-valued data --
    # confirm against the DATGAN documentation.
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True
    },
    # Departure time, bounded to [0, 23.999] (presumably hours of the day).
    'departure_time': {
        'type': 'continuous',
        'bounds': [0, 23.999],
        'discrete': False
    }
}

# Any column without an explicit entry above is treated as categorical.
# setdefault leaves the continuous definitions untouched.
for column in df.columns:
    data_info.setdefault(column, {'type': 'categorical'})

# DAG

We need to define the DAG using the library `networkx`.

In [5]:
# Build the DAG directly from its (parent, child) edge list. The edges are
# listed in the same order as before so node/edge insertion order is unchanged.
graph = nx.DiGraph([
    # Individual attributes
    ("age", "license"),
    ("age", "education_level"),
    ("gender", "work_status"),
    ("education_level", "work_status"),
    # Household attributes
    ("education_level", "hh_income"),
    ("work_status", "hh_income"),
    ("hh_income", "hh_descr"),
    ("hh_income", "hh_size"),
    ("hh_size", "hh_vehicles"),
    ("hh_size", "hh_bikes"),
    # Trip attributes
    ("work_status", "trip_purpose"),
    ("trip_purpose", "departure_time"),
    ("trip_purpose", "distance"),
    # Direct parents of the mode choice
    ("travel_dow", "choice"),
    ("distance", "choice"),
    ("departure_time", "choice"),
    ("hh_vehicles", "choice"),
    ("hh_bikes", "choice"),
    ("license", "choice"),
    # Additional links
    ("education_level", "hh_size"),
    ("work_status", "hh_descr"),
    ("work_status", "hh_size"),
    ("hh_income", "hh_bikes"),
    ("hh_income", "hh_vehicles"),
    ("trip_purpose", "choice"),
])

# Training the DATGAN

In [6]:
# Folder passed to DATGAN as its `output` argument.
output_folder = './output/'

In [7]:
# Choosing the right batch size avoids wasting data while training the models.
# NOTE(review): presumably the value should divide the number of rows evenly
# so that no partial batch is dropped -- confirm against the dataset size.
batch_size = 1116

In [8]:
# Instantiate the model with the output folder and batch size chosen above,
# training for 1000 epochs.
datgan = DATGAN(
    output=output_folder,
    batch_size=batch_size,
    num_epochs=1000,
)

It is possible to preprocess the data once and save the result to disk. Since preprocessing takes a bit of time, reusing the saved result makes it faster to test multiple models.

In [9]:
# Preprocess the data using the column metadata; the result is stored
# under './encoded_data' so later runs can reuse it.
datgan.preprocess(
    df,
    data_info,
    preprocessed_data_path='./encoded_data',
)

Preprocessed data have been loaded!


If the data has been preprocessed, you need to provide the path where it was saved. The model will then load the preprocessed data and work with it. If no preprocessed data are found in the specified folder, the model will still preprocess them before training the networks.

In [10]:
# Train the model on the data, column metadata and DAG, pointing it at the
# folder that holds the preprocessed data.
datgan.fit(
    df,
    data_info,
    graph,
    preprocessed_data_path='./encoded_data',
)

Preprocessed data have been loaded!
Start training DATGAN with the WGAN loss (04/03/2022 11:46:12).


Training DATGAN: 100%|█████████████████████████████████████████████████████████████| 1000/1000 [23:21<00:00,  1.40s/it]

DATGAN has finished training (04/03/2022 12:09:33) - Training time: 23 minutes and 21 seconds





# Sample the synthetic data

In [11]:
# Generate as many synthetic rows as there are rows in the original dataset.
samples = datgan.sample(df.shape[0])

# Save the new dataset

In [12]:
# Write the synthetic dataset to disk, leaving out the DataFrame index.
samples.to_csv(
    './data/CMAP_synthetic.csv',
    index=False,
)