In [1]:
from datgan import DATGAN
import datgan

import numpy as np
import pandas as pd
import networkx as nx

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import tensorflow as tf
#tf.config.run_functions_eagerly(True)

In [9]:
# Load the LPMC trips table from the repository's data folder.
# index_col=False tells pandas not to use the first column as the row index.
df = pd.read_csv(
    '../data/LPMC/trips.csv',
    index_col=False,
)

In [10]:
# Per-column settings for the continuous variables, keyed by column name.
# Each entry declares the column as 'continuous' and gives its [lower, upper]
# bounds plus a 'discrete' flag; some entries additionally set
# 'enforce_bounds' and/or an 'apply_func' transform.
# NOTE(review): the exact meaning of each field is defined by DATGAN, not by
# this notebook -- check the DATGAN documentation before changing them.
#
# Unbounded upper limits use `np.inf`: the `np.infty` alias was removed in
# NumPy 2.0, whereas `np.inf` works on every NumPy version and is the same
# float value.
data_info = {
    'start_time_linear': {
        'type': 'continuous',
        'bounds': [0.0, 23.999],
        'discrete': False,
    },
    'age': {
        'type': 'continuous',
        'bounds': [0, 100],
        'discrete': True
    },
    'distance': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': True,
        # log(x + 1) maps 0 to 0 and compresses large values.
        'apply_func': (lambda x: np.log(x+1))
    },
    # The four duration columns share the same settings.
    'dur_walking': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_cycling': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_pt': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'dur_driving': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'enforce_bounds': True,
        'discrete': False,
        'apply_func': (lambda x: np.log(x+1))
    },
    'driving_traffic_percent': {
        'type': 'continuous',
        'bounds': [0, np.inf],
        'discrete': False,
    },
}

# Every column of `df` that has no explicit entry in `data_info` is
# registered as a categorical variable; existing entries are left untouched.
for column in df.columns:
    data_info.setdefault(column, {'type': 'categorical'})

In [11]:
# Hand-specified directed graph over the dataset's column names.
# Each key is a source node and each listed name is a target, i.e. the graph
# gets one edge source -> target per pair.  Entries are kept in the order the
# edges are inserted, so node and edge insertion order is deterministic.
# NOTE(review): presumably handed to DATGAN later as the variable-dependency
# structure -- that call is not in this part of the notebook; confirm.
children_of = {
    'hh_region': ['hh_people', 'distance', 'hh_income', 'travel_mode'],
    'hh_income': ['hh_vehicles'],
    'hh_people': ['hh_vehicles'],
    'age': ['hh_people', 'faretype', 'driving_license', 'purpose', 'travel_mode'],
    'female': ['driving_license', 'hh_people'],
    'driving_license': ['travel_mode'],
    'hh_vehicles': ['driving_license', 'travel_mode'],
    'faretype': ['travel_mode'],
    'day_of_week': ['purpose', 'start_time_linear', 'driving_traffic_percent'],
    'purpose': ['start_time_linear', 'travel_mode', 'distance'],
    'start_time_linear': ['driving_traffic_percent'],
    'driving_traffic_percent': ['dur_driving'],
    'distance': [
        'driving_traffic_percent',
        'dur_walking',
        'dur_cycling',
        'dur_pt',
        'dur_driving',
        'travel_mode',
    ],
}

graph = nx.DiGraph()
graph.add_edges_from(
    (parent, child)
    for parent, children in children_of.items()
    for child in children
)

In [30]:
# Rebind `df` to the table stored in trips_small_bias.csv; whatever `df`
# referred to before this cell is no longer reachable under this name.
# index_col=False tells pandas not to use the first column as the row index.
df = pd.read_csv(
    '../data/LPMC/trips_small_bias.csv',
    index_col=False,
)

In [31]:
# Folder (relative to the notebook's working directory) used for this run's
# outputs; the trailing slash is part of the value.
output_folder = './output/small_bias/encoded_data/'

In [32]:
# Build the DATGAN model object with this run's settings.
# NOTE(review): this rebinds the name `datgan`, which the import cell also
# binds to the `datgan` module; after this cell the name refers to the model
# instance, not the module.
# NOTE(review): batch_size=1101 and num_epochs=1000 are unexplained here --
# consider documenting how they were chosen.
datgan = DATGAN(
    output=output_folder,
    loss_function='WGGP',
    batch_size=1101,
    num_epochs=1000,
)

In [None]:
# Run DATGAN's preprocessing step on `df` using the per-column settings in
# `data_info`; the printed log shows it encoding each variable in turn.
# NOTE(review): the third argument is presumably where the encoded data is
# stored -- confirm against the DATGAN `preprocess` documentation.
datgan.preprocess(df, data_info, output_folder)

Preprocessing the data!
Encoding categorical variable "travel_mode"...
Encoding categorical variable "purpose"...
Encoding categorical variable "faretype"...
Encoding categorical variable "day_of_week"...
Encoding continuous variable "start_time_linear"...
