In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import OrdinalEncoder

from datgan import stats_assessment
from datgan import ml_assessment, transform_results


# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
df_orig = pd.read_csv('../../data/LPMC/trips.csv', index_col=False)

continuous_columns = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt','dur_driving', 'driving_traffic_percent']

cond_inputs = ["age", "female", "hh_region"]

synth_files = []

for i in range(5):
    for j in range(5):
        synth_files.append('../../data/synthetic/normal/ciDATGAN_{}_{}.csv'.format(i+1, j+1))
        synth_files.append('../../data/synthetic/normal/DATGAN_{}_{}.csv'.format(i+1, j+1))

len_df = len(df_orig)

In [3]:
results_path = './results/'

if not os.path.exists(results_path):
    os.makedirs(results_path)

In [4]:
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']
colors = [(0.6627450980392157, 0.6627450980392157, 0.6627450980392157, 1.0), (1.0, 0.0, 0.0, 0.0)]

# First level

In [5]:
pickle_name = 'res_lvl1.pickle'
aggregation_level = 1

first_lvl_stats = {}

try:
    first_lvl_stats = pickle.load(open(results_path + pickle_name, 'rb'))
    print('Found previous pickle file, using that')
except:
    print('No previous results found, starting fresh')

Found previous pickle file, using that


In [6]:
for i, f in enumerate(synth_files):

    file_name = f.split('/')[-1].split('.')[0]

    if file_name in first_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        first_lvl_stats[file_name] = {}

        df_synth = pd.read_csv(f, index_col=False)
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level, ignore_cols=cond_inputs)

        first_lvl_stats[file_name] = stats

    pickle.dump(first_lvl_stats, open(results_path + pickle_name, 'wb'))

print("\033[1mFINISHED!\033[0m")

Results for file [1mciDATGAN_1_1[0m (1/50) already exists!
Results for file [1mDATGAN_1_1[0m (2/50) already exists!
Results for file [1mciDATGAN_1_2[0m (3/50) already exists!
Results for file [1mDATGAN_1_2[0m (4/50) already exists!
Results for file [1mciDATGAN_1_3[0m (5/50) already exists!
Results for file [1mDATGAN_1_3[0m (6/50) already exists!
Results for file [1mciDATGAN_1_4[0m (7/50) already exists!
Results for file [1mDATGAN_1_4[0m (8/50) already exists!
Results for file [1mciDATGAN_1_5[0m (9/50) already exists!
Results for file [1mDATGAN_1_5[0m (10/50) already exists!
Results for file [1mciDATGAN_2_1[0m (11/50) already exists!
Results for file [1mDATGAN_2_1[0m (12/50) already exists!
Results for file [1mciDATGAN_2_2[0m (13/50) already exists!
Results for file [1mDATGAN_2_2[0m (14/50) already exists!
Results for file [1mciDATGAN_2_3[0m (15/50) already exists!
Results for file [1mDATGAN_2_3[0m (16/50) already exists!
Results for file [1mciDATGAN_2_4

In [7]:
res = {}

for test in ['all', 'cont', 'cat']:

    res[test] = {}

    if test == 'all':
        cols = df_orig.columns
    elif test == 'cont':
        cols = continuous_columns
    elif test == 'cat':
        cols = set(df_orig.columns) - set(continuous_columns)

    cols = set(cols) - set(cond_inputs)

    for s in stats_str:
        res[test][s] = {}

    for m in first_lvl_stats.keys():
        for s in stats_str:
            res[test][s][m] = []

            for c in cols:
                res[test][s][m].append(first_lvl_stats[m][c][s])

In [8]:
avg = {}

for test in ['all', 'cont', 'cat']:

    avg[test] = {}

    for s in stats_str:
        avg[test][s] = {}

        for m in first_lvl_stats.keys():
            avg[test][s][m] = {
                'mean': np.mean(res[test][s][m]),
                'std': np.std(res[test][s][m])
            }

In [9]:
sts = 'srmse'

res_DATGAN = []
res_ciDATGAN = []

for i in range(5):
    for j in range(5):
        res_DATGAN.append(avg['all'][sts]['DATGAN_{}_{}'.format(i+1, j+1)]['mean'])
        res_ciDATGAN.append(avg['all'][sts]['ciDATGAN_{}_{}'.format(i+1, j+1)]['mean'])

tmp = [res_DATGAN, res_ciDATGAN]
df = pd.DataFrame(tmp, index=['DATGAN', 'ciDATGAN'])

plt.figure(figsize=(10,7))

sns.boxplot(data=df.T, palette=colors,showmeans=True,
            meanprops={"marker":"o",
                       "markerfacecolor":"white",
                       "markeredgecolor":"black",
                      "markersize":"10"})

plt.xticks([0,1], ['DATGAN', 'ciDATGAN'])
plt.ylabel(sts.upper())

plt.savefig('../../figures/obs/lvl1.png', bbox_inches='tight')
plt.savefig('../../figures/obs/lvl1.pdf', bbox_inches='tight')
plt.close()

# Second level

In [13]:
pickle_name = 'res_lvl2.pickle'
aggregation_level = 2

second_lvl_stats = {}

try:
    second_lvl_stats = pickle.load(open(results_path + pickle_name, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')

Found previous pickel file, using that


In [14]:
for i, f in enumerate(synth_files):

    file_name = f.split('/')[-1].split('.')[0]

    if file_name in second_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        second_lvl_stats[file_name] = {}

        df_synth = pd.read_csv(f, index_col=False)
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level, ignore_cols=cond_inputs)

        second_lvl_stats[file_name] = stats

    pickle.dump(second_lvl_stats, open(results_path + pickle_name, 'wb'))

print("\033[1mFINISHED!\033[0m")

Results for file [1mciDATGAN_1_1[0m (1/50) already exists!
Results for file [1mDATGAN_1_1[0m (2/50) already exists!
Results for file [1mciDATGAN_1_2[0m (3/50) already exists!
Results for file [1mDATGAN_1_2[0m (4/50) already exists!
Results for file [1mciDATGAN_1_3[0m (5/50) already exists!
Results for file [1mDATGAN_1_3[0m (6/50) already exists!
Results for file [1mciDATGAN_1_4[0m (7/50) already exists!
Results for file [1mDATGAN_1_4[0m (8/50) already exists!
Results for file [1mciDATGAN_1_5[0m (9/50) already exists!
Results for file [1mDATGAN_1_5[0m (10/50) already exists!
Results for file [1mciDATGAN_2_1[0m (11/50) already exists!
Results for file [1mDATGAN_2_1[0m (12/50) already exists!
Results for file [1mciDATGAN_2_2[0m (13/50) already exists!
Results for file [1mDATGAN_2_2[0m (14/50) already exists!
Results for file [1mciDATGAN_2_3[0m (15/50) already exists!
Results for file [1mDATGAN_2_3[0m (16/50) already exists!
Results for file [1mciDATGAN_2_4

In [15]:
res = {}

for s in stats_str:
    res[s] = {}

for m in second_lvl_stats.keys():

    for s in stats_str:
        res[s][m] = []

        for c in second_lvl_stats[m].keys():
            res[s][m].append(second_lvl_stats[m][c][s])

In [16]:
avg = {}

for s in stats_str:
    avg[s] = {}

    for m in second_lvl_stats.keys():
        avg[s][m] = {
            'mean': np.mean(res[s][m]),
            'std': np.std(res[s][m])
        }

In [17]:
sts = 'srmse'

res_DATGAN = []
res_ciDATGAN = []

for i in range(5):
    for j in range(5):
        res_DATGAN.append(avg[sts]['DATGAN_{}_{}'.format(i+1, j+1)]['mean'])
        res_ciDATGAN.append(avg[sts]['ciDATGAN_{}_{}'.format(i+1, j+1)]['mean'])

res_DATGAN = np.array(res_DATGAN).flatten()
res_ciDATGAN = np.array(res_ciDATGAN).flatten()

tmp = [res_DATGAN, res_ciDATGAN]
df = pd.DataFrame(tmp, index=['DATGAN', 'ciDATGAN'])

plt.figure(figsize=(10,7))

sns.boxplot(data=df.T, palette=colors,showmeans=True,
            meanprops={"marker":"o",
                       "markerfacecolor":"white",
                       "markeredgecolor":"black",
                      "markersize":"10"})

plt.xticks([0,1], ['DATGAN', 'ciDATGAN'])
plt.ylabel(sts.upper())

plt.savefig('../../figures/obs/lvl2.png'.format(s), bbox_inches='tight')
plt.savefig('../../figures/obs/lvl2.pdf'.format(s), bbox_inches='tight')
plt.close()

In [18]:
np.array(tmp).T

array([[0.23917134, 0.59076536],
       [0.23315717, 0.57176915],
       [0.24441854, 0.5777694 ],
       [0.2387281 , 0.56953828],
       [0.2421738 , 0.58401001],
       [0.3327894 , 0.21820049],
       [0.32634526, 0.21978411],
       [0.33711431, 0.21657678],
       [0.33841542, 0.21036147],
       [0.34479171, 0.21609694],
       [0.3078515 , 0.29974097],
       [0.3094804 , 0.29360482],
       [0.30333814, 0.29405283],
       [0.30730208, 0.28902295],
       [0.31019236, 0.29404801],
       [0.71911791, 0.21366172],
       [0.71572995, 0.21525457],
       [0.7140543 , 0.22208573],
       [0.7177552 , 0.22477665],
       [0.7151661 , 0.21503084],
       [0.3211193 , 0.1592682 ],
       [0.32236086, 0.16411326],
       [0.32839416, 0.17047965],
       [0.32446947, 0.16313566],
       [0.32404635, 0.16843899]])

# Machine Learning efficacy

In [19]:
def check_low_appearing_vars(df):

    for c in df.columns:
        val = df[c].value_counts()
        if len(val) < 20:
            val = val/len(df)
            if any(val < 0.01) and c != 'choice':
                print('Variable {}: '.format(c))
                for idx, v in zip(val.index, val):
                    if v < 0.01:
                        print('  {} - {:.2f}% ({:d})'.format(idx, 100*v, int(v*len(df))))
                print()

def replace_low_appearing_values(df):

    dct_ = {}
    for i in df['hh_vehicles'].unique():
        if i >= 3:
            dct_[i] = '3+'
        else:
            dct_[i] = str(i)
    df['hh_vehicles'].replace(dct_, inplace=True)

    dct_ = {}
    for i in df['hh_people'].unique():
        if i >= 6:
            dct_[i] = '6+'
        else:
            dct_[i] = str(i)
    df['hh_people'].replace(dct_, inplace=True)

In [20]:
check_low_appearing_vars(df_orig)

Variable hh_vehicles: 
  4 - 0.52% (88)
  5 - 0.12% (21)
  6 - 0.02% (3)
  8 - 0.02% (3)
  7 - 0.01% (2)

Variable hh_people: 
  7 - 0.70% (118)
  8 - 0.18% (31)
  9 - 0.08% (13)
  10 - 0.03% (5)
  11 - 0.02% (3)



In [21]:
replace_low_appearing_values(df_orig)

In [22]:
check_low_appearing_vars(df_orig)

In [23]:
categorical_columns = list(set(df_orig.columns) - set(continuous_columns))

In [24]:
enc = OrdinalEncoder()
df_orig[categorical_columns] = enc.fit_transform(df_orig[categorical_columns])

In [25]:
pickle_name = 'ml.pickle'

cv_modelscores = {}

try:
    cv_modelscores = pickle.load(open(results_path + pickle_name, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')

Found previous pickel file, using that


In [26]:
for i, f in enumerate(synth_files):

    file_name = f.split('/')[-1].split('.')[0]

    if file_name in cv_modelscores:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        cv_modelscores[file_name] = {}

        # Load the synthetic dataset
        df_synth = pd.read_csv(f, index_col=False)
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        # Replace the values rarely appearing
        replace_low_appearing_values(df_synth)

        # Encode the synthetic dataset
        df_synth[categorical_columns] = enc.transform(df_synth[categorical_columns])

        res = ml_assessment(df_orig, df_synth, continuous_columns, categorical_columns, ignore_cols=cond_inputs)

        cv_modelscores[file_name] = res

    pickle.dump(cv_modelscores, open(results_path + pickle_name, 'wb'))

print("\033[1mFINISHED!\033[0m")

Results for file [1mciDATGAN_1_1[0m (1/50) already exists!
Results for file [1mDATGAN_1_1[0m (2/50) already exists!
Results for file [1mciDATGAN_1_2[0m (3/50) already exists!
Results for file [1mDATGAN_1_2[0m (4/50) already exists!
Results for file [1mciDATGAN_1_3[0m (5/50) already exists!
Results for file [1mDATGAN_1_3[0m (6/50) already exists!
Results for file [1mciDATGAN_1_4[0m (7/50) already exists!
Results for file [1mDATGAN_1_4[0m (8/50) already exists!
Results for file [1mciDATGAN_1_5[0m (9/50) already exists!
Results for file [1mDATGAN_1_5[0m (10/50) already exists!
Results for file [1mciDATGAN_2_1[0m (11/50) already exists!
Results for file [1mDATGAN_2_1[0m (12/50) already exists!
Results for file [1mciDATGAN_2_2[0m (13/50) already exists!
Results for file [1mDATGAN_2_2[0m (14/50) already exists!
Results for file [1mciDATGAN_2_3[0m (15/50) already exists!
Results for file [1mDATGAN_2_3[0m (16/50) already exists!
Results for file [1mciDATGAN_2_4

In [27]:
if 'original' in cv_modelscores:
    print("Results for file \033[1m{}\033[0m already exists!".format('original'))
else:
    print("Preparing stats for file \033[1m{}\033[0m".format('original'))

    res = ml_assessment(df_orig, df_orig, continuous_columns, categorical_columns, ignore_cols=cond_inputs)
    cv_modelscores['original'] = res
    pickle.dump(cv_modelscores, open(results_path + pickle_name, 'wb'))
    print("\033[1mFINISHED!\033[0m")

Results for file [1moriginal[0m already exists!


In [28]:
res_ml = {
    'DATGAN': {'continuous': [], 'categorical': []},
    'ciDATGAN': {'continuous': [], 'categorical': []}
}

continuous_columns = list(set(continuous_columns) - set(cond_inputs))
categorical_columns = list(set(categorical_columns) - set(cond_inputs))

ori_scores = {col: cv_modelscores['original'][col]['test_log_loss'] for col in categorical_columns}
ori_scores.update({col: cv_modelscores['original'][col]['test_l2'] for col in continuous_columns})

res = {}

for c in ['continuous', 'categorical']:
    res[c] = {}

for m in cv_modelscores.keys():

    res['continuous'][m] = []
    for col in continuous_columns:
        res['continuous'][m].append(cv_modelscores[m][col]['original_l2']/ori_scores[col])

    res['categorical'][m] = []
    for col in categorical_columns:
        res['categorical'][m].append(cv_modelscores[m][col]['original_log_loss'] - ori_scores[col])

In [29]:
avg = {}

for c in ['continuous', 'categorical']:
    avg[c] = {}
    for f in res[c].keys():
        avg[c][f] = {
            'mean': np.mean(res[c][f]),
            'std': np.std(res[c][f])
        }

In [36]:
for c in ['continuous', 'categorical']:

    res_DATGAN = []
    res_ciDATGAN = []

    for i in range(5):
        for j in range(5):
            res_DATGAN.append(avg[c]['DATGAN_{}_{}'.format(i+1, j+1)]['mean'])
            res_ciDATGAN.append(avg[c]['ciDATGAN_{}_{}'.format(i+1, j+1)]['mean'])

    res_DATGAN = np.array(res_DATGAN).flatten()
    res_ciDATGAN = np.array(res_ciDATGAN).flatten()

    tmp = [res_DATGAN, res_ciDATGAN]
    if c == 'continuous':
        asd
    df = pd.DataFrame(tmp, index=['DATGAN', 'ciDATGAN'])

    plt.figure(figsize=(10,7))

    sns.boxplot(data=df.T, palette=colors,showmeans=True,
                meanprops={"marker":"o",
                           "markerfacecolor":"white",
                           "markeredgecolor":"black",
                          "markersize":"10"})

    plt.xticks([0,1], ['DATGAN', 'ciDATGAN'])
    if c == 'continuous':
        plt.ylabel('Relative L2 error')
    else:
        plt.ylabel('Relative log loss')

    plt.savefig('../../figures/obs/ml_{}.png'.format(c[:3]), bbox_inches='tight')
    plt.savefig('../../figures/obs/ml_{}.pdf'.format(c[:3]), bbox_inches='tight')
    plt.close()

NameError: name 'asd' is not defined

In [37]:
np.array(tmp).T

array([[ 3.52615053, 31.92131158],
       [ 3.43620027, 31.89304331],
       [ 3.40609433, 30.76174734],
       [ 3.4355164 , 29.57631637],
       [ 3.4686816 , 29.84276936],
       [ 1.94883469,  2.098249  ],
       [ 1.93052831,  2.14000911],
       [ 2.00456957,  2.21374015],
       [ 2.04202917,  2.13649921],
       [ 1.97597557,  2.16419369],
       [ 1.94094653,  2.0083097 ],
       [ 1.89879212,  2.09196255],
       [ 1.88891674,  2.05077059],
       [ 1.8994391 ,  2.0911643 ],
       [ 1.94516559,  1.94542904],
       [59.07286594,  3.03472235],
       [67.55554776,  2.89163275],
       [66.29067453,  2.86322168],
       [58.22084244,  2.90305487],
       [70.60128724,  3.05808533],
       [ 4.01421206,  1.83924008],
       [ 3.57948425,  1.70365025],
       [ 3.91149708,  1.72463   ],
       [ 3.83067509,  1.7092307 ],
       [ 3.65697663,  1.73625814]])