In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import datgan

from sklearn.preprocessing import OrdinalEncoder

from datgan import stats_assessment
from datgan import ml_assessment, transform_results

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [8]:
# Original trips table; every synthetic dataset loaded below is compared against it.
df_orig = pd.read_csv('../../data/LPMC/trips.csv', index_col=False)

# Columns handled as continuous by the assessments; all other columns are
# handled as categorical later in the notebook.
continuous_columns = [
    'start_time_linear', 'age', 'distance', 'dur_walking',
    'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus',
    'dur_pt_int', 'dur_driving', 'cost_transit',
    'cost_driving_fuel', 'driving_traffic_percent',
]

# Columns passed as `ignore_cols` to every assessment call.
cond_inputs = ["age", "female", "hh_borough"]

# One CSV per (model variant, run): runs 01..05, four variants per run,
# listed run by run in the order DATGAN, DATGAN2, ciDATGAN, ciDATGAN2.
synth_files = [
    template.format(run)
    for run in range(1, 6)
    for template in (
        '../../data/synthetic/test/DATGAN_{:02d}.csv',
        '../../data/synthetic/test/DATGAN2_{:02d}.csv',
        '../../data/synthetic/test/ciDATGAN_{:02d}.csv',
        '../../data/synthetic/test/ciDATGAN2_{:02d}.csv',
    )
]


# Row count of the original data; each synthetic file is subsampled to this many rows.
len_df = len(df_orig)

In [9]:
# Directory that holds the pickled checkpoints written by the evaluation loops.
results_path = './results/'

# exist_ok=True makes the call safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(results_path, exist_ok=True)

In [10]:
# Keys looked up in each per-column result when the stats are aggregated below.
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']

# First level

In [11]:
pickle_name = 'stats_first_level.pickle'
aggregation_level = 1

first_lvl_stats = {}

# Resume from the checkpoint written by the evaluation loop, if there is one.
# Only a missing file means "start fresh": any other failure (unreadable or
# damaged checkpoint, undefined name, interrupt) is allowed to surface instead
# of being hidden and the checkpoint silently overwritten afterwards.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced.
try:
    with open(results_path + pickle_name, 'rb') as fh:
        first_lvl_stats = pickle.load(fh)
    print('Found previous pickle file, using that')
except FileNotFoundError:
    print('No previous results found, starting fresh')

Found previous pickle file, using that


In [12]:
# Compute the first-level statistics for every synthetic file that is not yet
# in the checkpoint, saving the checkpoint after each file.
for i, f in enumerate(synth_files):

    # '.../DATGAN_01.csv' -> 'DATGAN_01'; used as the key in the results dict
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in first_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        df_synth = pd.read_csv(f, index_col=False)
        # NOTE(review): the subsample is unseeded, so recomputed stats differ
        # slightly between runs; `sample` raises if the file has fewer than
        # len_df rows.
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level, ignore_cols=cond_inputs)

        # The entry is stored only once the computation has succeeded. An empty
        # placeholder written beforehand would make an interrupted file look
        # "already done" on the next run of this cell.
        first_lvl_stats[file_name] = stats

    # Checkpoint after every file; the context manager closes (and flushes) the handle.
    with open(results_path + pickle_name, 'wb') as fh:
        pickle.dump(first_lvl_stats, fh)

print("\033[1mFINISHED!\033[0m")

Results for file [1mDATGAN_01[0m (1/20) already exists!
Results for file [1mDATGAN2_01[0m (2/20) already exists!
Preparing stats for file [1mciDATGAN_01[0m (3/20)
Preparing stats for file [1mciDATGAN2_01[0m (4/20)
Preparing stats for file [1mDATGAN_02[0m (5/20)
Preparing stats for file [1mDATGAN2_02[0m (6/20)
Preparing stats for file [1mciDATGAN_02[0m (7/20)
Preparing stats for file [1mciDATGAN2_02[0m (8/20)
Preparing stats for file [1mDATGAN_03[0m (9/20)
Preparing stats for file [1mDATGAN2_03[0m (10/20)
Preparing stats for file [1mciDATGAN_03[0m (11/20)
Preparing stats for file [1mciDATGAN2_03[0m (12/20)
Preparing stats for file [1mDATGAN_04[0m (13/20)
Preparing stats for file [1mDATGAN2_04[0m (14/20)
Preparing stats for file [1mciDATGAN_04[0m (15/20)
Preparing stats for file [1mciDATGAN2_04[0m (16/20)
Preparing stats for file [1mDATGAN_05[0m (17/20)
Preparing stats for file [1mDATGAN2_05[0m (18/20)
Preparing stats for file [1mciDATGAN_05[0m (19/2

In [13]:
# res[test][stat][model] -> list with one value per evaluated column, where
# `test` selects the column group ('all', 'cont' or 'cat').
res = {}

excluded = set(cond_inputs)  # these columns are never part of a group
column_groups = {
    'all': set(df_orig.columns),
    'cont': set(continuous_columns),
    'cat': set(df_orig.columns) - set(continuous_columns),
}

for test, group in column_groups.items():
    cols = group - excluded

    res[test] = {
        s: {
            m: [first_lvl_stats[m][c][s] for c in cols]
            for m in first_lvl_stats
        }
        for s in stats_str
    }

In [14]:
# avg[test][stat][model] -> mean and standard deviation, over the columns of
# the group, of the per-column values collected in `res`.
avg = {
    test: {
        s: {
            m: {
                'mean': np.mean(res[test][s][m]),
                'std': np.std(res[test][s][m]),
            }
            for m in first_lvl_stats
        }
        for s in stats_str
    }
    for test in ['all', 'cont', 'cat']
}

In [15]:
# Human-readable label of each column group, in the order they are printed.
scope_labels = {
    'all': 'on all columns',
    'cont': 'on continuous columns',
    'cat': 'on categorical columns',
}

for test, str_ in scope_labels.items():

    # Only SRMSE is ranked here; widen the list to rank on other statistics.
    for s in ['srmse']:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        # Ascending by mean; r2 and corr are listed from the largest mean down.
        ranked = sorted(avg[test][s].items(), key=lambda item: item[1]['mean'])
        if s in ['r2', 'corr']:
            ranked = ranked[::-1]

        for position, (name, summary) in enumerate(ranked, start=1):
            print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(position, name, summary['mean'], summary['std']))
        print()

Ranking on all columns based on SRMSE:
   1. ciDATGAN2_02    - 6.36e-02 ± 6.59e-02
   2. ciDATGAN2_01    - 6.69e-02 ± 6.18e-02
   3. ciDATGAN_03     - 6.77e-02 ± 5.76e-02
   4. ciDATGAN_02     - 7.00e-02 ± 6.02e-02
   5. ciDATGAN_01     - 7.02e-02 ± 6.02e-02
   6. ciDATGAN_04     - 7.12e-02 ± 5.87e-02
   7. ciDATGAN2_05    - 7.13e-02 ± 6.69e-02
   8. ciDATGAN2_03    - 7.23e-02 ± 6.56e-02
   9. ciDATGAN_05     - 7.26e-02 ± 5.62e-02
  10. ciDATGAN2_04    - 7.29e-02 ± 6.79e-02
  11. DATGAN_02       - 9.39e-02 ± 1.04e-01
  12. DATGAN_03       - 9.45e-02 ± 1.01e-01
  13. DATGAN_01       - 9.66e-02 ± 1.04e-01
  14. DATGAN_05       - 9.68e-02 ± 1.04e-01
  15. DATGAN_04       - 1.00e-01 ± 1.07e-01
  16. DATGAN2_05      - 1.07e-01 ± 6.79e-02
  17. DATGAN2_04      - 1.08e-01 ± 7.31e-02
  18. DATGAN2_02      - 1.08e-01 ± 7.08e-02
  19. DATGAN2_03      - 1.10e-01 ± 6.76e-02
  20. DATGAN2_01      - 1.13e-01 ± 7.37e-02

Ranking on continuous columns based on SRMSE:
   1. ciDATGAN2_02    - 1.02e-01 ±

# Second level

In [16]:
pickle_name = 'stats_second_level.pickle'
aggregation_level = 2

second_lvl_stats = {}

# Resume from the checkpoint written by the evaluation loop, if there is one.
# Only a missing file means "start fresh": any other failure (unreadable or
# damaged checkpoint, undefined name, interrupt) is allowed to surface instead
# of being hidden and the checkpoint silently overwritten afterwards.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced.
try:
    with open(results_path + pickle_name, 'rb') as fh:
        second_lvl_stats = pickle.load(fh)
    print('Found previous pickel file, using that')
except FileNotFoundError:
    print('No previous results found, starting fresh')

No previous results found, starting fresh


In [17]:
# Compute the second-level statistics for every synthetic file that is not yet
# in the checkpoint, saving the checkpoint after each file.
for i, f in enumerate(synth_files):

    # '.../DATGAN_01.csv' -> 'DATGAN_01'; used as the key in the results dict
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in second_lvl_stats:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        df_synth = pd.read_csv(f, index_col=False)
        # NOTE(review): the subsample is unseeded, so recomputed stats differ
        # slightly between runs; `sample` raises if the file has fewer than
        # len_df rows.
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        stats = stats_assessment(df_orig, df_synth, continuous_columns, aggregation_level, ignore_cols=cond_inputs)

        # The entry is stored only once the computation has succeeded. An empty
        # placeholder written beforehand would make an interrupted file look
        # "already done" on the next run of this cell.
        second_lvl_stats[file_name] = stats

    # Checkpoint after every file; the context manager closes (and flushes) the handle.
    with open(results_path + pickle_name, 'wb') as fh:
        pickle.dump(second_lvl_stats, fh)

print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1mDATGAN_01[0m (1/20)
Preparing stats for file [1mDATGAN2_01[0m (2/20)
Preparing stats for file [1mciDATGAN_01[0m (3/20)
Preparing stats for file [1mciDATGAN2_01[0m (4/20)
Preparing stats for file [1mDATGAN_02[0m (5/20)
Preparing stats for file [1mDATGAN2_02[0m (6/20)
Preparing stats for file [1mciDATGAN_02[0m (7/20)
Preparing stats for file [1mciDATGAN2_02[0m (8/20)
Preparing stats for file [1mDATGAN_03[0m (9/20)
Preparing stats for file [1mDATGAN2_03[0m (10/20)
Preparing stats for file [1mciDATGAN_03[0m (11/20)
Preparing stats for file [1mciDATGAN2_03[0m (12/20)
Preparing stats for file [1mDATGAN_04[0m (13/20)
Preparing stats for file [1mDATGAN2_04[0m (14/20)
Preparing stats for file [1mciDATGAN_04[0m (15/20)
Preparing stats for file [1mciDATGAN2_04[0m (16/20)
Preparing stats for file [1mDATGAN_05[0m (17/20)
Preparing stats for file [1mDATGAN2_05[0m (18/20)
Preparing stats for file [1mciDATGAN_05[0m (19/20)
Preparing sta

In [18]:
# res[stat][model] -> list with one value per entry of that model's
# second-level results.
res = {
    s: {
        m: [entry[s] for entry in per_entry.values()]
        for m, per_entry in second_lvl_stats.items()
    }
    for s in stats_str
}

In [19]:
# avg[stat][model] -> mean and standard deviation of the second-level values
# collected in `res`.
avg = {}

for s in stats_str:
    avg[s] = {}

    # Iterate over the second-level models: `res` was built from
    # second_lvl_stats, so using the first-level keys here would raise a
    # KeyError (or silently drop models) whenever the two dicts differ.
    for m in second_lvl_stats.keys():
        avg[s][m] = {
            'mean': np.mean(res[s][m]),
            'std': np.std(res[s][m])
        }

In [20]:
# Only SRMSE is ranked here; widen the list to rank on other statistics.
for s in ['srmse']:
    print('Ranking based on {} for aggregation level {}:'.format(s.upper(), aggregation_level))

    # Ascending by mean; r2 and corr are listed from the largest mean down.
    ranked = sorted(avg[s].items(), key=lambda item: item[1]['mean'])
    if s in ['r2', 'corr']:
        ranked = ranked[::-1]

    for position, (name, summary) in enumerate(ranked, start=1):
        print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(position, name, summary['mean'], summary['std']))
    print()

Ranking based on SRMSE for aggregation level 2:
   1. ciDATGAN2_02    - 2.06e-01 ± 1.33e-01
   2. ciDATGAN2_01    - 2.10e-01 ± 1.30e-01
   3. ciDATGAN2_03    - 2.21e-01 ± 1.34e-01
   4. ciDATGAN2_05    - 2.23e-01 ± 1.37e-01
   5. ciDATGAN2_04    - 2.26e-01 ± 1.39e-01
   6. ciDATGAN_03     - 2.26e-01 ± 1.34e-01
   7. ciDATGAN_01     - 2.29e-01 ± 1.31e-01
   8. ciDATGAN_04     - 2.30e-01 ± 1.32e-01
   9. ciDATGAN_05     - 2.31e-01 ± 1.28e-01
  10. ciDATGAN_02     - 2.33e-01 ± 1.38e-01
  11. DATGAN2_05      - 2.75e-01 ± 1.27e-01
  12. DATGAN2_04      - 2.76e-01 ± 1.29e-01
  13. DATGAN2_02      - 2.78e-01 ± 1.30e-01
  14. DATGAN2_03      - 2.80e-01 ± 1.28e-01
  15. DATGAN2_01      - 2.86e-01 ± 1.36e-01
  16. DATGAN_03       - 3.03e-01 ± 2.10e-01
  17. DATGAN_02       - 3.05e-01 ± 2.14e-01
  18. DATGAN_01       - 3.08e-01 ± 2.14e-01
  19. DATGAN_05       - 3.08e-01 ± 2.14e-01
  20. DATGAN_04       - 3.14e-01 ± 2.18e-01



# ML efficacy

In [21]:
def check_low_appearing_vars(df, max_levels=20, min_share=0.01, skip_cols=('choice',)):
    """Print the rarely occurring values of every low-cardinality column.

    A column is inspected when it has fewer than `max_levels` distinct values
    and is not listed in `skip_cols`. For each inspected column with at least
    one value whose share of the rows is below `min_share`, the column name is
    printed followed by one line per rare value: value, share in percent and
    number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.
    max_levels : int, default 20
        Columns with this many distinct values or more are skipped.
    min_share : float, default 0.01
        A value is reported when its share of the rows is strictly below this.
    skip_cols : collection of column names, default ('choice',)
        Columns that are never reported.

    Returns
    -------
    None
        The report is written to standard output.
    """
    n_rows = len(df)

    for c in df.columns:
        if c in skip_cols:
            continue

        counts = df[c].value_counts()
        if len(counts) >= max_levels:
            continue

        shares = counts / n_rows
        if not (shares < min_share).any():
            continue

        print('Variable {}: '.format(c))
        for idx, share, count in zip(counts.index, shares, counts):
            if share < min_share:
                # Print the exact count: rebuilding it as int(share * n_rows)
                # truncates a float and can come out one too low.
                print('  {} - {:.2f}% ({:d})'.format(idx, 100*share, int(count)))
        print()

def _cap_count_labels(series, cap):
    """Return `series` with every value turned into a string label, pooling values >= `cap` as '<cap>+'."""
    mapping = {}
    for value in series.unique():
        if isinstance(value, str):
            # Already a label (the column was processed before): leave it alone
            # instead of failing on a str/int comparison.
            continue
        mapping[value] = '{}+'.format(cap) if value >= cap else str(value)

    return series.replace(mapping) if mapping else series


def replace_low_appearing_values(df):
    """Pool the rarely occurring values of four columns of `df`, in place.

    - 'pt_n_interchanges': values >= 2 become '2+'
    - 'fueltype': 'Diesel_LGV' and 'Petrol_LGV' become 'LGV',
      'Hybrid_Car' becomes 'Average_Car'
    - 'hh_vehicles': values >= 3 become '3+'
    - 'hh_people': values >= 6 become '6+'

    The remaining values of the three count columns are converted to their
    string form. Each column is assigned back to `df` rather than modified
    through `df[col].replace(..., inplace=True)`, which acts on a temporary
    object and leaves `df` unchanged when pandas Copy-on-Write is active.
    Calling the function again on an already processed frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the four columns above. It is modified in place.

    Returns
    -------
    None
    """
    df['pt_n_interchanges'] = _cap_count_labels(df['pt_n_interchanges'], 2)

    df['fueltype'] = df['fueltype'].replace({
        'Diesel_LGV': 'LGV',
        'Petrol_LGV': 'LGV',
        'Hybrid_Car': 'Average_Car'
    })

    df['hh_vehicles'] = _cap_count_labels(df['hh_vehicles'], 3)

    df['hh_people'] = _cap_count_labels(df['hh_people'], 6)

In [22]:
# Report the rare values of the original data as loaded.
check_low_appearing_vars(df_orig)

Variable fueltype: 
  Hybrid_Car - 0.51% (87)
  Petrol_LGV - 0.18% (31)

Variable pt_n_interchanges: 
  3 - 0.77% (131)
  4 - 0.04% (6)

Variable hh_vehicles: 
  4 - 0.52% (88)
  5 - 0.12% (21)
  8 - 0.02% (3)
  6 - 0.02% (3)
  7 - 0.01% (2)

Variable hh_people: 
  7 - 0.70% (118)
  8 - 0.18% (31)
  9 - 0.08% (13)
  10 - 0.03% (5)
  11 - 0.02% (3)



In [23]:
# Pool the rare values of df_orig; the function changes its argument and returns nothing.
replace_low_appearing_values(df_orig)

In [24]:
# Re-run the report on the pooled data; in the recorded run it printed nothing.
check_low_appearing_vars(df_orig)

In [25]:
# Every column that is not continuous is treated as categorical.
# The list follows the DataFrame's column order: a set difference of strings
# iterates in hash order, which changes from one kernel session to the next,
# so the order handed to the encoder and to the assessment was not reproducible.
categorical_columns = [c for c in df_orig.columns if c not in continuous_columns]

In [26]:
# The encoder is fitted on the original data only; the same fitted `enc` is
# reused further down to transform each synthetic dataset.
enc = OrdinalEncoder()
# Overwrites the categorical columns of df_orig with the encoder's output.
df_orig[categorical_columns] = enc.fit_transform(df_orig[categorical_columns])

In [27]:
pickle_name = 'ml_efficacy.pickle'

cv_modelscores = {}

# Resume from the checkpoint written by the evaluation loop, if there is one.
# Only a missing file means "start fresh": any other failure (unreadable or
# damaged checkpoint, undefined name, interrupt) is allowed to surface instead
# of being hidden and the checkpoint silently overwritten afterwards.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced.
try:
    with open(results_path + pickle_name, 'rb') as fh:
        cv_modelscores = pickle.load(fh)
    print('Found previous pickel file, using that')
except FileNotFoundError:
    print('No previous results found, starting fresh')

No previous results found, starting fresh


In [28]:
# Run the ML assessment for every synthetic file that is not yet in the
# checkpoint, saving the checkpoint after each file.
for i, f in enumerate(synth_files):

    # '.../DATGAN_01.csv' -> 'DATGAN_01'; used as the key in the results dict
    file_name = f.split('/')[-1].split('.')[0]

    if file_name in cv_modelscores:
        print("Results for file \033[1m{}\033[0m ({}/{}) already exists!".format(file_name, i+1, len(synth_files)))
    else:
        print("Preparing stats for file \033[1m{}\033[0m ({}/{})".format(file_name, i+1, len(synth_files)))

        # Load the synthetic dataset
        df_synth = pd.read_csv(f, index_col=False)
        # NOTE(review): the subsample is unseeded, so recomputed scores differ
        # slightly between runs; `sample` raises if the file has fewer than
        # len_df rows.
        df_synth = df_synth.sample(len_df).reset_index(drop=True)

        # Replace the values rarely appearing
        replace_low_appearing_values(df_synth)

        # Encode the synthetic dataset with the encoder fitted on the original data
        df_synth[categorical_columns] = enc.transform(df_synth[categorical_columns])

        res = ml_assessment(df_orig, df_synth, continuous_columns, categorical_columns, ignore_cols=cond_inputs)

        # The entry is stored only once the computation has succeeded. An empty
        # placeholder written beforehand would make an interrupted file look
        # "already done" on the next run of this cell.
        cv_modelscores[file_name] = res

    # Checkpoint after every file; the context manager closes (and flushes) the handle.
    with open(results_path + pickle_name, 'wb') as fh:
        pickle.dump(cv_modelscores, fh)

print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1mDATGAN_01[0m (1/20)
Preparing stats for file [1mDATGAN2_01[0m (2/20)
Preparing stats for file [1mciDATGAN_01[0m (3/20)mn: travel_mode (25/27)
                                              Column: travel_month (20/27)Preparing stats for file [1mciDATGAN2_01[0m (4/20)
Preparing stats for file [1mDATGAN_02[0m (5/20)
Preparing stats for file [1mDATGAN2_02[0m (6/20)
Preparing stats for file [1mciDATGAN_02[0m (7/20)
                                          Preparing stats for file [1mciDATGAN2_02[0m (8/20)
Preparing stats for file [1mDATGAN_03[0m (9/20)
Preparing stats for file [1mDATGAN2_03[0m (10/20)
Preparing stats for file [1mciDATGAN_03[0m (11/20)
                                          Preparing stats for file [1mciDATGAN2_03[0m (12/20)
Preparing stats for file [1mDATGAN_04[0m (13/20)
Preparing stats for file [1mDATGAN2_04[0m (14/20)
Preparing stats for file [1mciDATGAN_04[0m (15/20)
Preparing stats for file [1mciDATGAN2_

In [29]:
# Baseline: run the same assessment with the original data passed as both
# arguments, stored under the key 'original'.
if 'original' in cv_modelscores:
    print("Results for file \033[1m{}\033[0m already exists!".format('original'))
else:
    print("Preparing stats for file \033[1m{}\033[0m".format('original'))

    res = ml_assessment(df_orig, df_orig, continuous_columns, categorical_columns, ignore_cols=cond_inputs)
    cv_modelscores['original'] = res

    # The context manager closes (and flushes) the checkpoint file; the bare
    # open() passed to pickle.dump was never closed explicitly.
    with open(results_path + pickle_name, 'wb') as fh:
        pickle.dump(cv_modelscores, fh)
    print("\033[1mFINISHED!\033[0m")

Preparing stats for file [1moriginal[0m
[1mFINISHED![0m                          


In [30]:
# Two sequences of (label, score) items, one for the continuous and one for the
# categorical columns; the table below reads item[0] and item[1] of each.
# NOTE(review): presumably sorted by score inside datgan — confirm there.
cont_sorted, cat_sorted = transform_results(cv_modelscores, continuous_columns, categorical_columns, ignore_cols=cond_inputs)

In [31]:
# Side-by-side ranking table: position, categorical entry, continuous entry.
print('   | {:<30} | {:<30}'.format('categorical', 'continuous'))
print('-----------------------------------------------------------')
for i, (a, b) in enumerate(zip(cat_sorted, cont_sorted), start=1):
    cat_cell = '{:<12}: {:.3f}'.format(a[0], a[1])
    cont_cell = '{:<12}: {:.3f}'.format(b[0], b[1])
    print('{:>2} | {:<30} | {:<30}'.format(i, cat_cell, cont_cell))

   | categorical                    | continuous                    
-----------------------------------------------------------
 1 | original    : -4.149           | original    : 8.462           
 2 | ciDATGAN2_03: 5.248            | ciDATGAN2_02: 22.955          
 3 | ciDATGAN2_02: 5.293            | ciDATGAN2_05: 23.143          
 4 | ciDATGAN2_04: 5.307            | ciDATGAN2_04: 23.316          
 5 | ciDATGAN2_01: 5.323            | ciDATGAN2_03: 23.369          
 6 | ciDATGAN2_05: 5.337            | ciDATGAN2_01: 23.372          
 7 | ciDATGAN_04 : 5.432            | ciDATGAN_05 : 26.525          
 8 | ciDATGAN_02 : 5.467            | ciDATGAN_04 : 27.023          
 9 | DATGAN2_02  : 5.469            | ciDATGAN_01 : 27.029          
10 | ciDATGAN_01 : 5.488            | ciDATGAN_02 : 27.143          
11 | DATGAN2_04  : 5.491            | ciDATGAN_03 : 27.917          
12 | DATGAN2_01  : 5.506            | DATGAN_02   : 41.189          
13 | ciDATGAN_03 : 5.509            | DATGA