In [1]:
import numpy as np
import pandas as pd
import random
from os import listdir
from os.path import isfile, join
import json
import pickle

from itertools import combinations

import matplotlib.pyplot as plt

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
# Dataset to analyse. Only the part of the name before the first underscore
# selects the data folder that data.csv is read from.
dataset = 'Chicago'
df_orig = pd.read_csv(''.join(['../../../data/', dataset.split('_')[0], '/data.csv']))

In [3]:
# Columns treated as continuous for the selected dataset. The 'cont' and 'cat'
# column subsets of the first-level statistics are derived from this list.
if 'Chicago' in dataset:
    continuous_cols = ['distance', 'age', 'departure_time']
elif 'LPMC' in dataset:
    continuous_cols = [
        'start_time_linear',
        'age',
        'distance',
        'dur_walking',
        'dur_cycling',
        'dur_pt_access',
        'dur_pt_rail',
        'dur_pt_bus',
        'dur_pt_int',
        'dur_driving',
        'cost_transit',
        'cost_driving_fuel',
        'driving_traffic_percent',
    ]

In [4]:
# Statistic keys, in the order in which they appear as table columns.
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']

# LaTeX header text for each statistic (raw strings keep the backslashes readable).
stats_tex = {
    'mae': r'\textbf{MAE}',
    'rmse': r'\textbf{RMSE}',
    'srmse': r'\textbf{SRMSE}',
    'r2': r'$\bf R^2$',
    'corr': r'$\bf \rho_{\text{Pearson} }$',
}

# Name under which each dataset is referred to in the table captions.
dataset_name = dict(Chicago='CMAP', LPMC='LPMC')

In [5]:
# The product n_models * n_data is the number of entries read, per model,
# column (or column combination) and statistic, from the stats pickles.
n_models, n_data = 5, 5

# Functions

In [6]:
def change_name(name):
    """Translate the internal two-letter codes in a model name to display codes.

    Every occurrence of a code is replaced, wherever it appears in the string,
    and the substitutions are applied one after another in this order:
    WO -> NO, OR -> OS, WI -> TS, OC -> CO, OD -> CA.

    Args:
        name: Model identifier, e.g. 'SGAN_WO_BO'.

    Returns:
        The identifier with all codes substituted, e.g. 'SGAN_NO_BO'.
    """
    return (
        name
        .replace('WO', 'NO')
        .replace('OR', 'OS')
        .replace('WI', 'TS')
        .replace('OC', 'CO')
        .replace('OD', 'CA')
    )

def to_TeX(num, val=1):
    """Format a number in scientific notation with a braced exponent.

    Args:
        num: Number to format.
        val: Number of digits kept after the decimal point of the mantissa.

    Returns:
        A string '<mantissa>e{<exponent>}', where the exponent is written as a
        plain integer (no '+' sign, no zero padding), e.g. '1.2e{3}'.
    """
    scientific = "{:.{digits}e}".format(num, digits=val)
    mantissa, exponent = scientific.split('e')
    # int() drops the '+' sign and the leading zeros of the exponent.
    return mantissa + "e{" + str(int(exponent)) + "}"


In [7]:
# Model identifiers in table row order: the first code varies slowest and the
# last code varies fastest, giving 3 * 3 * 4 = 36 names.
order = [
    '{}_{}_{}'.format(first, second, third)
    for first in ['SGAN', 'WGAN', 'WGGP']
    for second in ['WO', 'OR', 'WI']
    for third in ['BO', 'OC', 'OD', 'NO']
]

# Stats - first level

In [8]:
# Single-column statistics, indexed as stats[model][column][statistic][entry].
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('./{}/single_columns.pickle'.format(dataset), 'rb') as stats_file:
    stats = pickle.load(stats_file)

In [9]:
# Column subsets the first-level statistics are averaged over. The categorical
# subset is built as an ordered list (not a set difference) so the columns are
# always visited in the same order and the averages are reproducible run to run.
column_groups = {
    'all': list(df_orig.columns),
    'cont': list(continuous_cols),
    'cat': [c for c in df_orig.columns if c not in continuous_cols],
}

# res[test][statistic][model] is a list with one value per entry
# (n_models*n_data of them): the statistic averaged over the subset's columns.
res = {}

for test in ['all', 'cont', 'cat']:

    cols = column_groups[test]
    res[test] = {s: {} for s in stats_str}

    for m in order:
        for s in stats_str:
            res[test][s][m] = [
                np.mean([stats[m][c][s][i] for c in cols])
                for i in range(n_models*n_data)
            ]

In [10]:
# avg[test][statistic][model] -> {'mean', 'std', 'rank'} computed over the
# per-entry values stored in res.
avg = {}

for test in ['all', 'cont', 'cat']:

    avg[test] = {}

    for s in stats_str:
        avg[test][s] = {
            m: {
                'mean': np.mean(res[test][s][m]),
                'std': np.std(res[test][s][m])
            }
            for m in order
        }

        # Rank 1 goes to the smallest mean, except for r2 and corr where the
        # ascending order is reversed so that rank 1 goes to the largest mean.
        ranking = sorted(avg[test][s], key=lambda name: avg[test][s][name]['mean'])
        if s in ['r2', 'corr']:
            ranking.reverse()

        for position, name in enumerate(ranking, start=1):
            avg[test][s][name]['rank'] = position
        

In [11]:
# Wording used in the caption for each column subset.
test_str = {
    'cont': 'continuous',
    'cat': 'categorical',
    'all': 'all'
}

# Build one LaTeX table per column subset and write it to ./tables/.
for test in ['all', 'cont', 'cat']:
    table = ''
    table += '\\begin{table}[H]\n'
    table += '\t\\centering\n'

    # The display name is looked up by the base dataset name (text before the
    # first '_'), the same convention used to locate the data folder, so that
    # suffixed dataset names do not raise a KeyError.
    table += '\t\\caption{{Results for the statistics on the first level ({} columns) for the {} dataset}}\n'.format(test_str[test], dataset_name[dataset.split('_')[0]])
    table += '\t\\label{{tab:first_{}_{}}}\n'.format(test, dataset)
    table += '\t\\begin{{tabularx}}{{\\textwidth}}{{l|{}}}\n'.format('|cC'*len(stats_str)+'||c')

    # Header row: one two-column group (rank, value) per statistic; the last
    # group is closed by a double rule before the average-rank column.
    str_ = ''
    for i, s in enumerate(stats_str):
        if i == len(stats_str)-1:
            str_ += '& \\multicolumn{{2}}{{c||}}{{{}}} '.format(stats_tex[s])
        else:
            str_ += '& \\multicolumn{{2}}{{c|}}{{{}}} '.format(stats_tex[s])

    str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

    table += '\t\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

    for i, m in enumerate(order):
        # '\\_' (escaped backslash) emits the LaTeX-escaped underscore.
        table += '\t\t\\texttt{{{}}} & '.format(change_name(m).replace('_', '\\_'))
        tmp_rank = []
        for j, s in enumerate(stats_str):
            tmp = avg[test][s][m]
            tmp_rank.append(tmp['rank'])

            # The ten best ranks are typeset in bold.
            if tmp['rank'] <= 10:
                suff = '\\bf'
            else:
                suff = ''

            # Gray level of the cell: 0 for rank 1 up to 100 for the last rank.
            val = round(100*(tmp['rank']-1)/(len(order)-1))

            table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$'.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

            table += ' & '

        # Average of the model's ranks over all statistics.
        avg_rank = np.mean(tmp_rank)

        if avg_rank <= 10:
            suff = '\\bf'
        else:
            suff = ''

        val = round(100*(avg_rank-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

        # Horizontal rule after every 12 rows, except after the last row.
        if (i+1)%12 == 0 and (i+1) < len(order):
            table += ' \\hline'

        table += '\n'

    table += '\t\\end{tabularx}\n'
    table += '\\end{table}\n'

    with open('./tables/first_{}_{}.tex'.format(test, dataset), 'w') as outfile:
        outfile.write(table)

# Stats - second level

In [12]:
# Statistics of column pairs, indexed as stats[model]['colA::colB'][statistic][entry].
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('./{}/couple_combinations.pickle'.format(dataset), 'rb') as stats_file:
    stats = pickle.load(stats_file)

In [13]:
# Keys of all unordered column pairs, written 'colA::colB' in column order.
combs = [
    pair[0] + '::' + pair[1]
    for pair in combinations(df_orig.columns, 2)
]

In [14]:
# res[statistic][model] is a list with one value per entry (n_models*n_data of
# them): the statistic averaged over every column pair in combs.
res = {s: {} for s in stats_str}

for m in order:
    for s in stats_str:
        res[s][m] = [
            np.mean([stats[m][c][s][i] for c in combs])
            for i in range(n_models*n_data)
        ]

In [15]:
# avg[statistic][model] -> {'mean', 'std', 'rank'} computed over the per-entry
# values stored in res.
avg = {}

for s in stats_str:
    avg[s] = {
        m: {
            'mean': np.mean(res[s][m]),
            'std': np.std(res[s][m])
        }
        for m in order
    }

    # Rank 1 goes to the smallest mean, except for r2 and corr where the
    # ascending order is reversed so that rank 1 goes to the largest mean.
    ranking = sorted(avg[s], key=lambda name: avg[s][name]['mean'])
    if s in ['r2', 'corr']:
        ranking.reverse()

    for position, name in enumerate(ranking, start=1):
        avg[s][name]['rank'] = position
        

In [16]:
# Build the LaTeX table of the second-level statistics and write it to ./tables/.
table = ''
table += '\\begin{table}[H]\n'
table += '\t\\centering\n'
# The display name is looked up by the base dataset name (text before the first
# '_'), the same convention used to locate the data folder, so that suffixed
# dataset names do not raise a KeyError.
table += '\t\\caption{{Results for the statistics on the second level for the {} dataset}}\n'.format(dataset_name[dataset.split('_')[0]])
table += '\t\\label{{tab:second_{}}}\n'.format(dataset)

table += '\t\\begin{{tabularx}}{{\\textwidth}}{{l|{}}}\n'.format('|cC'*len(stats_str)+'||c')

# Header row: one two-column group (rank, value) per statistic; the last group
# is closed by a double rule before the average-rank column.
str_ = ''
for i, s in enumerate(stats_str):
    if i == len(stats_str)-1:
        str_ += '& \\multicolumn{{2}}{{c||}}{{{}}} '.format(stats_tex[s])
    else:
        str_ += '& \\multicolumn{{2}}{{c|}}{{{}}} '.format(stats_tex[s])


str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

table += '\t\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

for i, m in enumerate(order):
    # '\\_' (escaped backslash) emits the LaTeX-escaped underscore.
    table += '\t\t\\texttt{{{}}} & '.format(change_name(m).replace('_', '\\_'))
    tmp_rank = []
    for j, s in enumerate(stats_str):
        tmp = avg[s][m]
        tmp_rank.append(tmp['rank'])

        # The ten best ranks are typeset in bold.
        if tmp['rank'] <= 10:
            suff = '\\bf'
        else:
            suff = ''

        # Gray level of the cell: 0 for rank 1 up to 100 for the last rank.
        val = round(100*(tmp['rank']-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

    # Average of the model's ranks over all statistics.
    avg_rank = np.mean(tmp_rank)

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12 rows, except after the last row.
    if (i+1)%12 == 0 and (i+1) < len(order):
        table += ' \\hline'

    table += '\n'

table += '\t\\end{tabularx}\n'
table += '\\end{table}\n'

with open('./tables/second_{}.tex'.format(dataset), 'w') as outfile:
    outfile.write(table)

# Stats - third level

In [17]:
# Statistics of column triples, indexed as stats[model]['colA::colB::colC'][statistic][entry].
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('./{}/trouple_combinations.pickle'.format(dataset), 'rb') as stats_file:
    stats = pickle.load(stats_file)

In [18]:
# Keys of all unordered column triples, written 'colA::colB::colC' in column order.
combs = [
    triple[0] + '::' + triple[1] + '::' + triple[2]
    for triple in combinations(df_orig.columns, 3)
]

In [19]:
# res[statistic][model] is a list with one value per entry (n_models*n_data of
# them): the statistic averaged over every column triple in combs.
res = {s: {} for s in stats_str}

for m in order:
    for s in stats_str:
        res[s][m] = [
            np.mean([stats[m][c][s][i] for c in combs])
            for i in range(n_models*n_data)
        ]

In [20]:
# avg[statistic][model] -> {'mean', 'std', 'rank'} computed over the per-entry
# values stored in res.
avg = {}

for s in stats_str:
    avg[s] = {
        m: {
            'mean': np.mean(res[s][m]),
            'std': np.std(res[s][m])
        }
        for m in order
    }

    # Rank 1 goes to the smallest mean, except for r2 and corr where the
    # ascending order is reversed so that rank 1 goes to the largest mean.
    ranking = sorted(avg[s], key=lambda name: avg[s][name]['mean'])
    if s in ['r2', 'corr']:
        ranking.reverse()

    for position, name in enumerate(ranking, start=1):
        avg[s][name]['rank'] = position
        

In [21]:
# Build the LaTeX table of the third-level statistics and write it to ./tables/.
table = ''
table += '\\begin{table}[H]\n'
table += '\t\\centering\n'
# The display name is looked up by the base dataset name (text before the first
# '_'), the same convention used to locate the data folder, so that suffixed
# dataset names do not raise a KeyError.
table += '\t\\caption{{Results for the statistics on the third level for the {} dataset}}\n'.format(dataset_name[dataset.split('_')[0]])
table += '\t\\label{{tab:third_{}}}\n'.format(dataset)
table += '\t\\begin{{tabularx}}{{\\textwidth}}{{l|{}}}\n'.format('|cC'*len(stats_str)+'||c')

# Header row: one two-column group (rank, value) per statistic; the last group
# is closed by a double rule before the average-rank column.
str_ = ''
for i, s in enumerate(stats_str):
    if i == len(stats_str)-1:
        str_ += '& \\multicolumn{{2}}{{c||}}{{{}}} '.format(stats_tex[s])
    else:
        str_ += '& \\multicolumn{{2}}{{c|}}{{{}}} '.format(stats_tex[s])


str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

table += '\t\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

for i, m in enumerate(order):
    # '\\_' (escaped backslash) emits the LaTeX-escaped underscore.
    table += '\t\t\\texttt{{{}}} & '.format(change_name(m).replace('_', '\\_'))
    tmp_rank = []
    for j, s in enumerate(stats_str):
        tmp = avg[s][m]
        tmp_rank.append(tmp['rank'])

        # The ten best ranks are typeset in bold.
        if tmp['rank'] <= 10:
            suff = '\\bf'
        else:
            suff = ''

        # Gray level of the cell: 0 for rank 1 up to 100 for the last rank.
        val = round(100*(tmp['rank']-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

    # Average of the model's ranks over all statistics.
    avg_rank = np.mean(tmp_rank)

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12 rows, except after the last row.
    if (i+1)%12 == 0 and (i+1) < len(order):
        table += ' \\hline'

    table += '\n'

table += '\t\\end{tabularx}\n'
table += '\\end{table}\n'


with open('./tables/third_{}.tex'.format(dataset), 'w') as outfile:
    outfile.write(table)

# ML efficacy

In [22]:
# Cross-validation scores, indexed as cv_modelscores[model][test_index][column][metric];
# the key 'original' holds the scores obtained with the original data.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('./{}/cv_result_ml.pickle'.format(dataset), 'rb') as scores_file:
    cv_modelscores = pickle.load(scores_file)

In [23]:
# Continuous columns: reuse the list defined at the top of the notebook instead
# of keeping a second copy that could drift out of sync with it.
cont_cols = list(continuous_cols)

# Columns scored together with the categorical ones but listed separately
# as ordinal.
if 'Chicago' in dataset:
    ord_cols = ['hh_vehicles', 'hh_size', 'hh_bikes', 'hh_income', 'education_level']
elif 'LPMC' in dataset:
    ord_cols = ['travel_year', 'travel_month', 'travel_date',
                'day_of_week', 'pt_n_interchanges', 'car_ownership']
else:
    # Fail with a clear message rather than with a NameError further down.
    raise ValueError('Unknown dataset: {!r}'.format(dataset))

# Every remaining column is categorical.
cat_cols = [col for col in df_orig.columns if col not in cont_cols + ord_cols]

In [24]:
# Reference score of each column, taken from the first entry stored under
# 'original': log-loss for categorical/ordinal columns, L2 for continuous ones.
ori_scores = {}
for col in cat_cols + ord_cols:
    ori_scores[col] = cv_modelscores['original'][0][col]['test_log_loss']
for col in cont_cols:
    ori_scores[col] = cv_modelscores['original'][0][col]['test_l2']

# internal / external: per model and column, mean and std over the model's
# tests of the 'test_*' and 'original_*' metrics respectively.
# external_normalised: external average minus the reference score.
internal, external, external_normalised = {}, {}, {}
cont_scores, cat_scores = {}, {}

# (columns, key of the internal metric, key of the external metric)
metric_groups = (
    (cat_cols + ord_cols, 'test_log_loss', 'original_log_loss'),
    (cont_cols, 'test_l2', 'original_l2'),
)

for model in order:

    n_tests = len(cv_modelscores[model])

    internal[model] = {}
    external[model] = {}
    external_normalised[model] = {}

    for group_cols, internal_key, external_key in metric_groups:
        for col in group_cols:
            own_values = [cv_modelscores[model][i][col][internal_key] for i in range(n_tests)]
            internal[model][col] = {'avg': np.mean(own_values), 'std': np.std(own_values)}

            ext_values = [cv_modelscores[model][i][col][external_key] for i in range(n_tests)]
            external[model][col] = {'avg': np.mean(ext_values), 'std': np.std(ext_values)}

            external_normalised[model][col] = external[model][col]['avg'] - ori_scores[col]

    # Aggregate score per model: continuous columns contribute the ratio to the
    # reference score, categorical/ordinal columns the difference from it.
    cont_scores[model] = sum(external[model][col]['avg']/ori_scores[col] for col in cont_cols)
    cat_scores[model] = sum(external[model][col]['avg']-ori_scores[col] for col in cat_cols + ord_cols)

In [25]:
# (model, score) pairs in ascending score order; the position in these lists
# is used as the model's rank in the ML-efficacy table.
cat_sorted = sorted(cat_scores.items(), key=lambda pair: pair[1])
cont_sorted = sorted(cont_scores.items(), key=lambda pair: pair[1])

In [26]:
# Build the LaTeX table of the ML-efficacy results and write it to ./tables/.
table = ''
table += '\\begin{table}[H]\n'
table += '\t\\centering\n'
# The display name is looked up by the base dataset name (text before the first
# '_'), the same convention used to locate the data folder, so that suffixed
# dataset names do not raise a KeyError.
table += '\t\\caption{{Results for the Machine Learning efficacy for the {} dataset}}\n'.format(dataset_name[dataset.split('_')[0]])
table += '\t\\label{{tab:ml_efficacy_{}}}\n'.format(dataset)
table += '\t\\begin{{tabularx}}{{\\textwidth}}{{l|{}}}\n'.format('|CC'*2+'||C')

# Header row: the first group is closed by a single rule, the second by a
# double rule before the average-rank column.
str_ = ''
for i, s in enumerate(['Continuous', 'Categorical']):
    str_ += '& \\multicolumn{{2}}{{c{}}}{{\\textbf{{{}}}}} '.format((i+1)*'|', s)

str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

table += '\t\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

# model -> (rank, score), built once instead of scanning the sorted lists for
# every row. Rank 1 is the lowest score.
cont_rank = {name: (pos, score) for pos, (name, score) in enumerate(cont_sorted, start=1)}
cat_rank = {name: (pos, score) for pos, (name, score) in enumerate(cat_sorted, start=1)}

for i, m in enumerate(order):
    # '\\_' (escaped backslash) emits the LaTeX-escaped underscore.
    table += '\t\t\\texttt{{{}}} & '.format(change_name(m).replace('_', '\\_'))

    # continuous
    rank_cont, value = cont_rank[m]

    # The ten best ranks are typeset in bold.
    if rank_cont <= 10:
        suff = '\\bf'
    else:
        suff = ''

    # Gray level of the cell: 0 for rank 1 up to 100 for the last rank.
    val = round(100*(rank_cont-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, rank_cont, val, suff, value)

    # categorical
    rank_cat, value = cat_rank[m]

    if rank_cat <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(rank_cat-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ &'.format(val, suff, rank_cat, val, suff, value)

    # Avg rank
    avg_rank = (rank_cont + rank_cat)/2

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12 rows, except after the last row.
    if (i+1) < len(order) and (i+1)%12 == 0:
        table += ' \\hline'

    table += '\n'

table += '\t\\end{tabularx}\n'
table += '\\end{table}\n'

with open('./tables/ml_efficacy_{}.tex'.format(dataset), 'w') as outfile:
    outfile.write(table)