In [1]:
import numpy as np
import pandas as pd
import random
from os import listdir
from os.path import isfile, join
import json
import pickle

from itertools import combinations

import matplotlib.pyplot as plt

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
# Dataset to analyse. Only the part of the name before the first '_' is used
# to locate the data folder (e.g. 'LPMC_half' reads from the 'LPMC' folder).
dataset = 'Chicago'
df_orig = pd.read_csv('../../../data/{}/data.csv'.format(dataset.split('_')[0]))

In [3]:
# Continuous columns of the selected dataset. The match is a substring test so
# that suffixed names such as 'LPMC_half' resolve to their base dataset.
if 'Chicago' in dataset:
    continuous_cols = ['distance', 'age', 'departure_time']
elif 'LPMC' in dataset:
    continuous_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit', 'cost_driving_fuel', 'driving_traffic_percent']
elif 'adult' in dataset:
    continuous_cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
else:
    # Fail here rather than with a NameError several cells further down.
    raise ValueError("Unsupported dataset '{}': expected a name containing 'Chicago', 'LPMC' or 'adult'.".format(dataset))

In [4]:
# Keys of the statistics stored for every model, in the order in which they
# appear as table columns.
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']

# LaTeX header label of each statistic.
stats_tex = {
    'mae': '\\textbf{MAE}',
    'rmse': '\\textbf{RMSE}',
    'srmse': '\\textbf{SRMSE}',
    'r2': '\\boldmath$R^2$',
    'corr': '\\boldmath$\\rho_{\\text{Pearson} }$'
}

# Display name of each dataset in the table captions.
# The underscore is escaped for LaTeX; the backslash itself is written as
# '\\' because '\_' is an invalid Python escape sequence.
dataset_name = {
    'Chicago': 'CMAP',
    'LPMC': 'LPMC',
    'LPMC_half': 'LPMC\\_half',
    'adult': 'ADULT'
}

# Display name of each model in the first column of the tables.
model_name = {
    'CTABGAN': 'CTABGAN',
    'CTGAN': 'CTGAN',
    'WGGP_WI_NO': 'DATGAN (\\texttt{WGGP})',
    'WGAN_WI_NO': 'DATGAN (\\texttt{WGAN})',
    'TGAN': 'TGAN',
    'TVAE': 'TVAE',
}

# Wording used in the section rows of the first aggregation level.
col_type = {
    'all': 'all',
    'cont': 'continuous',
    'cat': 'categorical'
}

In [5]:
# The statistics loaded below are indexed by range(n_models * n_data).
n_models, n_data = 5, 5

# Functions

In [6]:
def to_TeX(num, val=1):
    """Format a number in scientific notation with a braced exponent.

    Parameters
    ----------
    num : number
        Value to format.
    val : int, optional
        Number of digits kept after the decimal point of the mantissa.

    Returns
    -------
    str
        ``'<mantissa>e{<exponent>}'``, where the exponent is written as a
        plain integer (no leading zeros or '+' sign), e.g. ``'1.2e{4}'``.
    """
    scientific = format(num, '.{}e'.format(val))
    digits, power = scientific.split('e')
    return digits + 'e{' + str(int(power)) + '}'


In [7]:
# Models compared for the selected dataset, in the order of the table rows.
# A single if/elif chain is used so that exactly one branch is taken, like
# the other dataset-dependent cells of this notebook.
if 'Chicago' in dataset:
    order = ['CTABGAN', 'CTGAN', 'WGAN_WI_NO', 'TGAN', 'TVAE']
elif 'LPMC' in dataset:
    order = ['CTABGAN', 'CTGAN', 'WGGP_WI_NO', 'TGAN', 'TVAE']
elif 'adult' in dataset:
    order = ['CTABGAN', 'CTGAN', 'WGAN_WI_NO', 'WGGP_WI_NO', 'TGAN', 'TVAE']
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("Unsupported dataset '{}': expected a name containing 'Chicago', 'LPMC' or 'adult'.".format(dataset))

In [8]:
# Per-model accumulators for the ranks obtained in the statistical tables
# ('stats') and in the ML-efficacy table ('ml'); later cells append to them.
all_ranks = {m: {'stats': [], 'ml': []} for m in order}

# Stats - first level

In [9]:
# Statistics computed on single columns, indexed below as
# stats[model][column][statistic][run].
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project.
with open('./{}/single_columns.pickle'.format(dataset), 'rb') as pickle_file:
    stats = pickle.load(pickle_file)

In [10]:
# For every column subset ('all', 'cont', 'cat'), statistic and model, store
# one value per run: the statistic averaged over the columns of the subset.
# Resulting layout: res[subset][statistic][model] -> list of per-run means.
res = {}

for test in ['all', 'cont', 'cat']:

    if test == 'all':
        cols = df_orig.columns
    elif test == 'cont':
        cols = continuous_cols
    elif test == 'cat':
        # Keep the DataFrame column order instead of a set difference: set
        # iteration order of strings changes between interpreter runs, which
        # changes the floating-point summation order of the means below.
        cols = [c for c in df_orig.columns if c not in continuous_cols]

    res[test] = {s: {} for s in stats_str}

    for m in order:
        for s in stats_str:
            res[test][s][m] = [
                np.mean([stats[m][c][s][i] for c in cols])
                for i in range(n_models*n_data)
            ]

In [11]:
# Summarise the per-run values of every (subset, statistic, model) by their
# mean and standard deviation, then rank the models on the mean.
avg = {}

for test in ['all', 'cont', 'cat']:
    avg[test] = {}

    for s in stats_str:
        per_model = {}
        for m in order:
            runs = res[test][s][m]
            per_model[m] = {'mean': np.mean(runs), 'std': np.std(runs)}
        avg[test][s] = per_model

        # Ascending by mean; for 'r2' and 'corr' the order is flipped so that
        # the largest mean gets rank 1.
        sorted_list = [name for name, entry in sorted(per_model.items(), key=lambda item: item[1]['mean'])]
        if s in ['r2', 'corr']:
            sorted_list = sorted_list[::-1]

        for position, name in enumerate(sorted_list, start=1):
            per_model[name]['rank'] = position
        

In [12]:
# Start the LaTeX (xltabular) table of the statistical assessment and add the
# rows of the first aggregation level (one section per column subset).
#
# NOTE: this cell appends to `all_ranks`, so it is not idempotent - re-run the
# cell that initialises `all_ranks` before running this one a second time.

# Total number of table columns: name + (rank, value) per statistic + avg rank.
n_cols = 2*len(stats_str) + 2

table = '\\begin{{xltabular}}{{\\textwidth}}{{l|{}}}\n'.format('|cC'*len(stats_str)+'||c')

str_ = ''
for i, s in enumerate(stats_str):
    # The last statistic is followed by a double rule, the others by a single one.
    rule = 'c||' if i == len(stats_str)-1 else 'c|'
    str_ += '& \\multicolumn{{2}}{{{}}}{{{}}} '.format(rule, stats_tex[s])

str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

header = '\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

table += '\\caption{{\\normalsize Results of the statistical assessments between the best DATGAN version and the state-of-the-art models for the {} dataset. Lighter grey tone corresponds to better results compared to darker ones.}}\n'.format(dataset_name[dataset])
table += '\\label{{tab:stats_final_{}}}\\\\\n\n'.format(dataset)
table += header
table += '\\endfirsthead\n\n'
table += '\\multicolumn{' + str(n_cols) + '}{c}{\\tablename\\ \\thetable{} -- continued from previous page} \\\\\n'
table += header
table += '\\endhead\n\n'
table += '\\hline\\multicolumn{' + str(n_cols) + '}{r}{{\\normalsize Continues on next page...}}\n'
table += '\\endfoot\n\n'
table += '\\endlastfoot\n\n'

for test in ['all', 'cont', 'cat']:

    # Section row spanning the whole table.
    table += '\t\\hline \\multicolumn{{{}}}{{c}}{{\\cellcolor{{Gray!25}}\\textbf{{First aggregation level - {} columns}}}} \\\\ \\hline\n'.format(n_cols, col_type[test])

    for i, m in enumerate(order):
        table += '\t\\texttt{{{}}} & '.format(model_name[m])
        tmp_rank = []
        for j, s in enumerate(stats_str):
            tmp = avg[test][s][m]
            tmp_rank.append(tmp['rank'])
            all_ranks[m]['stats'].append(tmp['rank'])

            # Ranks up to 10 are typeset in bold.
            if tmp['rank'] <= 10:
                suff = '\\bf'
            else:
                suff = ''

            # Grey level of the cell: 0 for rank 1, 100 for the last rank.
            val = round(100*(tmp['rank']-1)/(len(order)-1))

            table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$'.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

            table += ' & '

        # Average rank of the model over all statistics of this section.
        avg_rank = np.mean(tmp_rank)

        if avg_rank <= 10:
            suff = '\\bf'
        else:
            suff = ''

        val = round(100*(avg_rank-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

        # Horizontal rule after every 12th row, except after the last one.
        if (i+1)%12 == 0 and (i+1) < len(order):
            table += ' \\hline'

        table += '\n'

# Stats - second level

In [13]:
# Statistics computed on pairs of columns, indexed below as
# stats[model]['colA::colB'][statistic][run].
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project.
with open('./{}/couple_combinations.pickle'.format(dataset), 'rb') as pickle_file:
    stats = pickle.load(pickle_file)

In [14]:
# Keys of all column pairs, written as 'colA::colB'.
combs = ['::'.join(pair) for pair in combinations(df_orig.columns, 2)]

In [15]:
# res[statistic][model] -> one value per run: the statistic averaged over all
# column combinations listed in `combs`.
res = {s: {} for s in stats_str}

for m in order:
    for s in stats_str:
        res[s][m] = [
            np.mean([stats[m][c][s][i] for c in combs])
            for i in range(n_models*n_data)
        ]

In [16]:
# Mean and standard deviation over the runs for every (statistic, model),
# followed by the rank of each model on the mean.
avg = {}

for s in stats_str:
    per_model = {}
    for m in order:
        runs = res[s][m]
        per_model[m] = {'mean': np.mean(runs), 'std': np.std(runs)}
    avg[s] = per_model

    # Ascending by mean; flipped for 'r2' and 'corr' so that the largest mean
    # gets rank 1.
    sorted_list = [name for name, entry in sorted(per_model.items(), key=lambda item: item[1]['mean'])]
    if s in ['r2', 'corr']:
        sorted_list = sorted_list[::-1]

    for position, name in enumerate(sorted_list, start=1):
        per_model[name]['rank'] = position
        

In [17]:
# Append the rows of the second aggregation level to the statistics table.
#
# NOTE: this cell appends to `table` and `all_ranks`, so it is not idempotent;
# rebuild both from their initial cells before running it a second time.

# The section row spans every column: name + (rank, value) per statistic +
# average rank. Computed from stats_str instead of being hard-coded.
table += '\t\\hline \\multicolumn{{{}}}{{c}}{{\\cellcolor{{Gray!25}}\\textbf{{Second aggregation level}}}} \\\\ \\hline\n'.format(2*len(stats_str)+2)


for i, m in enumerate(order):
    table += '\t\\texttt{{{}}} & '.format(model_name[m])
    tmp_rank = []
    for j, s in enumerate(stats_str):
        tmp = avg[s][m]
        tmp_rank.append(tmp['rank'])
        all_ranks[m]['stats'].append(tmp['rank'])

        # Ranks up to 10 are typeset in bold.
        if tmp['rank'] <= 10:
            suff = '\\bf'
        else:
            suff = ''

        # Grey level of the cell: 0 for rank 1, 100 for the last rank.
        val = round(100*(tmp['rank']-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

    # Average rank of the model over all statistics.
    avg_rank = np.mean(tmp_rank)

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12th row, except after the last one.
    if (i+1)%12 == 0 and (i+1) < len(order):
        table += ' \\hline'

    table += '\n'

# Stats - third level

In [18]:
# Statistics computed on triples of columns, indexed below as
# stats[model]['colA::colB::colC'][statistic][run].
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project.
with open('./{}/trouple_combinations.pickle'.format(dataset), 'rb') as pickle_file:
    stats = pickle.load(pickle_file)

In [19]:
# Keys of all column triples, written as 'colA::colB::colC'.
combs = ['::'.join(triple) for triple in combinations(df_orig.columns, 3)]

In [20]:
# res[statistic][model] -> one value per run: the statistic averaged over all
# column combinations listed in `combs`.
res = {s: {} for s in stats_str}

for m in order:
    for s in stats_str:
        per_run = []
        for i in range(n_models*n_data):
            per_run.append(np.mean([stats[m][c][s][i] for c in combs]))
        res[s][m] = per_run

In [21]:
# Mean and standard deviation over the runs for every (statistic, model),
# followed by the rank of each model on the mean.
avg = {}

for s in stats_str:
    avg[s] = {
        m: {'mean': np.mean(res[s][m]), 'std': np.std(res[s][m])}
        for m in order
    }

    # Ascending by mean; flipped for 'r2' and 'corr' so that the largest mean
    # gets rank 1.
    by_mean = sorted(avg[s].items(), key=lambda item: item[1]['mean'])
    if s in ['r2', 'corr']:
        by_mean = by_mean[::-1]
    sorted_list = [name for name, entry in by_mean]

    for position, name in enumerate(sorted_list, start=1):
        avg[s][name]['rank'] = position
        

In [22]:
# Append the rows of the third aggregation level, close the statistics table
# and write it to ./tables/stats_final_<dataset>.tex.
#
# NOTE: this cell appends to `table` and `all_ranks`, so it is not idempotent;
# rebuild both from their initial cells before running it a second time.

# The section row spans every column: name + (rank, value) per statistic +
# average rank. Computed from stats_str instead of being hard-coded.
table += '\t\\hline \\multicolumn{{{}}}{{c}}{{\\cellcolor{{Gray!25}}\\textbf{{Third aggregation level}}}} \\\\ \\hline\n'.format(2*len(stats_str)+2)


for i, m in enumerate(order):

    table += '\t\\texttt{{{}}} & '.format(model_name[m])
    tmp_rank = []
    for j, s in enumerate(stats_str):
        tmp = avg[s][m]
        tmp_rank.append(tmp['rank'])
        all_ranks[m]['stats'].append(tmp['rank'])

        # Ranks up to 10 are typeset in bold.
        if tmp['rank'] <= 10:
            suff = '\\bf'
        else:
            suff = ''

        # Grey level of the cell: 0 for rank 1, 100 for the last rank.
        val = round(100*(tmp['rank']-1)/(len(order)-1))

        table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, tmp['rank'], val, suff, tmp['mean'])

    # Average rank of the model over all statistics.
    avg_rank = np.mean(tmp_rank)

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12th row, except after the last one.
    if (i+1)%12 == 0 and (i+1) < len(order):
        table += ' \\hline'

    table += '\n'

table += '\\end{xltabular}\n'


with open('./tables/stats_final_{}.tex'.format(dataset), 'w') as out_file:
    out_file.write(table)

# ML efficacy

In [23]:
# Cross-validation scores of the ML-efficacy experiment, indexed below as
# cv_modelscores[model][run][column][metric].
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project.
with open('./{}/cv_result_ml.pickle'.format(dataset), 'rb') as pickle_file:
    cv_modelscores = pickle.load(pickle_file)

In [24]:
# Column typing used by the ML-efficacy scores: continuous and ordinal columns
# are listed per dataset; every remaining column is treated as categorical.
if 'Chicago' in dataset:
    cont_cols = ['distance', 'age', 'departure_time']
    ord_cols = ['hh_vehicles', 'hh_size', 'hh_bikes', 'hh_income', 'education_level']
elif 'LPMC' in dataset:
    cont_cols = ['start_time_linear', 'age', 'distance', 'dur_walking',
                 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus',
                 'dur_pt_int', 'dur_driving', 'cost_transit',
                 'cost_driving_fuel', 'driving_traffic_percent']
    ord_cols = ['travel_year', 'travel_month', 'travel_date',
                'day_of_week', 'pt_n_interchanges', 'car_ownership']
elif 'adult' in dataset:
    cont_cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
    ord_cols = []
else:
    # Fail here rather than with a NameError in the next cell.
    raise ValueError("Unsupported dataset '{}': expected a name containing 'Chicago', 'LPMC' or 'adult'.".format(dataset))

# Same rule for every dataset, so it is written once after the dispatch.
non_categorical = set(cont_cols + ord_cols)
cat_cols = [col for col in df_orig.columns if col not in non_categorical]

In [25]:
# Reference score of every column, taken from the first entry stored for the
# original data: log-loss for categorical/ordinal columns, L2 for continuous.
ori_scores = {}
for col in cat_cols + ord_cols:
    ori_scores[col] = cv_modelscores['original'][0][col]['test_log_loss']
for col in cont_cols:
    ori_scores[col] = cv_modelscores['original'][0][col]['test_l2']

internal, external, external_normalised = {}, {}, {}
cont_scores, cat_scores = {}, {}

# (columns, key of the 'test_*' score, key of the 'original_*' score)
score_groups = (
    (cat_cols + ord_cols, 'test_log_loss', 'original_log_loss'),
    (cont_cols, 'test_l2', 'original_l2'),
)

for model in order:

    n_tests = len(cv_modelscores[model])

    internal[model] = {}
    external[model] = {}
    external_normalised[model] = {}

    for group_cols, test_key, original_key in score_groups:
        for col in group_cols:
            # Mean/std over the runs of the 'test_*' score.
            tmp = [cv_modelscores[model][i][col][test_key] for i in range(n_tests)]
            internal[model][col] = {'avg': np.mean(tmp), 'std': np.std(tmp)}

            # Mean/std over the runs of the 'original_*' score.
            tmp = [cv_modelscores[model][i][col][original_key] for i in range(n_tests)]
            external[model][col] = {'avg': np.mean(tmp), 'std': np.std(tmp)}

            # Difference to the reference score of the column.
            external_normalised[model][col] = external[model][col]['avg'] - ori_scores[col]

    # Aggregate per model: ratios to the reference for continuous columns,
    # differences to the reference for categorical and ordinal columns.
    cont_scores[model] = sum(external[model][col]['avg']/ori_scores[col] for col in cont_cols)
    cat_scores[model] = sum(external[model][col]['avg']-ori_scores[col] for col in cat_cols + ord_cols)

In [26]:
def _score_of(pair):
    """Return the score of a (model, score) pair."""
    return pair[1]


# (model, score) pairs in ascending score order; the position in these lists
# is used as the rank of the model in the ML-efficacy table.
cat_sorted = sorted(cat_scores.items(), key=_score_of)
cont_sorted = sorted(cont_scores.items(), key=_score_of)

In [27]:
# Build the LaTeX (xltabular) table of the ML efficacy and write it to
# ./tables/ml_efficacy_final_<dataset>.tex.
#
# NOTE: this cell appends to `all_ranks`, so it is not idempotent - re-run the
# cell that initialises `all_ranks` before running this one a second time.
table = '\\begin{{xltabular}}{{\\textwidth}}{{l|{}}}\n'.format('|CC'*2+'||C')

str_ = ''
for i, s in enumerate(['Continuous', 'Categorical']):
    # One rule after the first group, two after the second.
    str_ += '& \\multicolumn{{2}}{{c{}}}{{\\textbf{{{}}}}} '.format((i+1)*'|', s)

str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

header = '\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

table += '\\caption{{\\normalsize Results of the Machine Learning efficacy between the best DATGAN version and the state-of-the-art models for the {} dataset. Lighter grey tone corresponds to better results compared to darker ones.}}\n'.format(dataset_name[dataset])
table += '\\label{{tab:ml_efficacy_final_{}}}\\\\\n\n'.format(dataset)
table += header
table += '\\endfirsthead\n\n'
table += '\\multicolumn{6}{c}{\\tablename\\ \\thetable{} -- continued from previous page} \\\\\n'
table += header
table += '\\endhead\n\n'
table += '\\hline\\multicolumn{6}{r}{{\\normalsize Continues on next page...}}\n'
table += '\\endfoot\n\n'
table += '\\endlastfoot\n\n'

for i, m in enumerate(order):
    table += '\t\\texttt{{{}}} & '.format(model_name[m])

    # Continuous columns: rank = 1-based position of the model in cont_sorted.
    rank_cont = [x+1 for x, y in enumerate(cont_sorted) if y[0] == m][0]
    all_ranks[m]['ml'].append(rank_cont)

    value = cont_sorted[rank_cont-1][1]

    # Ranks up to 10 are typeset in bold.
    if rank_cont <= 10:
        suff = '\\bf'
    else:
        suff = ''

    # Grey level of the cell: 0 for rank 1, 100 for the last rank.
    val = round(100*(rank_cont-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ & '.format(val, suff, rank_cont, val, suff, value)

    # Categorical columns: rank = 1-based position of the model in cat_sorted.
    rank_cat = [x+1 for x, y in enumerate(cat_sorted) if y[0] == m][0]
    all_ranks[m]['ml'].append(rank_cat)

    value = cat_sorted[rank_cat-1][1]

    if rank_cat <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(rank_cat-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:02}$ & \\cellcolor{{Gray!{}}}${} \\num{{{:.2e}}}$ &'.format(val, suff, rank_cat, val, suff, value)

    # Average of the two ranks.
    avg_rank = (rank_cont + rank_cat)/2

    if avg_rank <= 10:
        suff = '\\bf'
    else:
        suff = ''

    val = round(100*(avg_rank-1)/(len(order)-1))

    table += '\\cellcolor{{Gray!{}}}${} {:.1f}$  \\\\'.format(val, suff, avg_rank)

    # Horizontal rule after every 12th row, except after the last one.
    if (i+1) < len(order) and (i+1)%12 == 0:
        table += ' \\hline'

    table += '\n'

table += '\\end{xltabular}\n'

with open('./tables/ml_efficacy_final_{}.tex'.format(dataset), 'w') as out_file:
    out_file.write(table)

# Summary ranks

In [28]:
# Average rank of every model in the statistical tables ('stats'), in the ML
# table ('ml'), and the mean of the two ('avg').
ranks = {}

for m in order:
    stats_rank = np.mean(all_ranks[m]['stats'])
    ml_rank = np.mean(all_ranks[m]['ml'])
    ranks[m] = {'stats': stats_rank, 'ml': ml_rank, 'avg': (stats_rank + ml_rank)/2}

# Best (smallest) average rank per category; 100 is the starting upper bound.
best_stat_rank = min([100] + [ranks[m]['stats'] for m in order])
best_ml_rank = min([100] + [ranks[m]['ml'] for m in order])

In [29]:
# Model keys ordered by increasing overall average rank.
sorted_list = sorted(ranks, key=lambda name: ranks[name]['avg'])

In [30]:
# Build the summary table (average ranks per model, best first) and write it
# to ./tables/summary_final_<dataset>.tex.
table = '\\begin{{tabularx}}{{0.7\\textwidth}}{{l|{}}}\n'.format('|C'*2+'||C')

str_ = ''
for i, s in enumerate(['Avg. rank stats', 'Avg. rank ML']):
    # One rule after the first column, two after the second.
    str_ += '& \\multicolumn{{1}}{{c{}}}{{\\textbf{{{}}}}} '.format((i+1)*'|', s)

str_ += '& \\multicolumn{1}{c}{\\textbf{rank}} '

table += '\\multicolumn{{1}}{{c||}}{{\\textbf{{Name}}}} {} \\\\ \\midrule[1.5pt]\n'.format(str_)

for i in range(len(order)):
    m = sorted_list[i]

    table += '\t\\texttt{{{}}} & '.format(model_name[m])

    # The best value of each column is typeset in bold.
    if ranks[m]['stats'] == best_stat_rank:
        table += '\\textbf{{{:.2f}}} & '.format(ranks[m]['stats'])
    else:
        table += '{:.2f} & '.format(ranks[m]['stats'])

    if ranks[m]['ml'] == best_ml_rank:
        table += '\\textbf{{{:.1f}}} & '.format(ranks[m]['ml'])
    else:
        table += '{:.1f} & '.format(ranks[m]['ml'])

    # Rows follow sorted_list, so the first row holds the best overall rank.
    if i == 0:
        table += '\\textbf{{{:.2f}}}'.format(ranks[m]['avg'])
    else:
        table += '{:.2f}'.format(ranks[m]['avg'])

    table += ' \\\\\n'

table += '\\end{tabularx}\n'

with open('./tables/summary_final_{}.tex'.format(dataset), 'w') as out_file:
    out_file.write(table)