In [1]:
import os
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.linear_model import LinearRegression
from collections import Iterable
import random
from os import listdir
from os.path import isfile, join
import matplotlib.patches as mpatches
import pickle

from itertools import combinations

import seaborn as sns
sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

  


Calculations for stats and plots are based on: https://github.com/stasmix/popsynth/blob/master/pop-synth-vae.ipynb

# Functions

In [3]:
def is_a_DATGAN(name):
    if 'TGAN' in name or 'CTGAN' in name:
        return False
    else:
        return True

def compute_stats(freq_list_orig, freq_list_synth):
    """
    Different statistics computed on the frequency list
    
    """
    freq_list_orig, freq_list_synth = np.array(freq_list_orig), np.array(freq_list_synth)
    corr_mat = np.corrcoef(freq_list_orig, freq_list_synth)
    corr = corr_mat[0, 1]
    if np.isnan(corr): corr = 0.0
    # MAE
    mae = np.absolute(freq_list_orig - freq_list_synth).mean()
    # RMSE
    rmse = np.linalg.norm(freq_list_orig - freq_list_synth) / np.sqrt(len(freq_list_orig))
    # SRMSE
    freq_list_orig_avg = freq_list_orig.mean()
    srmse = rmse / freq_list_orig_avg
    # r-square
    u = np.sum((freq_list_synth - freq_list_orig)**2)
    v = np.sum((freq_list_orig - freq_list_orig_avg)**2)
    r2 = 1.0 - u / v
    stat = {'mae': mae, 'rmse': rmse, 'r2': r2, 'srmse': srmse, 'corr': corr}
    
    return stat

# Load the files

In [29]:
dataset = 'Chicago'

input_folder = '../synth_data/{}/'.format(dataset)

files_ = {}
models = []

for f in listdir(input_folder):
    if isfile(join(input_folder, f)):
        m = f.split('.')[0]
        models.append(m)
        files_[m] = join(input_folder, f)
        
models = ['WGAN_WI_01_NO_01', 'TEST']
files_ = {}
for m in models:
    files_[m] = join(input_folder, m) + '.csv'

In [32]:
df_orig = pd.read_csv('../data/' + dataset + '/data.csv')

In [33]:
if dataset is 'Chicago':
    continuous_cols = ['distance', 'age', 'departure_time']
elif dataset is 'LPMC':
    continuous_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit', 'cost_driving_fuel', 'driving_traffic_percent']

In [34]:
bins_cont = {}

for c in continuous_cols:
    #bins_cont[c] = pd.qcut(df_orig[c], q=10, retbins=True)[1]
    bins_cont[c] = pd.cut(df_orig[c], bins=10, retbins=True)[1]
    bins_cont[c][0] = -np.inf
    bins_cont[c][-1] = np.inf
    df_orig[c] = pd.cut(df_orig[c], bins=bins_cont[c])

In [35]:
df_orig.head()

Unnamed: 0,choice,travel_dow,trip_purpose,distance,hh_vehicles,hh_size,hh_bikes,hh_descr,hh_income,gender,age,license,education_level,work_status,departure_time
0,drive,7,HOME_OTHER,"(-inf, 6.971]",2,3,3,detached,6,0,"(29.4, 39.2]",1,4,PTE,"(19.093, 21.48]"
1,drive,2,SHOPPING,"(-inf, 6.971]",3,3,3,detached,7,0,"(49.0, 58.8]",1,5,FTE,"(16.707, 19.093]"
2,drive,2,SHOPPING,"(-inf, 6.971]",1,1,0,detached,3,0,"(78.4, 88.2]",1,3,PTE,"(7.16, 9.547]"
3,drive,2,OTHER,"(-inf, 6.971]",2,2,0,detached,5,1,"(39.2, 49.0]",1,5,FTE,"(11.933, 14.32]"
4,passenger,1,SHOPPING,"(-inf, 6.971]",2,2,1,detached,4,0,"(29.4, 39.2]",0,3,Unemployed,"(9.547, 11.933]"


In [36]:
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']
orig_str = 'random-original'

# Stats per individual column

In [37]:
filepath = './notebooks/tests/stats/'
filename = 'single_columns_{}.pickle'.format(dataset)

all_stats = {}

try:
    all_stats = pickle.load(open(filepath + filename, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')
    try:
        os.makedirs(filepath)
    except:
        pass

No previous results found, starting fresh


In [38]:
# Go through each model
for i, m in enumerate(models):
    
    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))

    else:
        print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

        all_stats[m] = {}

        # Load all dataframes for current model
        df = pd.read_csv(files_[m])

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each columns
        for c in df_orig.columns:

            agg_vars = [c]

            real = df_orig.copy()
            real['count'] = 1
            real = real.groupby(agg_vars, observed=True).count()
            real /= len(df_orig)

            synth = df.copy()
            synth['count'] = 1
            synth = synth.groupby(agg_vars, observed=True).count()
            synth /= len(df)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            all_stats[m][c] = sts
            
        pickle.dump(all_stats, open(filepath + filename, 'wb'))

print("\033[1mFINISHED!\033[0m")

Preparing stats for model [1mWGAN_WI_01_NO_01[0m (1/2)
Preparing stats for model [1mTEST[0m (2/2)
[1mFINISHED![0m


In [39]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in df_orig.columns:

    agg_vars = [c]

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [40]:
all_stats[orig_str] = stats_orig

In [41]:
res = {}

for test in ['all', 'cont', 'cat']:
    
    res[test] = {}
    
    if test == 'all':
        cols = df_orig.columns
    elif test == 'cont':
        cols = continuous_cols
    elif test == 'cat':
        cols = set(df_orig.columns) - set(continuous_cols)
        
    for s in stats_str:
        res[test][s] = {}

    for m in all_stats.keys():

        for s in stats_str:
            
            tmp = []
            for c in cols:
                tmp.append(all_stats[m][c][s])
            
            res[test][s][m] = np.mean(tmp)

In [42]:
for test in ['all', 'cont', 'cat']:
    
    if test == 'all':
        str_ = 'on all columns'
    elif test == 'cont':
        str_ = 'on continuous columns'
    elif test == 'cat':
        str_ = 'on categorical columns'
        
    for s in ['srmse']:#stats_str:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        if s in ['r2', 'corr']:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])[::-1]}
        else:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])}

        for i, item in enumerate(sorted_dct):
            print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
        print()


Ranking on all columns based on SRMSE:
   1. random-original      - 4.73e-02
   2. TEST                 - 5.07e-02
   3. WGAN_WI_01_NO_01     - 5.73e-02

Ranking on continuous columns based on SRMSE:
   1. random-original      - 3.75e-02
   2. TEST                 - 1.03e-01
   3. WGAN_WI_01_NO_01     - 1.38e-01

Ranking on categorical columns based on SRMSE:
   1. WGAN_WI_01_NO_01     - 3.72e-02
   2. TEST                 - 3.76e-02
   3. random-original      - 4.97e-02



# Stats per couple columns

In [43]:
combs = []

for k in combinations(df_orig.columns, 2):
    combs.append(k[0] + '::' + k[1])
    
print('There are {} combinations!'.format(len(combs)))

There are 105 combinations!


In [44]:
filepath = './notebooks/tests/stats/'
filename = 'couple_columns_{}.pickle'.format(dataset)

all_stats = {}

try:
    all_stats = pickle.load(open(filepath + filename, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')
    try:
        os.makedirs(filepath)
    except:
        pass

No previous results found, starting fresh


In [45]:
# Go through each model
for i, m in enumerate(models):
    
    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))

    else:
    
        print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

        all_stats[m] = {}

        # Load all dataframes for current model
        df = pd.read_csv(files_[m])

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each columns
        for c in combs:

            agg_vars = c.split('::')

            real = df_orig.copy()
            real['count'] = 1
            real = real.groupby(agg_vars, observed=True).count()
            real /= len(df_orig)

            synth = df.copy()
            synth['count'] = 1
            synth = synth.groupby(agg_vars, observed=True).count()
            synth /= len(df)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            all_stats[m][c] = sts
            
        pickle.dump(all_stats, open(filepath + filename, 'wb'))

print("\033[1mFINISHED!\033[0m")

Preparing stats for model [1mWGAN_WI_01_NO_01[0m (1/2)
Preparing stats for model [1mTEST[0m (2/2)
[1mFINISHED![0m


In [46]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in combs:

    agg_vars = c.split('::')

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [47]:
all_stats[orig_str] = stats_orig

In [48]:
res = {}

for s in stats_str:
    res[s] = {}
                    
for m in all_stats.keys():

    for s in stats_str:

        tmp = []
        for c in combs:
            tmp.append(all_stats[m][c][s])

        res[s][m] = np.mean(tmp)

In [49]:
for s in ['srmse']:#stats_str:
    print('Ranking on all coupled combinations based on {}:'.format(s.upper()))

    if s in ['r2', 'corr']:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])[::-1]}
    else:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])}

    for i, item in enumerate(sorted_dct):
        print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
    print()

Ranking on all coupled combinations based on SRMSE:
   1. random-original      - 1.29e-01
   2. TEST                 - 1.49e-01
   3. WGAN_WI_01_NO_01     - 1.80e-01

