In [1]:
import os
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.linear_model import LinearRegression
from collections import Iterable
import random
from os import listdir
from os.path import isfile, join
import matplotlib.patches as mpatches
import pickle

from itertools import combinations

import seaborn as sns
sns.set_style("whitegrid")

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

  


Calculations for stats and plots are based on: https://github.com/stasmix/popsynth/blob/master/pop-synth-vae.ipynb

# Functions

In [3]:
def is_a_DATGAN(name):
    if 'TGAN' in name or 'CTGAN' in name:
        return False
    else:
        return True

def compute_stats(freq_list_orig, freq_list_synth):
    """
    Different statistics computed on the frequency list
    
    """
    freq_list_orig, freq_list_synth = np.array(freq_list_orig), np.array(freq_list_synth)
    corr_mat = np.corrcoef(freq_list_orig, freq_list_synth)
    corr = corr_mat[0, 1]
    if np.isnan(corr): corr = 0.0
    # MAE
    mae = np.absolute(freq_list_orig - freq_list_synth).mean()
    # RMSE
    rmse = np.linalg.norm(freq_list_orig - freq_list_synth) / np.sqrt(len(freq_list_orig))
    # SRMSE
    freq_list_orig_avg = freq_list_orig.mean()
    srmse = rmse / freq_list_orig_avg
    # r-square
    u = np.sum((freq_list_synth - freq_list_orig)**2)
    v = np.sum((freq_list_orig - freq_list_orig_avg)**2)
    r2 = 1.0 - u / v
    stat = {'mae': mae, 'rmse': rmse, 'r2': r2, 'srmse': srmse, 'corr': corr}
    
    return stat

# Load the files

In [4]:
dataset = 'LPMC'

input_folder = '../synth_data/TEST/'

files_ = {}
models = []

for f in listdir(input_folder):
    if isfile(join(input_folder, f)):
        m = f.split('.')[0]
        models.append(m)
        files_[m] = join(input_folder, f)

In [5]:
df_orig = pd.read_csv('../data/' + dataset + '/data.csv')

In [23]:
df_orig.dtypes

travel_mode                  object
purpose                      object
fueltype                     object
faretype                     object
bus_scale                   float64
travel_year                   int64
travel_month                  int64
travel_date                   int64
day_of_week                   int64
start_time_linear          category
age                        category
female                        int64
driving_license               int64
car_ownership                 int64
distance                   category
dur_walking                category
dur_cycling                category
dur_pt_access              category
dur_pt_rail                category
dur_pt_bus                 category
dur_pt_int                 category
pt_n_interchanges             int64
dur_driving                category
cost_transit               category
cost_driving_fuel          category
cost_driving_con_charge     float64
driving_traffic_percent    category
dtype: object

In [6]:
if dataset is 'Chicago':
    continuous_cols = ['distance', 'age', 'departure_time']
elif dataset is 'LPMC':
    continuous_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit', 'cost_driving_fuel', 'driving_traffic_percent']

In [7]:
bins_cont = {}

for c in continuous_cols:
    #bins_cont[c] = pd.qcut(df_orig[c], q=10, retbins=True)[1]
    bins_cont[c] = pd.cut(df_orig[c], bins=10, retbins=True)[1]
    bins_cont[c][0] = -np.inf
    bins_cont[c][-1] = np.inf
    df_orig[c] = pd.cut(df_orig[c], bins=bins_cont[c])

In [8]:
df_orig.head()

Unnamed: 0,travel_mode,purpose,fueltype,faretype,bus_scale,travel_year,travel_month,travel_date,day_of_week,start_time_linear,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_n_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_con_charge,driving_traffic_percent
0,drive,HBO,Petrol_Car,child,0.0,2012,4,1,7,"(9.567, 11.958]",...,"(0.106, 0.212]","(-inf, 0.137]","(-inf, 0.215]","(-inf, 0.0567]",0,"(-inf, 0.183]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(0.104, 0.208]"
1,drive,HBO,Petrol_Car,free,0.0,2012,4,1,7,"(16.742, 19.133]",...,"(0.212, 0.318]","(-inf, 0.137]","(-inf, 0.215]","(-inf, 0.0567]",0,"(-inf, 0.183]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(-inf, 0.104]"
2,drive,HBO,Petrol_Car,full,1.0,2012,4,1,7,"(11.958, 14.35]",...,"(0.212, 0.318]","(-inf, 0.137]","(0.859, 1.074]","(0.0567, 0.113]",1,"(0.362, 0.54]","(2.34, 3.51]","(1.027, 2.034]",0.0,"(0.313, 0.417]"
3,pt,HBW,Average_Car,full,1.0,2012,4,1,7,"(19.133, 21.525]",...,"(0.106, 0.212]","(-inf, 0.137]","(-inf, 0.215]","(0.0567, 0.113]",1,"(-inf, 0.183]","(2.34, 3.51]","(-inf, 1.027]",0.0,"(-inf, 0.104]"
4,pt,HBO,Average_Car,free,0.0,2012,4,1,7,"(7.175, 9.567]",...,"(0.106, 0.212]","(-inf, 0.137]","(0.215, 0.429]","(-inf, 0.0567]",0,"(0.183, 0.362]","(-inf, 1.17]","(-inf, 1.027]",0.0,"(-inf, 0.104]"


In [9]:
stats_str = ['mae', 'rmse', 'r2', 'srmse', 'corr']
orig_str = 'random-original'

# Stats per individual column

In [10]:
filepath = './notebooks/tests/stats/'
filename = 'single_columns.pickle'.format(dataset)

all_stats = {}

try:
    all_stats = pickle.load(open(filepath + filename, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')
    try:
        os.makedirs(filepath)
    except:
        pass

No previous results found, starting fresh


In [11]:
# Go through each model
for i, m in enumerate(models):
    
    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))

    else:
        print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

        all_stats[m] = {}

        # Load all dataframes for current model
        df = pd.read_csv(files_[m])

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each columns
        for c in df_orig.columns:

            agg_vars = [c]

            real = df_orig.copy()
            real['count'] = 1
            real = real.groupby(agg_vars, observed=True).count()
            real /= len(df_orig)

            synth = df.copy()
            synth['count'] = 1
            synth = synth.groupby(agg_vars, observed=True).count()
            synth /= len(df)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            all_stats[m][c] = sts
            
        pickle.dump(all_stats, open(filepath + filename, 'wb'))

print("\033[1mFINISHED!\033[0m")

Preparing stats for model [1mCTABGAN[0m (1/3)
Preparing stats for model [1mCTGAN[0m (2/3)
Preparing stats for model [1mDATGAN[0m (3/3)
[1mFINISHED![0m


In [12]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in df_orig.columns:

    agg_vars = [c]

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [13]:
all_stats[orig_str] = stats_orig

In [14]:
res = {}

for test in ['all', 'cont', 'cat']:
    
    res[test] = {}
    
    if test == 'all':
        cols = df_orig.columns
    elif test == 'cont':
        cols = continuous_cols
    elif test == 'cat':
        cols = set(df_orig.columns) - set(continuous_cols)
        
    for s in stats_str:
        res[test][s] = {}

    for m in all_stats.keys():

        for s in stats_str:
            
            tmp = []
            for c in cols:
                tmp.append(all_stats[m][c][s])
            
            res[test][s][m] = np.mean(tmp)

In [15]:
for test in ['all', 'cont', 'cat']:
    
    if test == 'all':
        str_ = 'on all columns'
    elif test == 'cont':
        str_ = 'on continuous columns'
    elif test == 'cat':
        str_ = 'on categorical columns'
        
    for s in ['srmse']:#stats_str:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        if s in ['r2', 'corr']:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])[::-1]}
        else:
            sorted_dct = {k: v for k, v in sorted(res[test][s].items(), key=lambda item: item[1])}

        for i, item in enumerate(sorted_dct):
            print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
        print()


Ranking on all columns based on SRMSE:
   1. random-original      - 8.04e-02
   2. DATGAN               - 8.09e-02
   3. CTGAN                - 2.21e-01
   4. CTABGAN              - 2.69e-01

Ranking on continuous columns based on SRMSE:
   1. random-original      - 4.25e-02
   2. DATGAN               - 1.25e-01
   3. CTGAN                - 2.17e-01
   4. CTABGAN              - 2.71e-01

Ranking on categorical columns based on SRMSE:
   1. DATGAN               - 3.97e-02
   2. random-original      - 1.16e-01
   3. CTGAN                - 2.25e-01
   4. CTABGAN              - 2.67e-01



# Stats per couple columns

In [16]:
combs = []

for k in combinations(df_orig.columns, 2):
    combs.append(k[0] + '::' + k[1])
    
print('There are {} combinations!'.format(len(combs)))

There are 351 combinations!


In [17]:
filepath = './notebooks/tests/stats/'
filename = 'couple_columns.pickle'.format(dataset)

all_stats = {}

try:
    all_stats = pickle.load(open(filepath + filename, 'rb'))
    print('Found previous pickel file, using that')
except:
    print('No previous results found, starting fresh')
    try:
        os.makedirs(filepath)
    except:
        pass

No previous results found, starting fresh


In [18]:
# Go through each model
for i, m in enumerate(models):
    
    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))

    else:
    
        print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

        all_stats[m] = {}

        # Load all dataframes for current model
        df = pd.read_csv(files_[m])

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each columns
        for c in combs:

            agg_vars = c.split('::')

            real = df_orig.copy()
            real['count'] = 1
            real = real.groupby(agg_vars, observed=True).count()
            real /= len(df_orig)

            synth = df.copy()
            synth['count'] = 1
            synth = synth.groupby(agg_vars, observed=True).count()
            synth /= len(df)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            all_stats[m][c] = sts
            
        pickle.dump(all_stats, open(filepath + filename, 'wb'))

print("\033[1mFINISHED!\033[0m")

Preparing stats for model [1mCTABGAN[0m (1/3)
Preparing stats for model [1mCTGAN[0m (2/3)
Preparing stats for model [1mDATGAN[0m (3/3)
[1mFINISHED![0m


In [19]:
stats_orig = {}

train = df_orig.sample(int(len(df_orig) * 0.5))
train.index = range(len(train))
test = df_orig[~df_orig.index.isin(train.index)]
test.index = range(len(test))

# Go through each columns
for c in combs:

    agg_vars = c.split('::')

    real = train.copy()
    real['count'] = 1
    real = real.groupby(agg_vars, observed=True).count()
    real /= len(df_orig)

    synth = test.copy()
    synth['count'] = 1
    synth = synth.groupby(agg_vars, observed=True).count()
    synth /= len(df)

    real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer', indicator=True)
    real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

    sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])
    
    stats_orig[c] = sts

In [20]:
all_stats[orig_str] = stats_orig

In [21]:
res = {}

for s in stats_str:
    res[s] = {}
                    
for m in all_stats.keys():

    for s in stats_str:

        tmp = []
        for c in combs:
            tmp.append(all_stats[m][c][s])

        res[s][m] = np.mean(tmp)

In [22]:
for s in ['srmse']:#stats_str:
    print('Ranking on all coupled combinations based on {}:'.format(s.upper()))

    if s in ['r2', 'corr']:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])[::-1]}
    else:
        sorted_dct = {k: v for k, v in sorted(res[s].items(), key=lambda item: item[1])}

    for i, item in enumerate(sorted_dct):
        print('  {:>2}. {:<20} - {:.2e}'.format(i+1, item, sorted_dct[item]))
    print()

Ranking on all coupled combinations based on SRMSE:
   1. random-original      - 1.98e-01
   2. DATGAN               - 2.13e-01
   3. CTGAN                - 5.28e-01
   4. CTABGAN              - 6.37e-01

