In [1]:
import os
# Move the working directory two levels up so that the relative paths used in
# the cells below resolve from there.
# NOTE: the target is relative to the *current* directory, so re-running this
# cell without restarting the kernel moves up two more levels.
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
# The abstract base classes live in collections.abc; importing Iterable from
# `collections` directly is deprecated and fails on Python 3.10+.
from collections import defaultdict
from collections.abc import Iterable
import random
from os import listdir
from os.path import isfile, join
import json
import pickle

from itertools import combinations

import matplotlib.pyplot as plt

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

  This is separate from the ipykernel package so we can avoid doing imports until


# Functions

In [3]:
def is_a_DATGAN(name):
    """
    Tell whether a model name should be treated as a DATGAN model.

    A name is NOT a DATGAN as soon as it contains one of the tags listed
    below (substring match); every other name is.
    """
    excluded_tags = ('TGAN', 'CTGAN', 'CTABGAN', 'TVAE', 'FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION')
    for tag in excluded_tags:
        if tag in name:
            return False
    return True

def compute_stats(freq_list_orig, freq_list_synth):
    """
    Different statistics computed on the frequency list

    Parameters
    ----------
    freq_list_orig : array-like of numbers
        Reference frequencies.
    freq_list_synth : array-like of numbers
        Frequencies to compare, aligned element-wise with `freq_list_orig`.

    Returns
    -------
    dict
        'mae'   : mean absolute error,
        'rmse'  : root mean squared error,
        'r2'    : coefficient of determination w.r.t. `freq_list_orig`,
        'srmse' : rmse divided by the mean of `freq_list_orig`,
        'corr'  : Pearson correlation (0.0 when it is undefined).
    """
    freq_list_orig = np.asarray(freq_list_orig, dtype=float)
    freq_list_synth = np.asarray(freq_list_synth, dtype=float)
    diff = freq_list_orig - freq_list_synth

    # Pearson correlation. It is undefined with fewer than two points or when
    # one of the inputs is constant; report 0.0 in those cases instead of NaN
    # and do not emit numpy warnings for them.
    if len(freq_list_orig) < 2:
        corr = 0.0
    else:
        with np.errstate(divide='ignore', invalid='ignore'):
            corr = np.corrcoef(freq_list_orig, freq_list_synth)[0, 1]
        if np.isnan(corr):
            corr = 0.0

    # MAE
    mae = np.absolute(diff).mean()
    # RMSE
    rmse = np.linalg.norm(diff) / np.sqrt(len(freq_list_orig))
    # SRMSE
    freq_list_orig_avg = freq_list_orig.mean()
    srmse = rmse / freq_list_orig_avg
    # r-square
    u = np.sum(diff**2)
    v = np.sum((freq_list_orig - freq_list_orig_avg)**2)
    if v == 0:
        # Constant reference (e.g. a single category): the ratio u / v is
        # undefined. Report a perfect score for an exact match and 0.0
        # otherwise, rather than NaN / -inf which would poison later averages.
        r2 = 1.0 if u == 0 else 0.0
    else:
        r2 = 1.0 - u / v

    stat = {'mae': mae, 'rmse': rmse, 'r2': r2, 'srmse': srmse, 'corr': corr}

    return stat

# Get all models and associated files

In [4]:
dataset = 'adult'
n_models = 5
n_data = 5

# Models for testing all DATGANS
if 'adult' in dataset:
    models = ['CTGAN', 'TGAN', 'TVAE', 'CTABGAN', 'WGGP_WI_NO', 'WGAN_WI_NO', 'LINEAR']
else:
    models = ['CTGAN', 'TGAN', 'TVAE', 'CTABGAN']
    # Every combination of the three name parts, joined with underscores
    models.extend(
        '{}_{}_{}'.format(part_a, part_b, part_c)
        for part_a in ['WGAN', 'SGAN', 'WGGP']
        for part_b in ['WI', 'OR', 'WO']
        for part_c in ['NO', 'BO', 'OD', 'OC']
    )

# Models for testing different DAGs
if 'DAG' in dataset:
    models = ['FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION']

models = sorted(models)

# (model index, data index) pairs, both 1-based, in model-major order
runs = [(i + 1, j + 1) for i in range(n_models) for j in range(n_data)]

# Map each model name to the list of csv file names expected for it
files_ = {}

for m in models:
    if is_a_DATGAN(m):
        # DATGAN file names carry the model index between the 2nd and 3rd name part
        parts = m.split('_')
        files_[m] = ['{}_{}_{:0>2}_{}_{:0>2}.csv'.format(parts[0], parts[1], i, parts[2], j) for i, j in runs]
    else:
        files_[m] = ['{}_{:0>2}_{:0>2}.csv'.format(m, i, j) for i, j in runs]


input_folder = '../synth_data/{}/'.format(dataset)

In [5]:
# The original data is read from the folder named after the part of `dataset`
# that precedes the first underscore.
df_orig = pd.read_csv('../data/{}/data.csv'.format(dataset.split('_')[0]))

In [6]:
# Columns that are discretized into bins before frequencies are computed.
if 'Chicago' in dataset:
    continuous_cols = ['distance', 'age', 'departure_time']
elif 'LPMC' in dataset:
    continuous_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'dur_driving', 'cost_transit', 'cost_driving_fuel', 'driving_traffic_percent']
elif 'adult' in dataset:
    continuous_cols = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
else:
    # Fail here with a clear message instead of a NameError (or, worse, a
    # stale value from a previous run) in the cells that use continuous_cols.
    raise ValueError("No continuous columns defined for dataset '{}'".format(dataset))

In [7]:
# Bin edges per continuous column, derived from the original data
bins_cont = {}

for col in continuous_cols:
    # 10 equal-width bins (pd.qcut with q=10 would give quantile bins instead)
    _, edges = pd.cut(df_orig[col], bins=10, retbins=True)
    # Open both ends so that any value falls into a bin
    edges[0], edges[-1] = -np.inf, np.inf
    bins_cont[col] = edges
    # Replace the raw values by their bin (in place on df_orig)
    df_orig[col] = pd.cut(df_orig[col], bins=edges)

In [8]:
# Key under which the statistics of the random splits of the original data are stored
orig_str = 'random-original'
# Names of the statistics returned by compute_stats
stats_str = 'mae rmse r2 srmse corr'.split()

In [9]:
# Create the results folder; exist_ok only tolerates "already exists", so other
# failures (e.g. permissions) are no longer silently swallowed.
os.makedirs('./notebooks/results/{}'.format(dataset), exist_ok=True)

# Stats per individual column

In [10]:
filepath = './notebooks/results/{}/'.format(dataset)
filename = 'single_columns.pickle'

# Cached statistics, keyed by model name
all_stats = {}

try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # written by this notebook.
    with open(filepath + filename, 'rb') as cache_file:
        all_stats = pickle.load(cache_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache: start from scratch
    print('No previous results found, starting fresh')
    os.makedirs(filepath, exist_ok=True)

Found previous pickel file, using that


In [11]:
# Frequencies of the original data depend only on the column, not on the model
# or the synthetic file, so they are computed once and reused.
real_freqs = {}

# Go through each model
for i, m in enumerate(models):

    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))
        continue

    print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

    # Collected locally and only registered in all_stats once every file has
    # been processed, so a failure cannot leave a partial entry that would be
    # reported as "already exists" on the next run.
    model_stats = {c: {s: [] for s in stats_str} for c in df_orig.columns}

    # Go through all files generated for the current model
    for f in files_[m]:
        df = pd.read_csv(input_folder + f)

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each columns
        for c in df_orig.columns:

            agg_vars = [c]

            if c not in real_freqs:
                real = df_orig[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
                real_freqs[c] = real / len(df_orig)
            real = real_freqs[c]

            # Only the grouping column is needed to count rows per value
            synth = df[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(df)

            # Outer join: values seen on one side only get a frequency of 0 on the other
            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                model_stats[c][s].append(sts[s])

    all_stats[m] = model_stats

    # Save after each model so finished work survives an interruption
    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

print("\033[1mFINISHED!\033[0m")

Results for model [1mCTABGAN[0m (1/7) already exists!
Results for model [1mCTGAN[0m (2/7) already exists!
Preparing stats for model [1mLINEAR[0m (3/7)
Results for model [1mTGAN[0m (4/7) already exists!
Results for model [1mTVAE[0m (5/7) already exists!
Results for model [1mWGAN_WI_NO[0m (6/7) already exists!
Results for model [1mWGGP_WI_NO[0m (7/7) already exists!
[1mFINISHED![0m


In [12]:
# Baseline: compare two random halves of the original data with each other.
# NOTE: the split is unseeded, so values differ between runs unless cached.
# NOTE: an existing 'random-original' entry in the cache is not recomputed.
if orig_str not in all_stats:

    stats_orig = {c: {s: [] for s in stats_str} for c in df_orig.columns}

    for i in range(n_models*n_data):

        # Take the complement against df_orig's own index BEFORE resetting any
        # index; resetting train's index first would make isin() compare with
        # the positions 0..len(train)-1 instead of the sampled rows.
        train = df_orig.sample(int(len(df_orig) * 0.5))
        test = df_orig[~df_orig.index.isin(train.index)]
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)

        # Go through each columns
        for c in df_orig.columns:

            agg_vars = [c]

            # Each half is normalized by its own size so both sides are proportions
            real = train[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            real /= len(train)

            synth = test[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(test)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                stats_orig[c][s].append(sts[s])

    all_stats[orig_str] = stats_orig

    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

In [13]:
# res[test][stat][model] -> one value per run: the statistic averaged over the
# columns belonging to the group `test`
res = {}

column_groups = [
    ('all', df_orig.columns),
    ('cont', continuous_cols),
    ('cat', set(df_orig.columns) - set(continuous_cols)),
]

for test, cols in column_groups:

    res[test] = {s: {} for s in stats_str}

    for m in all_stats.keys():
        for s in stats_str:
            res[test][s][m] = [
                np.mean([all_stats[m][c][s][i] for c in cols])
                for i in range(n_models*n_data)
            ]

In [14]:
# avg[test][stat][model] -> mean and standard deviation over the runs
avg = {
    test: {
        s: {
            m: {'mean': np.mean(res[test][s][m]), 'std': np.std(res[test][s][m])}
            for m in all_stats.keys()
        }
        for s in stats_str
    }
    for test in ['all', 'cont', 'cat']
}

In [15]:
group_labels = {
    'all': 'on all columns',
    'cont': 'on continuous columns',
    'cat': 'on categorical columns',
}

for test in ['all', 'cont', 'cat']:

    str_ = group_labels[test]

    # Only SRMSE is reported; iterate over stats_str to rank on every statistic
    for s in ['srmse']:
        print('Ranking {} based on {}:'.format(str_, s.upper()))

        # Ascending by mean; r2 and corr are "higher is better", so flip them
        ranked = sorted(avg[test][s].items(), key=lambda item: item[1]['mean'])
        if s in ['r2', 'corr']:
            ranked.reverse()

        for position, (model_name, summary) in enumerate(ranked, start=1):
            print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(position, model_name, summary['mean'], summary['std']))
        print()


Ranking on all columns based on SRMSE:
   1. random-original - 1.13e-02 ± 1.18e-03
   2. LINEAR          - 4.01e-02 ± 4.84e-03
   3. WGAN_WI_NO      - 4.12e-02 ± 1.79e-03
   4. TGAN            - 7.48e-02 ± 8.47e-03
   5. WGGP_WI_NO      - 1.11e-01 ± 5.42e-02
   6. TVAE            - 1.18e-01 ± 2.40e-02
   7. CTGAN           - 2.45e-01 ± 1.78e-02
   8. CTABGAN         - 2.79e-01 ± 3.79e-02

Ranking on continuous columns based on SRMSE:
   1. random-original - 8.63e-03 ± 1.73e-03
   2. LINEAR          - 6.60e-02 ± 8.34e-03
   3. WGAN_WI_NO      - 7.13e-02 ± 9.32e-03
   4. TGAN            - 9.62e-02 ± 1.28e-02
   5. TVAE            - 1.04e-01 ± 2.42e-02
   6. CTGAN           - 1.66e-01 ± 3.10e-02
   7. WGGP_WI_NO      - 1.85e-01 ± 7.77e-02
   8. CTABGAN         - 2.31e-01 ± 4.49e-02

Ranking on categorical columns based on SRMSE:
   1. random-original - 1.23e-02 ± 1.44e-03
   2. WGAN_WI_NO      - 2.91e-02 ± 2.70e-03
   3. LINEAR          - 2.97e-02 ± 6.40e-03
   4. TGAN            - 6.62e-

# Stats per pair of columns

In [16]:
# Every unordered pair of columns, encoded as 'colA::colB'
combs = ['::'.join(pair) for pair in combinations(df_orig.columns, 2)]

In [17]:
filepath = './notebooks/results/{}/'.format(dataset)
filename = 'couple_combinations.pickle'

# Cached statistics, keyed by model name
all_stats = {}

try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # written by this notebook.
    with open(filepath + filename, 'rb') as cache_file:
        all_stats = pickle.load(cache_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache: start from scratch
    print('No previous results found, starting fresh')
    os.makedirs(filepath, exist_ok=True)

Found previous pickel file, using that


In [18]:
# Frequencies of the original data depend only on the column combination, not
# on the model or the synthetic file, so they are computed once and reused.
real_freqs = {}

# Go through each model
for i, m in enumerate(models):

    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))
        continue

    print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

    # Collected locally and only registered in all_stats once every file has
    # been processed, so a failure cannot leave a partial entry that would be
    # reported as "already exists" on the next run.
    model_stats = {c: {s: [] for s in stats_str} for c in combs}

    # Go through all files generated for the current model
    for f in files_[m]:
        df = pd.read_csv(input_folder + f)

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each combination of columns
        for c in combs:

            agg_vars = c.split('::')

            if c not in real_freqs:
                real = df_orig[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
                real_freqs[c] = real / len(df_orig)
            real = real_freqs[c]

            # Only the grouping columns are needed to count rows per combination
            synth = df[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(df)

            # Outer join: combinations seen on one side only get a frequency of 0 on the other
            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                model_stats[c][s].append(sts[s])

    all_stats[m] = model_stats

    # Save after each model so finished work survives an interruption
    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

print("\033[1mFINISHED!\033[0m")

Results for model [1mCTABGAN[0m (1/7) already exists!
Results for model [1mCTGAN[0m (2/7) already exists!
Preparing stats for model [1mLINEAR[0m (3/7)
Results for model [1mTGAN[0m (4/7) already exists!
Results for model [1mTVAE[0m (5/7) already exists!
Results for model [1mWGAN_WI_NO[0m (6/7) already exists!
Results for model [1mWGGP_WI_NO[0m (7/7) already exists!
[1mFINISHED![0m


In [19]:
# Baseline: compare two random halves of the original data with each other.
# NOTE: the split is unseeded, so values differ between runs unless cached.
# NOTE: an existing 'random-original' entry in the cache is not recomputed.
if orig_str not in all_stats:
    stats_orig = {c: {s: [] for s in stats_str} for c in combs}

    for i in range(n_models*n_data):

        # Take the complement against df_orig's own index BEFORE resetting any
        # index; resetting train's index first would make isin() compare with
        # the positions 0..len(train)-1 instead of the sampled rows.
        train = df_orig.sample(int(len(df_orig) * 0.5))
        test = df_orig[~df_orig.index.isin(train.index)]
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)

        # Go through each combination of columns
        for c in combs:

            agg_vars = c.split('::')

            # Each half is normalized by its own size so both sides are proportions
            real = train[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            real /= len(train)

            synth = test[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(test)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                stats_orig[c][s].append(sts[s])

    all_stats[orig_str] = stats_orig

    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

In [20]:
# res[stat][model] -> one value per run: the statistic averaged over all combinations
res = {s: {} for s in stats_str}

for m in all_stats.keys():
    for s in stats_str:
        res[s][m] = [
            np.mean([all_stats[m][c][s][i] for c in combs])
            for i in range(n_models*n_data)
        ]

In [21]:
# avg[stat][model] -> mean and standard deviation over the runs
avg = {
    s: {
        m: {'mean': np.mean(res[s][m]), 'std': np.std(res[s][m])}
        for m in all_stats.keys()
    }
    for s in stats_str
}

In [22]:
# Only SRMSE is reported; iterate over stats_str to rank on every statistic
for s in ['srmse']:
    print('Ranking on all coupled combinations based on {}:'.format(s.upper()))

    # Ascending by mean; r2 and corr are "higher is better", so flip them
    ranked = sorted(avg[s].items(), key=lambda item: item[1]['mean'])
    if s in ['r2', 'corr']:
        ranked.reverse()

    for position, (model_name, summary) in enumerate(ranked, start=1):
        print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(position, model_name, summary['mean'], summary['std']))
    print()


Ranking on all coupled combinations based on SRMSE:
   1. random-original - 4.35e-02 ± 3.22e-03
   2. LINEAR          - 1.56e-01 ± 1.25e-02
   3. WGAN_WI_NO      - 1.69e-01 ± 8.07e-03
   4. TGAN            - 2.31e-01 ± 2.05e-02
   5. WGGP_WI_NO      - 3.45e-01 ± 1.53e-01
   6. TVAE            - 3.78e-01 ± 8.26e-02
   7. CTGAN           - 7.44e-01 ± 5.53e-02
   8. CTABGAN         - 9.00e-01 ± 9.30e-02



# Stats per triple of columns

In [23]:
# Every unordered triple of columns, encoded as 'colA::colB::colC'
combs = ['::'.join(triple) for triple in combinations(df_orig.columns, 3)]

In [24]:
filepath = './notebooks/results/{}/'.format(dataset)
filename = 'trouple_combinations.pickle'

# Cached statistics, keyed by model name
all_stats = {}

try:
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # written by this notebook.
    with open(filepath + filename, 'rb') as cache_file:
        all_stats = pickle.load(cache_file)
    print('Found previous pickel file, using that')
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing, unreadable or truncated cache: start from scratch
    print('No previous results found, starting fresh')
    os.makedirs(filepath, exist_ok=True)

Found previous pickel file, using that


In [25]:
# Frequencies of the original data depend only on the column combination, not
# on the model or the synthetic file, so they are computed once and reused.
real_freqs = {}

# Go through each model
for i, m in enumerate(models):

    if m in all_stats:
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists!".format(m, i+1, len(models)))
        continue

    print("Preparing stats for model \033[1m{}\033[0m ({}/{})".format(m, i+1, len(models)))

    # Collected locally and only registered in all_stats once every file has
    # been processed, so a failure cannot leave a partial entry that would be
    # reported as "already exists" on the next run.
    model_stats = {c: {s: [] for s in stats_str} for c in combs}

    # Go through all files generated for the current model
    for f in files_[m]:
        df = pd.read_csv(input_folder + f)

        # Discretize continuous columns
        for c in continuous_cols:
            df[c] = pd.cut(df[c], bins=bins_cont[c])

        # Go through each combination of columns
        for c in combs:

            agg_vars = c.split('::')

            if c not in real_freqs:
                real = df_orig[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
                real_freqs[c] = real / len(df_orig)
            real = real_freqs[c]

            # Only the grouping columns are needed to count rows per combination
            synth = df[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(df)

            # Outer join: combinations seen on one side only get a frequency of 0 on the other
            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                model_stats[c][s].append(sts[s])

    all_stats[m] = model_stats

    # Save after each model so finished work survives an interruption
    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

print("\033[1mFINISHED!\033[0m")

Results for model [1mCTABGAN[0m (1/7) already exists!
Results for model [1mCTGAN[0m (2/7) already exists!
Preparing stats for model [1mLINEAR[0m (3/7)
Results for model [1mTGAN[0m (4/7) already exists!
Results for model [1mTVAE[0m (5/7) already exists!
Results for model [1mWGAN_WI_NO[0m (6/7) already exists!
Results for model [1mWGGP_WI_NO[0m (7/7) already exists!
[1mFINISHED![0m


In [26]:
# Baseline: compare two random halves of the original data with each other.
# NOTE: the split is unseeded, so values differ between runs unless cached.
# NOTE: an existing 'random-original' entry in the cache is not recomputed.
if orig_str not in all_stats:
    stats_orig = {c: {s: [] for s in stats_str} for c in combs}

    for i in range(n_models*n_data):

        # Take the complement against df_orig's own index BEFORE resetting any
        # index; resetting train's index first would make isin() compare with
        # the positions 0..len(train)-1 instead of the sampled rows.
        train = df_orig.sample(int(len(df_orig) * 0.5))
        test = df_orig[~df_orig.index.isin(train.index)]
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)

        # Go through each combination of columns
        for c in combs:

            agg_vars = c.split('::')

            # Each half is normalized by its own size so both sides are proportions
            real = train[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            real /= len(train)

            synth = test[agg_vars].assign(count=1).groupby(agg_vars, observed=True).count()
            synth /= len(test)

            real_and_sampled = pd.merge(real, synth, suffixes=['_real', '_sampled'], on=agg_vars, how='outer')
            real_and_sampled = real_and_sampled[['count_real', 'count_sampled']].fillna(0)

            sts = compute_stats(real_and_sampled['count_real'], real_and_sampled['count_sampled'])

            for s in sts:
                stats_orig[c][s].append(sts[s])

    all_stats[orig_str] = stats_orig

    with open(filepath + filename, 'wb') as cache_file:
        pickle.dump(all_stats, cache_file)

In [27]:
# res[stat][model] -> one value per run: the statistic averaged over all combinations
res = {s: {} for s in stats_str}

for m in all_stats.keys():
    for s in stats_str:
        res[s][m] = [
            np.mean([all_stats[m][c][s][i] for c in combs])
            for i in range(n_models*n_data)
        ]

In [28]:
# avg[stat][model] -> mean and standard deviation over the runs
avg = {
    s: {
        m: {'mean': np.mean(res[s][m]), 'std': np.std(res[s][m])}
        for m in all_stats.keys()
    }
    for s in stats_str
}

In [29]:
# Only SRMSE is reported; iterate over stats_str to rank on every statistic
for s in ['srmse']:
    print('Ranking on all triple combinations based on {}:'.format(s.upper()))

    # Ascending by mean; r2 and corr are "higher is better", so flip them
    ranked = sorted(avg[s].items(), key=lambda item: item[1]['mean'])
    if s in ['r2', 'corr']:
        ranked.reverse()

    for position, (model_name, summary) in enumerate(ranked, start=1):
        print('  {:>2}. {:<15} - {:.2e} ± {:.2e}'.format(position, model_name, summary['mean'], summary['std']))
    print()


Ranking on all triple combinations based on SRMSE:
   1. random-original - 9.86e-02 ± 4.32e-03
   2. LINEAR          - 3.76e-01 ± 2.40e-02
   3. WGAN_WI_NO      - 4.18e-01 ± 1.79e-02
   4. TGAN            - 4.55e-01 ± 3.26e-02
   5. WGGP_WI_NO      - 7.11e-01 ± 2.89e-01
   6. TVAE            - 7.70e-01 ± 1.59e-01
   7. CTGAN           - 1.46e+00 ± 1.17e-01
   8. CTABGAN         - 1.82e+00 ± 1.59e-01

