In [1]:
import os
# Move the working directory two levels up; the relative paths used in the
# cells below are resolved from that location.
# NOTE(review): this is relative to the current directory, so re-running the
# cell without restarting the kernel climbs another two levels.
os.chdir('../..')

In [2]:
import numpy as np
import pandas as pd
import json

import matplotlib.pyplot as plt

import sdv.metrics.tabular as sdvm

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
# Name of the dataset to evaluate; it selects the sub-directory that is read
# under ../data/ and ../synth_data/.
dataset = 'Chicago'

In [4]:
# Directory holding the synthetic CSVs for the selected dataset.
synth_dir = '../synth_data/' + dataset

# labels[0] names the real data; each remaining label is the base name of a
# CSV stored as <synth_dir>/<label>.csv.
labels = ['original', 'WGAN_WI_SIMCONT', 'LINEAR', 'TGAN']

# files[k] is the CSV path that belongs to labels[k]: the real data first,
# followed by one file per remaining label, all built from synth_dir so the
# directory is spelled out in exactly one place.
files = ['../data/' + dataset + '/data.csv']
files += [synth_dir + '/' + label + '.csv' for label in labels[1:]]

In [5]:
# Show the assembled list of CSV paths.
files

['../data/Chicago/data.csv',
 '../synth_data/Chicago/WGAN_WI_SIMCONT.csv',
 '../synth_data/Chicago/LINEAR.csv',
 '../synth_data/Chicago/TGAN.csv']

In [6]:
# Read every CSV listed in `files` into its own DataFrame, keeping the order
# of `files` so that dfs[k] corresponds to files[k].
dfs = list(map(pd.read_csv, files))

In [7]:
# NOTE(review): prints a single empty line and nothing else; this looks like a
# leftover scratch cell — confirm whether it can be removed.
print()




In [8]:
def do_stuff(real, synth, target='choice'):
    """Print SDV tabular metrics obtained from ``compute(real, synth)``.

    Three groups of metrics from ``sdv.metrics.tabular`` (imported as
    ``sdvm``) are evaluated one after another, and every score is printed
    with three decimals as soon as it has been computed:

    * Statistical metrics: ``CSTest``, ``KSTest``
    * Likelihood metrics: ``BNLikelihood``, ``BNLogLikelihood``
    * ML efficacy metrics: ``MulticlassDecisionTreeClassifier``,
      ``MulticlassMLPClassifier``

    Args:
        real: Table passed as the first argument of every ``compute`` call.
        synth: Table passed as the second argument of every ``compute`` call.
        target: Column name forwarded as ``target=`` to the two ML efficacy
            metrics. Defaults to ``'choice'``.

    Returns:
        None. The report is written to standard output and is followed by one
        empty line.
    """
    # (section title, [(printed label, metric class name in sdvm, extra kwargs)]).
    # Classes are looked up by name right before they are used, so scores that
    # were already computed still get printed if a later metric is unavailable.
    sections = [
        ('Statistical metrics', [
            ('CS test', 'CSTest', {}),
            ('KS test', 'KSTest', {}),
        ]),
        ('Likelihood metrics', [
            ('BN Likelihood', 'BNLikelihood', {}),
            ('BN Loglikelihood', 'BNLogLikelihood', {}),
            # GMLogLikelihood ('GM Loglikelihood') is currently disabled.
        ]),
        ('ML efficacy metrics', [
            ('Decision Tree', 'MulticlassDecisionTreeClassifier', {'target': target}),
            ('MLP', 'MulticlassMLPClassifier', {'target': target}),
        ]),
    ]

    for title, metrics in sections:
        print('  {}:'.format(title))
        for label, metric_name, extra in metrics:
            score = getattr(sdvm, metric_name).compute(real, synth, **extra)
            print('    - {}: {:.3f}'.format(label, score))

    # Empty line separating consecutive reports.
    print('')
    

In [9]:
print('Results from real data only:')

# Seed for the train/test split, so the real-vs-real baseline uses the same
# rows on every run.
# NOTE(review): the metrics themselves may still be randomised internally, so
# individual scores can keep varying between runs — confirm.
SPLIT_SEED = 42

# Baseline: score a 25% hold-out of the real data against the remaining 75%.
real_df = dfs[0]
train = real_df.sample(int(len(real_df) * 0.75), random_state=SPLIT_SEED)
test = real_df[~real_df.index.isin(train.index)]

do_stuff(train, test)

# Score every other loaded dataset against the full real dataset; labels[k]
# and dfs[k] describe the same file, and index 0 is the real data itself.
for label, synth_df in zip(labels[1:], dfs[1:]):
    print('Dataset: {}'.format(label))

    do_stuff(real_df, synth_df)

Results from real data only:
  Statistical metrics:
    - CS test: 1.000
    - KS test: 0.995
  Likelihood metrics:
    - BN Likelihood: 0.029
    - BN Loglikelihood: -4.155
  ML efficacy metrics:
    - Decision Tree: 0.535




    - MLP: 0.532

Dataset: WGAN_WI_SIMCONT
  Statistical metrics:
    - CS test: 1.000
    - KS test: 0.988
  Likelihood metrics:
    - BN Likelihood: 0.030
    - BN Loglikelihood: -4.203
  ML efficacy metrics:
    - Decision Tree: 0.419




    - MLP: 0.499

Dataset: LINEAR
  Statistical metrics:
    - CS test: 1.000
    - KS test: 0.988
  Likelihood metrics:
    - BN Likelihood: 0.029
    - BN Loglikelihood: -4.297
  ML efficacy metrics:
    - Decision Tree: 0.411




    - MLP: 0.489

Dataset: TGAN
  Statistical metrics:
    - CS test: 1.000
    - KS test: 0.975
  Likelihood metrics:
    - BN Likelihood: 0.030
    - BN Loglikelihood: -4.211
  ML efficacy metrics:
    - Decision Tree: 0.408




    - MLP: 0.495

