# Initial investigation and exploration of tabular GAN data evaluation techniques


### First, install the Sythetic Data Vault Library
which include data evaluation code that we can use for our project.

**References:**
1. https://hub.gke2.mybinder.org/user/sdv-dev-sdv-uudgqste/notebooks/tutorials/evaluation/Evaluating_Synthetic_Data.ipynb
2. https://pypi.org/project/sdv/

In [1]:
#pip install sdv


In [2]:
#pip install pomegranate

In [3]:
from sdv.evaluation import evaluate
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# do not show warnings in jupyter notebook
import warnings
warnings.filterwarnings('ignore')

In [4]:
#df_real_allcols = pd.read_csv('../data/data_3D_pasthistories.csv')
#df_syn_CTGAN = pd.read_csv('../data/CTGAN_patientHist.csv')

In [5]:
# removing the "data" column which is not needed
#df_syn_CTGAN = df_syn_CTGAN.drop(columns=['data'])
#df_real = df_real_allcols[df_syn_CTGAN.columns]

In [6]:
#evaluate(df_syn_CTGAN, df_real, metrics=['CSTest', 'KSTest','LogisticDetection','DiscreteKLDivergence','ContinuousKLDivergence'], aggregate=False)

In [7]:
#evaluate(df_syn_CTGAN, df_real, metrics=['CSTest', 'KSTest','LogisticDetection','DiscreteKLDivergence','ContinuousKLDivergence'], aggregate=True)

sdv.metrics.tabular.**GMLogLikelihood**: This metric fits multiple GaussianMixture models to the real data and then evaluates the average log likelihood of the synthetic data on them.

In [8]:
#from sdv.metrics.tabular import GMLogLikelihood
#raw_GMLL = GMLogLikelihood.compute(df_real, df_syn_CTGAN)
#print("GaussianMixture Log Likelihood for CTGAN generated data: ")
#print(raw_GMLL)
#GMLogLikelihood.normalize(raw_GMLL)

In [9]:
#from sklearn.model_selection import train_test_split

#real_train, real_test = train_test_split(df_real, test_size=0.2, random_state=42)
#raw_GMLL = GMLogLikelihood.compute(real_train, real_test)
#print("GaussianMixture Log Likelihood for a train test split of real data: ")
#print(raw_GMLL)
#GMLogLikelihood.normalize(raw_GMLL)

## Evaluate our GAN generation of age_unittype to the real data

These are evaluations of age filetered < 90

In [10]:
# get the real data
ages_unit_np = np.load("../data/eICU_age_unittype.npy", allow_pickle=True)
print('length: ', len(ages_unit_np))
print(ages_unit_np[0:5])

ages_np = np.asarray(ages_unit_np[:,0].flatten().tolist()).flatten()
print('ages length: ', len(ages_np))
#print(ages_np[0:5])

unit_np = np.asarray(ages_unit_np[:,1].flatten().tolist()).flatten()
print('unit length: ', len(unit_np))
#print(ethnicity_np[0:5])

df_ages = pd.DataFrame(zip(ages_np, unit_np), columns=['age','unit'])
print(df_ages.shape)
print(df_ages.groupby('unit').count())

length:  250
[[(59,) ('CTICU',)]
 [(55,) ('CTICU',)]
 [(72,) ('Cardiac ICU',)]
 [(49,) ('CTICU',)]
 [(49,) ('CTICU',)]]
ages length:  250
unit length:  250
(250, 2)
             age
unit            
CSICU         65
CTICU         52
Cardiac ICU  133


In [11]:
# get the synthetic data
df_ages_ourGAN = pd.read_csv('../data/ourGAN_ages_ageunittype.csv')
print(df_ages.shape)
print(df_ages.groupby('unit').count())

(250, 2)
             age
unit            
CSICU         65
CTICU         52
Cardiac ICU  133


#### Create a train test split of the real data for comparison of evaluation metrics

In [12]:
from sklearn.model_selection import train_test_split

ages_train, ages_test = train_test_split(df_ages, test_size=0.2, random_state=42)

In [13]:
print("Overall evaluation score for ourGAN generated data: ")
print(evaluate(df_ages_ourGAN, df_ages, metrics=['CSTest', 'KSTest','LogisticDetection','DiscreteKLDivergence','ContinuousKLDivergence'], aggregate=True))
print(" ")
print("Individual evaluation scores for ourGAN generated data: ")
evaluate(df_ages_ourGAN, df_ages, metrics=['CSTest', 'KSTest','LogisticDetection'], aggregate=False)

Overall evaluation score for ourGAN generated data: 
0.9773333333333333
 
Individual evaluation scores for ourGAN generated data: 


Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,1.0,1.0,0.0,1.0,MAXIMIZE,
1,KSTest,Inverted Kolmogorov-Smirnov D statistic,0.932,0.932,0.0,1.0,MAXIMIZE,
2,LogisticDetection,LogisticRegression Detection,1.0,1.0,0.0,1.0,MAXIMIZE,


In [14]:
print("Overall evaluation score for real data train / test split: ")
print(evaluate(ages_test, ages_train, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("Individual evaluation scores for real data train / test split: ")
evaluate(ages_test, ages_train, metrics=['CSTest', 'KSTest'], aggregate=False)

Overall evaluation score for real data train / test split: 
0.9182982882937352
 
Individual evaluation scores for real data train / test split: 


Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.986597,0.986597,0.0,1.0,MAXIMIZE,
1,KSTest,Inverted Kolmogorov-Smirnov D statistic,0.85,0.85,0.0,1.0,MAXIMIZE,


In [15]:
from sdv.metrics.tabular import GMLogLikelihood
raw_GMLL = GMLogLikelihood.compute(df_ages, df_ages_ourGAN)
print("GaussianMixture Log Likelihood for CTGAN generated data: ")
print(raw_GMLL)
GMLogLikelihood.normalize(raw_GMLL)

GaussianMixture Log Likelihood for CTGAN generated data: 
-5.750521007685639


0.0031710355212077558

In [16]:


raw_GMLL = GMLogLikelihood.compute(ages_train, ages_test)
print("GaussianMixture Log Likelihood for train/test split of real data: ")
print(raw_GMLL)
GMLogLikelihood.normalize(raw_GMLL)

GaussianMixture Log Likelihood for train/test split of real data: 
-2.9565698206459525


0.04942692042928588

In [17]:
from sdv.metrics.tabular import MulticlassDecisionTreeClassifier, MulticlassMLPClassifier
from sklearn.model_selection import train_test_split

ages_train, ages_test = train_test_split(df_ages, test_size=0.2, random_state=42)

print("=="*20)
print("MulticlassDecisionTreeClassifier Accuracy:")
print("train/test split of real data: ",MulticlassDecisionTreeClassifier.compute(ages_test, ages_train, target='unit'))
print("real vs. our synthetic: ", MulticlassDecisionTreeClassifier.compute(df_ages, df_ages_ourGAN, target='unit'))

print("=="*20)
print("MulticlassMLPClassifier Accuracy:")
print("train/test split of real data: ",MulticlassMLPClassifier.compute(ages_test, ages_train, target='unit'))
print("real vs. our synthetic: ", MulticlassMLPClassifier.compute(df_ages, df_ages_ourGAN, target='unit'))




MulticlassDecisionTreeClassifier Accuracy:
train/test split of real data:  0.5104377104377105
real vs. our synthetic:  0.3885521129571421
MulticlassMLPClassifier Accuracy:
train/test split of real data:  0.2222222222222222
real vs. our synthetic:  0.2733101115654501


### Demonstration of Privacy Evaluations

This isn't really useful for our age data because of limited dimensions and it isn't a problem to generate a duplicate age, in fact it is expected.

In [18]:
from sdv.metrics.tabular import NumericalLR, NumericalMLP, CategoricalEnsemble

print(NumericalLR.compute(
    df_ages,
    df_ages_ourGAN,
    key_fields=['age'],
    sensitive_fields=['age']))

print(NumericalMLP.compute(
    df_ages,
    df_ages_ourGAN,
    key_fields=['age'],
    sensitive_fields=['age']))

print(CategoricalEnsemble.compute(
    df_ages,
    df_ages_ourGAN,
    key_fields=['unit'],
    sensitive_fields=['unit']))

0.0
2.2936836058265627e-05
No attackers specified.


# Evaluate all age unittype

In [25]:
# get the CTGAN synthetic data
df_ages_CTGAN = pd.read_csv('../data/CTGAN_age_unittype.csv')
# drop the data column from df_ages_CTGAN
df_ages_CTGAN = df_ages_CTGAN.drop(columns=['data'])

print(df_ages_CTGAN.shape)
print(df_ages_CTGAN.groupby('unit').count())

(250, 2)
             age
unit            
CSICU         67
CTICU         56
Cardiac ICU  127


In [20]:
print("=="*20)
print("MulticlassDecisionTreeClassifier Accuracy:")
print("train/test split of real data: ",np.round(MulticlassDecisionTreeClassifier.compute(ages_test, ages_train, target='unit'),3))
print("real vs. our synthetic: ", np.round(MulticlassDecisionTreeClassifier.compute(df_ages, df_ages_ourGAN, target='unit'),3))
print("real vs. CTGAN synthetic: ", np.round(MulticlassDecisionTreeClassifier.compute(df_ages, df_ages_CTGAN, target='unit'),3))

print("=="*20)
print("MulticlassMLPClassifier Accuracy:")
print("train/test split of real data: ",np.round(MulticlassMLPClassifier.compute(ages_test, ages_train, target='unit'),3))
print("real vs. our synthetic: ", np.round(MulticlassMLPClassifier.compute(df_ages, df_ages_ourGAN, target='unit'),3))
print("real vs. CTGAN synthetic: ", np.round(MulticlassMLPClassifier.compute(df_ages, df_ages_CTGAN, target='unit'),3))




MulticlassDecisionTreeClassifier Accuracy:
train/test split of real data:  0.51
real vs. our synthetic:  0.389
real vs. CTGAN synthetic:  0.406
MulticlassMLPClassifier Accuracy:
train/test split of real data:  0.222
real vs. our synthetic:  0.216
real vs. CTGAN synthetic:  0.279


In [26]:
print("AGE UNIT TYPE SCORES:")
print("CSTest and KSTest evaluation score for real data train / test split: ")
print(evaluate(ages_test, ages_train, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("=="*20)
print("CSTest and KSTest evaluation score for ourGAN : ")
print(evaluate(df_ages_ourGAN, df_ages, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("=="*20)
print("CSTest and KSTest evaluation score for CTGAN : ")
print(evaluate(df_ages_CTGAN, df_ages, metrics=['CSTest', 'KSTest'], aggregate=True))



AGE UNIT TYPE SCORES:
CSTest and KSTest evaluation score for real data train / test split: 
0.9182982882937352
 
CSTest and KSTest evaluation score for ourGAN : 
0.966
 
CSTest and KSTest evaluation score for CTGAN : 
0.8913605018459676


# Evaluate all age ethnicity

In [32]:
# real data
print("=="*20)
print("Real age ethnicity data:")
npy_age_eth = np.load('../data/eICU_age_ethnicity.npy', allow_pickle=True)
df_ages_eths = pd.DataFrame(zip(np.asarray(npy_age_eth[:,0].flatten().tolist()).flatten(), np.asarray(npy_age_eth[:,1].flatten().tolist()).flatten()), columns=['age','ethnicity'])
df_ages_eths = df_ages_eths[['age','ethnicity']]
print(df_ages_eths.shape)
print(df_ages_eths.groupby('ethnicity').count())

# train test split the real data
ages_eths_train, ages_eths_test = train_test_split(df_ages_eths, test_size=0.2, random_state=42)

# get the synthetic data
print("=="*20)
print("Our GAN generated data:")
df_ages_eths_ourGAN = pd.read_csv('../data/age_eth_output.csv')
df_ages_eths_ourGAN = df_ages_eths_ourGAN[['age','ethnicity']]
print(df_ages_eths_ourGAN.shape)
print(df_ages_eths_ourGAN.groupby('ethnicity').count())

print("=="*20)
print("Our distributed GAN generated data:")
df_dist_ages_eths_ourGAN = pd.read_csv('../data/dist_age_eth_output.csv')
df_dist_ages_eths_ourGAN = df_dist_ages_eths_ourGAN[['age','ethnicity']]
print(df_dist_ages_eths_ourGAN.shape)
print(df_dist_ages_eths_ourGAN.groupby('ethnicity').count())

print("=="*20)
print("CTGAN generated data:")
df_ages_eths_CTGAN = pd.read_csv('../data/CTGAN_age_ethnicity.csv')
df_ages_eths_CTGAN = df_ages_eths_CTGAN[['age','ethnicity']]
print(df_ages_eths_CTGAN.shape)
print(df_ages_eths_CTGAN.groupby('ethnicity').count())



Real age ethnicity data:
(2253, 2)
                   age
ethnicity             
African American   231
Caucasian         2010
Native American     12
Our GAN generated data:
(4506, 2)
                   age
ethnicity             
African American   462
Caucasian         4020
Native American     24
Our distributed GAN generated data:
(4506, 2)
                   age
ethnicity             
African American   462
Caucasian         4020
Native American     24
CTGAN generated data:
(2252, 2)
                   age
ethnicity             
African American   251
Caucasian         1980
Native American     21


In [34]:
print("Age Ethnicity Machine Learning Efficacy Comparison:")
print("=="*20)
print("MulticlassDecisionTreeClassifier Accuracy:")
print("train/test split of real data: ",np.round(MulticlassDecisionTreeClassifier.compute(ages_eths_test, ages_eths_train, target='ethnicity'),3))
print("real vs. our synthetic: ", np.round(MulticlassDecisionTreeClassifier.compute(df_ages_eths, df_ages_eths_ourGAN, target='ethnicity'),3))
print("real vs. our distributed synthetic: ", np.round(MulticlassDecisionTreeClassifier.compute(df_ages_eths, df_dist_ages_eths_ourGAN, target='ethnicity'),3))
print("real vs. CTGAN synthetic: ", np.round(MulticlassDecisionTreeClassifier.compute(df_ages_eths, df_ages_eths_CTGAN, target='ethnicity'),3))

print("=="*20)
print("MulticlassMLPClassifier Accuracy:")
print("train/test split of real data: ",np.round(MulticlassMLPClassifier.compute(ages_eths_test, ages_eths_train, target='ethnicity'),3))
print("real vs. our synthetic: ", np.round(MulticlassMLPClassifier.compute(df_ages_eths, df_ages_eths_ourGAN, target='ethnicity'),3))
print("real vs. our distributed synthetic: ", np.round(MulticlassMLPClassifier.compute(df_ages_eths, df_dist_ages_eths_ourGAN, target='ethnicity'),3))
print("real vs. CTGAN synthetic: ", np.round(MulticlassMLPClassifier.compute(df_ages_eths, df_ages_eths_CTGAN, target='ethnicity'),3))

Age Ethnicity Machine Learning Efficacy Comparison:
MulticlassDecisionTreeClassifier Accuracy:
train/test split of real data:  0.314
real vs. our synthetic:  0.358
real vs. our distributed synthetic:  0.358
real vs. CTGAN synthetic:  0.334
MulticlassMLPClassifier Accuracy:
train/test split of real data:  0.315
real vs. our synthetic:  0.314
real vs. our distributed synthetic:  0.314
real vs. CTGAN synthetic:  0.314


In [35]:
print("AGE ETHNICITY TYPE SCORES:")
print("CSTest and KSTest evaluation score for real data train / test split: ")
print(evaluate(ages_eths_test, ages_eths_train, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("=="*20)
print("CSTest and KSTest evaluation score for our GAN : ")
print(evaluate(df_ages_eths_ourGAN, df_ages_eths, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("=="*20)
print("CSTest and KSTest evaluation score for our  distributed GAN : ")
print(evaluate(df_dist_ages_eths_ourGAN, df_ages_eths, metrics=['CSTest', 'KSTest'], aggregate=True))

print(" ")
print("=="*20)
print("CSTest and KSTest evaluation score for CTGAN : ")
print(evaluate(df_ages_eths_CTGAN, df_ages_eths, metrics=['CSTest', 'KSTest'], aggregate=True))

AGE ETHNICITY TYPE SCORES:
CSTest and KSTest evaluation score for real data train / test split: 
0.9835240139166048
 
CSTest and KSTest evaluation score for our GAN : 
0.9825787838437638
 
CSTest and KSTest evaluation score for our  distributed GAN : 
0.9826897470039947
 
CSTest and KSTest evaluation score for CTGAN : 
0.9530016893411468
