In [None]:
#Date: Dec 11, 2024
#Author: Sonal Allana
#Purpose: This notebook converts the input dataset into its synthetic version using Synthetic Data Vault (SDV)
#Documentation: https://sdv.dev/
#SmartNoise requires python 3.8 - 3.11

In [None]:
#!pip install sdv
#!pip install backports.tarfile

In [1]:
from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer
from sdv.metadata import SingleTableMetadata
import numpy as np
import pandas as pd
import sdv as sd
import time

In [2]:
#Dataset to synthesize. Options: (1) adult (2) credit (3) compas (4) hospital
dataset_name = "hospital"


In [3]:
#Get preprocessed dataset; '?' entries are read as missing values
fname = f"{dataset_name}_preprocessed.csv"

#Only the first 30000 rows are read for the hospital dataset; every other dataset is read in full
row_limit = 30000 if dataset_name == "hospital" else None
data = pd.read_csv(f"../datasets/{fname}", sep=',', engine='python', na_values='?', nrows=row_limit)


In [4]:
#Set the parameters here
#bMetdataExists: True loads the metadata from '../datasets/<dataset_name>_metadata.json';
#False auto-detects it from the dataframe and applies the per-dataset clean-up
bMetdataExists = True
#syndataType selects the synthesizer. Options: (1) ctgan (2) gausscopula (3) tvae
syndataType = "gausscopula"


In [5]:
#Build the SDV metadata: either load a previously saved copy, or detect it from
#the dataframe and then apply the per-dataset column corrections listed below
if bMetdataExists:
    #If exists then load from metadata file
    metadata = SingleTableMetadata.load_from_json(
        filepath='../datasets/{0}_metadata.json'.format(dataset_name))
else:
    #Auto generate metadata for the table
    #https://docs.sdv.dev/sdv/single-table-data/data-preparation/single-table-metadata-api
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)

    #Clean metadata: columns whose detected type is overridden, keyed by dataset name.
    #A dataset that is not listed gets no overrides.
    categorical_columns = {
        'adult': ['workclass_State-gov'],
    }
    integer_columns = {
        'adult': ['race', 'sex', 'class'],
        'credit': ['AGE', 'SEX', 'default.payment.next.month'],
        'compas': ['race', 'sex', 'juv_misd_count', 'juv_other_count',
                   'is_recid', 'decile_score', 'c_charge_degree_M'],
        'hospital': ['race', 'gender', 'readmitted', 'num_procedures',
                     'number_diagnoses'],
    }

    if dataset_name == "adult":
        #Remove the primary key from the metadata
        #(presumably one inferred by auto-detection - TODO confirm)
        metadata.remove_primary_key()

    #Columns forced to categorical
    for column_name in categorical_columns.get(dataset_name, []):
        metadata.update_column(
            column_name=column_name,
            sdtype='categorical')

    #Columns forced to integer-valued numerical
    for column_name in integer_columns.get(dataset_name, []):
        metadata.update_column(
            column_name=column_name,
            sdtype='numerical',
            computer_representation='Int64')



In [6]:
#Dictionary form of the metadata; printed here to view it and
#passed to report.generate(...) in the diagnostics section
python_dict = metadata.to_dict()
print(python_dict)

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {'race': {'sdtype': 'numerical', 'computer_representation': 'Int64'}, 'gender': {'sdtype': 'numerical', 'computer_representation': 'Int64'}, 'time_in_hospital': {'sdtype': 'numerical'}, 'num_lab_procedures': {'sdtype': 'numerical'}, 'num_procedures': {'sdtype': 'numerical', 'computer_representation': 'Int64'}, 'num_medications': {'sdtype': 'numerical'}, 'number_outpatient': {'sdtype': 'numerical'}, 'number_emergency': {'sdtype': 'numerical'}, 'number_inpatient': {'sdtype': 'numerical'}, 'number_diagnoses': {'sdtype': 'numerical', 'computer_representation': 'Int64'}, 'readmitted': {'sdtype': 'numerical', 'computer_representation': 'Int64'}, 'age_[0-10)': {'sdtype': 'categorical'}, 'age_[10-20)': {'sdtype': 'categorical'}, 'age_[20-30)': {'sdtype': 'categorical'}, 'age_[30-40)': {'sdtype': 'categorical'}, 'age_[40-50)': {'sdtype': 'categorical'}, 'age_[50-60)': {'sdtype': 'categorical'}, 'age_[60-70)': {'sdtype': 'categorical'}, 'ag

In [8]:
#Fit the synthesizer selected by syndataType and report the wall-clock time taken
synthesizer_classes = {
    "ctgan": CTGANSynthesizer,
    "gausscopula": GaussianCopulaSynthesizer,
    "tvae": TVAESynthesizer,
}
if syndataType not in synthesizer_classes:
    #Fail fast: without this check `synthesizer` would stay unassigned (NameError on a
    #fresh kernel) or a stale synthesizer left over from an earlier run would be refitted
    raise ValueError(
        "Unknown syndataType {0!r}; expected one of {1}".format(
            syndataType, sorted(synthesizer_classes)))

start_time = time.time()
synthesizer = synthesizer_classes[syndataType](metadata)
synthesizer.fit(data)
end_time = time.time()
time_elapsed = end_time - start_time
print("Num of rows: ", data.shape[0], ", time: ",time_elapsed)

Num of rows:  30000 , time:  65.06960082054138


In [7]:
#Number of synthetic rows to draw: a fixed 900000 for the hospital dataset,
#otherwise as many rows as the input data
num_rows = 900000 if dataset_name == "hospital" else data.shape[0]
synthetic_data = synthesizer.sample(num_rows)
synthetic_data.head()

NameError: name 'synthesizer' is not defined

In [None]:
#Show the columns of the synthetic dataframe
print(synthetic_data.columns)

In [None]:
#Save metadata that was generated in this run to the same JSON path the load branch reads from
if not bMetdataExists:
    metadata_path = f"../datasets/{dataset_name}_metadata.json"
    metadata.save_to_json(metadata_path)


In [None]:
#Print the synthetic dataframe
print(synthetic_data)


In [None]:
#Write the synthetic data to CSV; index=False so the row index is not written as the first column
output_path = f"../datasets/{dataset_name}_sdv_{syndataType}.csv"
synthetic_data.to_csv(output_path, index=False)


In [None]:
#The following block is used for running diagnostics on synthetic data

In [None]:

from sdmetrics.reports.single_table import DiagnosticReport

#Create the diagnostic report object (report.generate is called on it in a later cell)
report = DiagnosticReport()

In [None]:
#Read the synthetic data CSV back into a pandas dataframe; '?' entries are read as missing values
synthetic_path = f"../datasets/{dataset_name}_sdv_{syndataType}.csv"
synthetic_data = pd.read_csv(synthetic_path, sep=',', engine='python', na_values='?')
#metadata = #dictionary with format and types of data


In [None]:
#Generate the diagnostic report from the real data, the synthetic data and the metadata dictionary
report.generate(data, synthetic_data, python_dict)

In [None]:
#A floating point value between 0 and 1 that summarizes the diagnostic result for the synthetic data
#The score should be 1.0 (i.e. 100%) or very close to it
report.get_score()

In [None]:
#Lists each property name of the report and its associated score
report.get_properties()

In [None]:
#Returns a pandas.DataFrame object with more details about the 'Data Validity' property
report.get_details(property_name='Data Validity')

In [None]:
#Visualise the 'Data Validity' property of the report
fig = report.get_visualization(property_name='Data Validity')
fig.show()

In [None]:
#Save the diagnostic report to a .pkl file in the datasets folder
report_path = f"../datasets/{dataset_name}_sdv_{syndataType}_diag_rep.pkl"
report.save(filepath=report_path)

In [None]:
#Loading a presaved report
#report = DiagnosticReport.load('results/diagnostic_report.pkl')