## Library Imports and Parameters
Import the SDV library and verify that it is installed properly.


In [2]:
import pandas as pd 
import sdv
print(sdv.__version__)

# load each sdv module that we support
from sdv.single_table.base import BaseSingleTableSynthesizer
from sdv.single_table import (
    CopulaGANSynthesizer,
    CTGANSynthesizer,
    GaussianCopulaSynthesizer,
    TVAESynthesizer,
)
from sdv.metadata import SingleTableMetadata
import graphviz

1.4.0


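Parameters: Select the dataset and algorithm to use. `param` is `[dataset index, algorithm index]`, indexing into the `datasets` and `algorithms` lists below.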

In [5]:
# Available choices; `param` below selects one entry from each list by index.
datasets = ['datasets/adults_train']
algorithms = ['ctgan', 'tvae', 'gaussiancopula', 'copulagan']

# Set this variable: [index into `datasets`, index into `algorithms`].
param = [0, 0]

# Workflow switches read by the later cells:
#   create_metadata    - False loads the metadata JSON, True detects it from the data.
#   create_synthesizer - False loads a saved synthesizer, True fits a new one.
create_metadata, create_synthesizer = False, False

## Dataset Load Block
Loads the dataset to process for synthetic data generation,
including any preprocessing needed for the data.

In [6]:
# Path stem of the dataset chosen via `param[0]`; the '.parquet' suffix is added on read.
dataset_selection = datasets[param[0]]
# Start from an empty frame so `dataframe` is always defined, even if the read fails.
dataframe = pd.DataFrame()

print("Loading Data")
try:
    dataframe = pd.read_parquet(dataset_selection+'.parquet')
except Exception as load_error:
    # Catch `Exception` instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report the underlying cause rather than hiding it.
    print("ERROR: Unable to open file. Possibly due to invalid data selection.")
    print(f"Cause: {type(load_error).__name__}: {load_error}")
else:
    # The read succeeded but produced no rows or no columns.
    if dataframe.empty:
        print("ERROR: Dataframe has not loaded.")
    

Loading Data


## Metadata Creation/Loading Block

Load the metadata from its JSON file, or automatically detect it from the dataframe when `create_metadata` is `True`.

In [7]:
# Begin with a blank metadata object; it is either populated by detection
# or replaced by the copy loaded from '<dataset>.json'.
metadata = SingleTableMetadata()

if create_metadata:
    print("Automatically detecting Metadata")
    metadata.detect_from_dataframe(dataframe)
else:
    print("Loading Metadata")
    metadata = SingleTableMetadata.load_from_json(f"{dataset_selection}.json")

Loading Metadata


Inspect and Validate the Metadata.
Common types are 'boolean', 'categorical', 'datetime', 'numerical' and 'id'. Full list of types at: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes

In [38]:
# First review pass; only runs when the metadata is being created in this session.
if create_metadata:
    print("Round 1: Inspect and Validate")

    # Show the column definitions held by the metadata.
    metadata_dict = metadata.to_dict()['columns']
    print(metadata_dict)

    # Visualize the metadata with full table details, written next to the dataset.
    metadata.visualize(show_table_details='full', output_filepath=f"{dataset_selection}_metadata_stats.png")

    # Neither of the following calls should report an error.
    metadata.validate()
    metadata.validate_data(dataframe)

Round 1: Inspect and Validate
{'age': {'sdtype': 'numerical'}, 'type_employer': {'sdtype': 'categorical'}, 'fnlwgt': {'sdtype': 'numerical'}, 'education': {'sdtype': 'categorical'}, 'education_num': {'sdtype': 'numerical'}, 'marital': {'sdtype': 'categorical'}, 'occupation': {'sdtype': 'categorical'}, 'relationship': {'sdtype': 'categorical'}, 'race': {'sdtype': 'categorical'}, 'sex': {'sdtype': 'categorical'}, 'capital_gain': {'sdtype': 'numerical'}, 'capital_loss': {'sdtype': 'numerical'}, 'hr_per_week': {'sdtype': 'numerical'}, 'country': {'sdtype': 'categorical'}, 'income': {'sdtype': 'categorical'}}


Update the Metadata. 
Modify this code as needed based on the results of the validation.

In [40]:
if create_metadata:
    print("Updating Metadata")
    # Template: uncomment and adapt the calls below when the validation
    # results show that the metadata needs correcting.
    # metadata.set_primary_key(column_name='')
    # metadata.update_column(
    #     column_name='age',
    #     sdtype='numerical'
    # )

Updating Metadata


Inspect and Validate the Metadata again

In [41]:
# Second review pass, run after any manual metadata updates.
if create_metadata:
    print("Round 2: Inspect and Validate")

    # Show the column definitions held by the metadata.
    metadata_dict = metadata.to_dict()['columns']
    print(metadata_dict)

    # Visualize the metadata with full table details, written next to the dataset.
    metadata.visualize(show_table_details='full', output_filepath=f"{dataset_selection}_metadata_stats.png")

    # Neither of the following calls should report an error.
    metadata.validate()
    metadata.validate_data(dataframe)

Round 2: Inspect and Validate
{'age': {'sdtype': 'numerical'}, 'type_employer': {'sdtype': 'categorical'}, 'fnlwgt': {'sdtype': 'numerical'}, 'education': {'sdtype': 'categorical'}, 'education_num': {'sdtype': 'numerical'}, 'marital': {'sdtype': 'categorical'}, 'occupation': {'sdtype': 'categorical'}, 'relationship': {'sdtype': 'categorical'}, 'race': {'sdtype': 'categorical'}, 'sex': {'sdtype': 'categorical'}, 'capital_gain': {'sdtype': 'numerical'}, 'capital_loss': {'sdtype': 'numerical'}, 'hr_per_week': {'sdtype': 'numerical'}, 'country': {'sdtype': 'categorical'}, 'income': {'sdtype': 'categorical'}}


Once Satisfied, Save the Metadata

In [42]:
# Write the metadata to '<dataset>.json', only when it was created in this session.
if create_metadata:
    print("Saving Metadata")
    metadata.save_to_json(f"{dataset_selection}.json")

Saving Metadata


## Synthetic Data Generation Block

In [9]:
# Map each supported algorithm name to its SDV synthesizer class so the
# create-or-load logic is written once instead of once per algorithm.
SYNTHESIZER_CLASSES = {
    'ctgan': CTGANSynthesizer,
    'tvae': TVAESynthesizer,
    'gaussiancopula': GaussianCopulaSynthesizer,
    'copulagan': CopulaGANSynthesizer,
}
# Number of synthetic rows to draw from the synthesizer.
NUM_SYNTHETIC_ROWS = 1000

# Algorithm chosen via `param[1]`, and the pickle path its synthesizer is stored under.
synth_method = algorithms[param[1]]
synth_filepath = dataset_selection+'_'+synth_method+'_synthesizer.pkl'

synthesizer_class = SYNTHESIZER_CLASSES.get(synth_method)
if synthesizer_class is None:
    # Stop here rather than sampling from a placeholder synthesizer that was
    # never fitted or loaded.
    raise ValueError(f"Invalid synthesis method selected: {synth_method!r}")

if create_synthesizer:
    # Fit a fresh synthesizer on the loaded dataset.
    synthesizer = synthesizer_class(metadata)
    synthesizer.fit(dataframe)
else:
    # NOTE(review): this reads a .pkl file; only load synthesizers from a trusted source.
    synthesizer = synthesizer_class.load(filepath=synth_filepath)

synthetic_data = synthesizer.sample(num_rows=NUM_SYNTHETIC_ROWS)

In [50]:
# Save the synthesizer for future use; set the switch to True to write the file.
should_save = False
if should_save:
    synthesizer.save(filepath=synth_filepath)

## Synthetic Data Evaluation Block