## Library Imports and Parameters
Import the SDV library and verify that it is installed properly.


In [1]:
import graphviz
import pandas as pd
import sdv

# load each sdv module that we support
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import run_diagnostic
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CopulaGANSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table.base import BaseSingleTableSynthesizer

print(sdv.__version__)

1.4.0


Parameters: Select the dataset and the algorithm to use. `param[0]` is an index into `datasets` and `param[1]` is an index into `algorithms`.

In [2]:
# Available choices; `param` below picks one entry from each list by index.
datasets = ['datasets/adults_train']
algorithms = ['ctgan', 'tvae', 'gaussiancopula', 'copulagan']

# set this variable: [index into `datasets`, index into `algorithms`]
param = [0, 0]

# True  -> detect the metadata from the dataframe (and later save it to JSON);
# False -> load the metadata from the JSON file stored beside the dataset.
create_metadata = False

# True  -> fit a new synthesizer on the dataframe;
# False -> load a previously saved synthesizer from disk.
create_synthesizer = False

## Dataset Load Block
Loads the dataset that will be used for synthetic data generation.
Any preprocessing the data needs also belongs in this block.

In [3]:
# Resolve the dataset chosen in the parameter cell and read its parquet file.
dataset_selection = datasets[param[0]]
# Reset to an empty frame first so a failed load never leaves a stale
# dataframe from a previous run in the kernel.
dataframe = pd.DataFrame()

print("Loading Data")
try:
    dataframe = pd.read_parquet(dataset_selection+'.parquet')
except Exception:
    # Keep the friendly hint, but re-raise so the real cause is visible and
    # "Run All" stops here instead of continuing with an empty dataframe.
    print("ERROR: Unable to open file. Possibly due to invalid data selection.")
    raise

if dataframe.empty:
    # Every later cell needs actual rows, so an empty table is an error.
    raise ValueError("ERROR: Dataframe has not loaded.")
    

Loading Data


## Metadata Creation/Loading Block

Load the previously saved metadata from JSON, or automatically detect it from the dataframe when `create_metadata` is `True`.

In [4]:
# Start from a blank metadata object; it is either populated by detection
# or replaced by the copy loaded from disk.
metadata = SingleTableMetadata()

if create_metadata:
    print("Automatically detecting Metadata")
    metadata.detect_from_dataframe(dataframe)
else:
    print("Loading Metadata")
    # The JSON file sits next to the dataset and shares its base name.
    metadata = SingleTableMetadata.load_from_json(f"{dataset_selection}.json")

Loading Metadata


Inspect and Validate the Metadata.
Common types are 'boolean', 'categorical', 'datetime', 'numerical' and 'id'. Full list of types at: https://docs.sdv.dev/sdv/reference/metadata-spec/sdtypes

In [5]:
if create_metadata:
    print("Round 1: Inspect and Validate")

    # Print the per-column definitions so each detected sdtype can be reviewed.
    metadata_dict = metadata.to_dict()['columns']
    print(metadata_dict)

    # Save a diagram of the metadata to a PNG named after the dataset.
    metadata.visualize(show_table_details='full',
                       output_filepath=f"{dataset_selection}_metadata_stats.png")

    # Neither of these calls should produce an error message.
    metadata.validate()
    metadata.validate_data(dataframe)

Update the Metadata. 
Modify this code as needed based on the results of the validation.

In [6]:
if create_metadata:
    print("Updating Metadata")
    # Modify here: uncomment and adapt the template calls below to correct
    # whatever the first inspection/validation round reported.
    #
    # Set the primary key column:
    # metadata.set_primary_key(column_name='')
    #
    # Override the sdtype of a single column:
    # metadata.update_column(
        # column_name='age',
        # sdtype='numerical'
    # )

Inspect and Validate the Metadata again

In [7]:
if create_metadata:
    print("Round 2: Inspect and Validate")

    # Print the per-column definitions again to confirm the manual updates.
    metadata_dict = metadata.to_dict()['columns']
    print(metadata_dict)

    # Save a diagram of the metadata to a PNG named after the dataset.
    metadata.visualize(show_table_details='full',
                       output_filepath=f"{dataset_selection}_metadata_stats.png")

    # Neither of these calls should produce an error message.
    metadata.validate()
    metadata.validate_data(dataframe)

Once Satisfied, Save the Metadata

In [8]:
if create_metadata:
    print("Saving Metadata")
    # Written beside the dataset under the same base name, which is the path
    # the metadata cell reads when create_metadata is False.
    # NOTE(review): save_to_json may refuse to overwrite an existing file - confirm.
    metadata.save_to_json(f"{dataset_selection}.json")

## Synthetic Data Generation Block

In [9]:
# Map every supported algorithm name to its SDV synthesizer class so that
# all algorithms share one create-or-load code path.
synthesizer_classes = {
    'ctgan': CTGANSynthesizer,
    'tvae': TVAESynthesizer,
    'gaussiancopula': GaussianCopulaSynthesizer,
    'copulagan': CopulaGANSynthesizer,
}

synth_method = algorithms[param[1]]
synth_filepath = dataset_selection+'_'+synth_method+'_synthesizer.pkl'

if synth_method not in synthesizer_classes:
    # Fail fast: without a concrete synthesizer there is nothing to sample from.
    raise ValueError("ERROR: Invalid Synthesis Method Selected")

synthesizer_class = synthesizer_classes[synth_method]
if create_synthesizer:
    # Train a fresh synthesizer on the real data.
    synthesizer = synthesizer_class(metadata)
    synthesizer.fit(dataframe)
else:
    # Reuse a synthesizer saved by an earlier run.
    # NOTE(review): the .pkl file is presumably a pickle - only load files you trust.
    synthesizer = synthesizer_class.load(filepath=synth_filepath)

# Draw the synthetic sample that the evaluation block compares to the real data.
synthetic_data = synthesizer.sample(num_rows=1000)

In [10]:
# Set to True to write the synthesizer to `synth_filepath`, the same path the
# generation cell loads from when create_synthesizer is False.
should_save = False
if should_save:
    synthesizer.save(filepath=synth_filepath)

## Synthetic Data Evaluation Block

In [12]:
# Score how closely the synthetic data matches the real data.
print("=== Quality Report ===")
quality_report = evaluate_quality(
    real_data=dataframe,
    synthetic_data=synthetic_data,
    metadata=metadata)

# Run the diagnostic checks on the synthetic data.
print("=== Diagnostic Report ===")
diagnostic_report = run_diagnostic(
    real_data=dataframe,
    synthetic_data=synthetic_data,
    metadata=metadata)

print("=== Column Report ===")
# Columns to plot, in display order. These names are specific to the adults
# dataset; update the list when a different dataset is selected.
column_names = [
    'age',
    'type_employer',
    'fnlwgt',
    'education',
    'education_num',
    'marital',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hr_per_week',
    'country',
    'income',
]

# One real-vs-synthetic plot per column.
for column_name in column_names:
    fig = get_column_plot(
        real_data=dataframe,
        synthetic_data=synthetic_data,
        column_name=column_name,
        metadata=metadata
    )
    fig.show()

=== Quality Report ===
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 125.20it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:05<00:00, 20.75it/s]

Overall Quality Score: 83.11%

Properties:
- Column Shapes: 88.68%
- Column Pair Trends: 77.55%
=== Diagnostic Report ===
Generating report ...
(1/3) Evaluating Coverage: : 100%|██████████| 15/15 [00:00<00:00, 243.14it/s]
(2/3) Evaluating Boundary: : 100%|██████████| 15/15 [00:00<00:00, 1197.05it/s]
(3/3) Evaluating Synthesis: : 100%|██████████| 1/1 [00:26<00:00, 26.14s/it]

Diagnostic Results:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
✓ Over 90% of the synthetic rows are not copies of the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data
=== Column Report ===
