# Statistics and Metrics Evaluation

### Import

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality
# NOTE(review): the next import rebinds `evaluate_quality`, so from here on the
# bare name refers to the multi-table function and the single-table import
# above is shadowed.
from sdv.evaluation.multi_table import evaluate_quality
from sdv.evaluation.multi_table import run_diagnostic

External Functions

In [None]:
# Execute the local helper scripts so the names they define become available in
# this notebook (presumably calculate_statistics / compare_statistics, which
# are used below but not defined here -- confirm).
%run utils/Statistics.py
%run utils/Generator.py

File Paths

In [None]:
# Real dataset (RD)
path_RD = '../data/real_data/REF.csv'

# Metadata files for the real dataset: `_st` is loaded as single-table
# metadata and `_mt` as multi-table metadata further down.
path_metadata_st, path_metadata_mt = (
    '../data/metadata/metadata_st_REF.json',
    '../data/metadata/metadata_mt_REF.json',
)

# Reference synthetic datasets, one per generator
path_FAST_ML, path_CTGAN, path_HMA = (
    '../data/synthetic_data/FAST_ML_test.csv',
    '../data/synthetic_data/CTGAN_test.csv',
    '../data/synthetic_data/HMA_test.csv',
)

# Synthetic datasets under test, one per generator
path_FAST_ML_test, path_CTGAN_test, path_HMA_test = (
    '../data/synthetic_data/FAST_ML_d_test.csv',
    '../data/synthetic_data/CTGAN_d_test.csv',
    '../data/synthetic_data/HMA_d_test.csv',
)

Real Data

In [None]:
# Load the real dataset from CSV into a DataFrame.
RD = pd.read_csv(filepath_or_buffer=path_RD)

Metadata

In [None]:
# Single-table metadata, loaded from its JSON file; passed to the quality
# evaluation calls that take a plain DataFrame.
metadata_st = SingleTableMetadata.load_from_json(path_metadata_st)

# Multi-table metadata, loaded from its JSON file; passed to the calls that
# take a {table_name: DataFrame} dictionary.
metadata_mt = MultiTableMetadata.load_from_json(path_metadata_mt)

Synthetic Data

In [None]:
# Reference synthetic datasets, one DataFrame per generator.
SD_FAST_ML = pd.read_csv(filepath_or_buffer=path_FAST_ML)
SD_CTGAN = pd.read_csv(filepath_or_buffer=path_CTGAN)
SD_HMA = pd.read_csv(filepath_or_buffer=path_HMA)

Test Data

In [None]:
# Synthetic datasets under test, one DataFrame per generator.
SD_FAST_ML_test = pd.read_csv(filepath_or_buffer=path_FAST_ML_test)
SD_CTGAN_test = pd.read_csv(filepath_or_buffer=path_CTGAN_test)
SD_HMA_test = pd.read_csv(filepath_or_buffer=path_HMA_test)

### Statistics

In this section, a reference SD and a new test SD are each compared to the RD by means of their statistics.

##### FAST_ML

REF

In [None]:
# REF: compute statistics for the real data and for the reference FAST_ML
# synthetic data, then compare the two results.
# calculate_statistics / compare_statistics are not defined in this notebook
# (presumably provided by the %run cell -- confirm).
RD_stats = calculate_statistics(RD)
SD_FAST_ML_stats = calculate_statistics(SD_FAST_ML)
compare_statistics(RD_stats, SD_FAST_ML_stats)

TEST

In [None]:
# TEST: compute statistics for the real data and for the FAST_ML synthetic data
# under test, then compare the two results.
# The test statistics get their own name (`*_test_stats`) so that running this
# cell does not overwrite the reference result held in `SD_FAST_ML_stats`.
RD_stats = calculate_statistics(RD)
SD_FAST_ML_test_stats = calculate_statistics(SD_FAST_ML_test)
compare_statistics(RD_stats, SD_FAST_ML_test_stats)

##### CTGAN

REF

In [None]:
# RD_stats = calculate_statistics(RD)
# SD_CTGAN_stats = calculate_statistics(SD_CTGAN)
# compare_statistics(RD_stats, SD_CTGAN_stats)

TEST

In [None]:
# RD_stats = calculate_statistics(RD)
# SD_CTGAN_stats = calculate_statistics(SD_CTGAN_test)
# compare_statistics(RD_stats, SD_CTGAN_stats)

##### HMA

REF

In [None]:
# REF: compute statistics for the real data and for the reference HMA
# synthetic data, then compare the two results.
# calculate_statistics / compare_statistics are not defined in this notebook
# (presumably provided by the %run cell -- confirm).
RD_stats = calculate_statistics(RD)
SD_HMA_stats = calculate_statistics(SD_HMA)
compare_statistics(RD_stats, SD_HMA_stats)

TEST

In [None]:
# TEST: compute statistics for the real data and for the HMA synthetic data
# under test, then compare the two results.
# The test statistics get their own name (`*_test_stats`) so that running this
# cell does not overwrite the reference result held in `SD_HMA_stats`.
RD_stats = calculate_statistics(RD)
SD_HMA_test_stats = calculate_statistics(SD_HMA_test)
compare_statistics(RD_stats, SD_HMA_test_stats)

### Metrics

In this section, a reference SD and a new test SD are each compared to the RD to evaluate the quality of the synthetic data and to provide diagnostic information.

##### FAST_ML

##### - Quality Evaluation:

REF

In [None]:
# REF: quality report for the reference FAST_ML data against the real data.
# The inputs here are a plain DataFrame and single-table metadata, but the bare
# name `evaluate_quality` refers to the multi-table function (it is imported
# from sdv.evaluation.multi_table last, shadowing the single-table import).
# Import the single-table evaluator under its own name and call that instead.
from sdv.evaluation.single_table import evaluate_quality as evaluate_quality_st

quality_report = evaluate_quality_st(
    real_data=RD,
    synthetic_data=SD_FAST_ML,
    metadata=metadata_st
)

In [None]:
# Display the details behind the 'Column Shapes' property of the REF report.
quality_report.get_details(property_name='Column Shapes')

In [None]:
# Display the details behind the 'Column Pair Trends' property of the REF report.
quality_report.get_details(property_name='Column Pair Trends')

TEST

In [None]:
# TEST: quality report for the FAST_ML data under test against the real data.
# The inputs here are a plain DataFrame and single-table metadata, but the bare
# name `evaluate_quality` refers to the multi-table function (it is imported
# from sdv.evaluation.multi_table last, shadowing the single-table import).
# Import the single-table evaluator under its own name and call that instead.
from sdv.evaluation.single_table import evaluate_quality as evaluate_quality_st

quality_report_test = evaluate_quality_st(
    real_data=RD,
    synthetic_data=SD_FAST_ML_test,
    metadata=metadata_st
)

In [None]:
# Display the details behind the 'Column Shapes' property of the TEST report.
quality_report_test.get_details(property_name='Column Shapes')

In [None]:
# Display the details behind the 'Column Pair Trends' property of the TEST report.
quality_report_test.get_details(property_name='Column Pair Trends')

##### - Diagnostic:

In [None]:
# The diagnostic below is the multi-table one, so it requires multi-table
# data/metadata: wrap each DataFrame in a dictionary keyed by table name.
# NOTE(review): 'd1' presumably matches the table name in metadata_mt -- confirm.
real_data = {
    'd1': RD,
}
synthetic_data = {
    'd1': SD_FAST_ML,
}
synthetic_data_test = {
    'd1': SD_FAST_ML_test,
}

REF

In [None]:
# REF: run the multi-table diagnostic on the reference synthetic data.
diagnostic_report = run_diagnostic(
    metadata=metadata_mt,
    real_data=real_data,
    synthetic_data=synthetic_data,
)

In [None]:
# Display the results collected by the REF diagnostic report.
diagnostic_report.get_results()

TEST

In [None]:
# TEST: run the multi-table diagnostic on the synthetic data under test.
diagnostic_report_test = run_diagnostic(
    metadata=metadata_mt,
    real_data=real_data,
    synthetic_data=synthetic_data_test,
)

In [None]:
# Display the results collected by the TEST diagnostic report.
diagnostic_report_test.get_results()

##### CTGAN

##### - Quality Evaluation 

REF

In [None]:
# NOTE(review): disabled cell. Before re-enabling, be aware that the bare name
# `evaluate_quality` resolves to the multi-table import (it is imported last),
# while this call passes a plain DataFrame and single-table metadata.
# quality_report = evaluate_quality(
#     real_data=RD,
#     synthetic_data=SD_CTGAN,
#     metadata=metadata_st
# )

In [None]:
# quality_report.get_details(property_name='Column Shapes')

In [None]:
# quality_report.get_details(property_name='Column Pair Trends')

TEST

In [None]:
# NOTE(review): disabled cell. Before re-enabling, be aware that the bare name
# `evaluate_quality` resolves to the multi-table import (it is imported last),
# while this call passes a plain DataFrame and single-table metadata.
# quality_report_test = evaluate_quality(
#     real_data=RD,
#     synthetic_data=SD_CTGAN_test,
#     metadata=metadata_st
# )

In [None]:
# quality_report_test.get_details(property_name='Column Shapes')

In [None]:
# quality_report_test.get_details(property_name='Column Pair Trends')

##### - Diagnostic

In [None]:
# Data dictionaries for Diagnostic
# real_data = {'d1': RD}
# synthetic_data = {'d1': SD_CTGAN}
# synthetic_data_test = {'d1': SD_CTGAN_test}

REF

In [None]:
# diagnostic_report = run_diagnostic(
#     real_data=real_data,
#     synthetic_data=synthetic_data,
#     metadata=metadata_mt
# )

In [None]:
# diagnostic_report.get_results()

TEST

In [None]:
# Disabled cell. The TEST diagnostic must receive the test dictionary
# (`synthetic_data_test`), as the other TEST diagnostic cells do.
# diagnostic_report_test = run_diagnostic(
#     real_data=real_data,
#     synthetic_data=synthetic_data_test,
#     metadata=metadata_mt
# )

In [None]:
# diagnostic_report_test.get_results()

##### HMA

##### - Quality Evaluation:

In [None]:
# Data dictionaries for quality evaluation
real_data = {'d1': RD}
synthetic_data = {'d1': SD_HMA}
synthetic_data_test = {'d1': SD_HMA_test}

REF

In [None]:
# REF: quality report for the reference HMA data, using the table
# dictionaries and the multi-table metadata.
quality_report = evaluate_quality(
    metadata=metadata_mt,
    real_data=real_data,
    synthetic_data=synthetic_data,
)

In [None]:
# Display the details behind the 'Column Shapes' property of the REF report.
quality_report.get_details(property_name='Column Shapes')

In [None]:

# Display the details behind the 'Column Pair Trends' property of the REF report.
quality_report.get_details(property_name='Column Pair Trends')

TEST

In [None]:
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data_test,
    metadata=metadata_mt
)

In [None]:
quality_report.get_details(property_name='Column Shapes')

In [None]:
quality_report.get_details(property_name='Column Pair Trends')

##### - Diagnostic:

In [None]:
# Data dictionaries for Diagnostic
real_data = {'d1': RD}
synthetic_data = {'d1': SD_HMA}
synthetic_data_test = {'d1': SD_HMA_test}

REF

In [None]:
# REF: run the multi-table diagnostic on the reference HMA data.
diagnostic_report = run_diagnostic(
    metadata=metadata_mt,
    real_data=real_data,
    synthetic_data=synthetic_data,
)

In [None]:
# Display the results collected by the REF diagnostic report.
diagnostic_report.get_results()

TEST

In [None]:
# TEST: run the multi-table diagnostic on the HMA data under test.
diagnostic_report_test = run_diagnostic(
    metadata=metadata_mt,
    real_data=real_data,
    synthetic_data=synthetic_data_test,
)

In [None]:
# Display the results collected by the TEST diagnostic report.
diagnostic_report_test.get_results()