# Statistics and Metrics Evaluation

### Import

In [116]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality as eqs
from sdv.evaluation.multi_table import evaluate_quality as eqm
from sdv.evaluation.multi_table import run_diagnostic

External Functions

In [117]:
%run utils/Statistics.py
%run utils/Generator.py

File Paths

In [118]:
# Real Dataset Output
path_RD = '../data/real_data/d_c.csv'

# Metadata based on Real Dataset Output
path_metadata_st = '../data/metadata/metadata_st_d_c.json'
path_metadata_mt = '../data/metadata/metadata_mt_d_c.json'

# Synthetic Dataset Output from HMA model
path_CTGAN = '../data/synthetic_data/CTGAN_d_c.csv'
path_FAST_ML = '../data/synthetic_data/FAST_ML_d_c.csv'
path_HMA = '../data/synthetic_data/HMA_d_c.csv'

Real Data

In [119]:
RD = pd.read_csv(path_RD)

Metadata

In [120]:
# Single Table metadata
metadata_st = SingleTableMetadata.load_from_json(path_metadata_st)

In [121]:
# Multi Table metadata
metadata_mt = MultiTableMetadata.load_from_json(path_metadata_mt)

Synthetic Data

In [122]:
# CTGAN = pd.read_csv(path_CTGAN)

In [123]:
SD_FAST_ML = pd.read_csv(path_FAST_ML)

In [124]:
SD_HMA = pd.read_csv(path_HMA)

### Statistics

FAST_ML

In [125]:
RD_stats = calculate_statistics(RD)
SD_FAST_ML_stats = calculate_statistics(SD_FAST_ML)
compare_statistics(RD_stats, SD_FAST_ML_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,495912.032381,824.0732,825.488507
1,Std,458746.994504,394787.777978,432.768709,417.377739


HMA

In [126]:
RD_stats = calculate_statistics(RD)
SD_HMA_stats = calculate_statistics(SD_HMA)
compare_statistics(RD_stats, SD_HMA_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,488801.069948,824.0732,831.180674
1,Std,458746.994504,459951.570144,432.768709,446.400435


### Metrics

##### Evaluation Quality for Single Table models

FAST_ML

In [127]:
# Evaluation Quality
FAST_ML_quality_report = eqs(
    real_data=RD,
    synthetic_data=SD_FAST_ML,
    metadata=metadata_st
)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.17it/s]



Overall Quality Score: 76.81%

Properties:
Column Shapes: 86.56%
Column Pair Trends: 67.07%


In [128]:
# Diagnostic
diagnostic_report = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.67s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





##### Evaluation Quality for Multi Table models

HMA

In [129]:
# Data dictionaries for quality evaluation
real_data = {'d1': RD}
synthetic_data = {'d1': SD_HMA}

In [130]:
# Evaluation Quality
HMA_quality_report = eqm(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 5/5 [00:03<00:00,  1.57it/s]



Overall Quality Score: 82.24%

Properties:
Column Shapes: 91.26%
Column Pair Trends: 73.21%
Parent Child Relationships: NaN


  self._property_breakdown[prop] = np.nanmean(prop_scores) if (


In [131]:
# Diagnostic
diagnostic_report = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.63s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





# To Do

CTGAN

In [132]:
# RD_stats = calculate_statistics(RD)
# SD_CTGAN_stats = calculate_statistics(SD_CTGAN)
# compare_statistics(RD_stats, SD_CTGAN_stats)