# Statistics and Metrics Evaluation

### Import

In [19]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality as eqs
from sdv.evaluation.multi_table import evaluate_quality as eqm
from sdv.evaluation.multi_table import run_diagnostic

External Functions

In [20]:
# Execute the project helper scripts inside this notebook's namespace.
# calculate_statistics / compare_statistics used below are presumably defined
# in utils/Statistics.py — TODO confirm.
%run utils/Statistics.py
# NOTE(review): no cell in this notebook visibly uses anything from Generator.py — confirm it is still needed.
%run utils/Generator.py

File Paths

In [21]:
# Real dataset (CSV) that the synthetic datasets are compared against
path_RD = '../data/real_data/d_c.csv'

# Metadata files based on the real dataset: single-table and multi-table variants
path_metadata_st, path_metadata_mt = (
    '../data/metadata/metadata_st_d_c.json',
    '../data/metadata/metadata_mt_d_c.json',
)

# Synthetic dataset outputs (CTGAN, FAST_ML, HMA)
path_CTGAN, path_FAST_ML, path_HMA = (
    '../data/synthetic_data/CTGAN_d_c.csv',
    '../data/synthetic_data/FAST_ML_d_c.csv',
    '../data/synthetic_data/HMA_d_c.csv',
)

In [22]:
# Test synthetic dataset outputs (FAST_ML, HMA)
path_FAST_ML_test, path_HMA_test = (
    '../data/synthetic_data/FAST_ML_d_test.csv',
    '../data/synthetic_data/HMA_d_test.csv',
)

Real Data

In [23]:
# Load the real dataset; every statistics and metrics cell below compares against it
RD = pd.read_csv(path_RD)

Metadata

In [24]:
# Single-table metadata, passed to the single-table quality evaluation (eqs)
metadata_st = SingleTableMetadata.load_from_json(path_metadata_st)

# Multi-table metadata, passed to the multi-table quality evaluation (eqm) and to run_diagnostic
metadata_mt = MultiTableMetadata.load_from_json(path_metadata_mt)

Synthetic Data

In [25]:
# TODO: the CTGAN output is not loaded yet; the CTGAN cell at the end of the notebook expects the name SD_CTGAN
# SD_CTGAN = pd.read_csv(path_CTGAN)
# Synthetic datasets compared against RD in the Statistics and Metrics sections
SD_FAST_ML = pd.read_csv(path_FAST_ML)
SD_HMA = pd.read_csv(path_HMA)

Test Data

In [26]:
# Test synthetic datasets (the *_d_test.csv files), evaluated against RD alongside the outputs loaded above
SD_FAST_ML_test = pd.read_csv(path_FAST_ML_test)
SD_HMA_test = pd.read_csv(path_HMA_test)

### Statistics

FAST_ML

In [27]:
# Statistics of the real data vs. the FAST_ML synthetic data; the comparison table is the cell output
RD_stats = calculate_statistics(RD)
SD_FAST_ML_stats = calculate_statistics(SD_FAST_ML)
compare_statistics(RD_stats, SD_FAST_ML_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,495912.032381,824.0732,825.488507
1,Std,458746.994504,394787.777978,432.768709,417.377739


FAST_ML (Test)

In [28]:
# Statistics of the real data vs. the FAST_ML test dataset.
# The result gets its own name so that SD_FAST_ML_stats keeps describing SD_FAST_ML
# regardless of the order in which the cells are executed.
RD_stats = calculate_statistics(RD)
SD_FAST_ML_test_stats = calculate_statistics(SD_FAST_ML_test)
compare_statistics(RD_stats, SD_FAST_ML_test_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,533943.208948,824.0732,840.509613
1,Std,458746.994504,428938.034034,432.768709,425.492158


HMA

In [29]:
# Statistics of the real data vs. the HMA synthetic data; the comparison table is the cell output
RD_stats = calculate_statistics(RD)
SD_HMA_stats = calculate_statistics(SD_HMA)
compare_statistics(RD_stats, SD_HMA_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,488801.069948,824.0732,831.180674
1,Std,458746.994504,459951.570144,432.768709,446.400435


HMA (Test)

In [30]:
# Statistics of the real data vs. the HMA test dataset.
# The result gets its own name so that SD_HMA_stats keeps describing SD_HMA
# regardless of the order in which the cells are executed.
RD_stats = calculate_statistics(RD)
SD_HMA_test_stats = calculate_statistics(SD_HMA_test)
compare_statistics(RD_stats, SD_HMA_test_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,476419.133964,503634.254569,824.0732,841.803805
1,Std,458746.994504,464968.967484,432.768709,458.127227


### Metrics

##### Evaluation Quality for Single Table models

FAST_ML

In [50]:
# Single-table quality report: real data vs. the FAST_ML synthetic data
quality_report = eqs(
    real_data=RD,
    synthetic_data=SD_FAST_ML,
    metadata=metadata_st
)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]



Overall Quality Score: 76.81%

Properties:
Column Shapes: 86.56%
Column Pair Trends: 67.07%


In [51]:
# Per-column scores behind the 'Column Shapes' property of the FAST_ML report
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,Balance,KSComplement,0.842
1,Last_Transaction_Date,KSComplement,0.924
2,Last_Transaction_Amount,KSComplement,0.918
3,Tot_Transaction_Amount,KSComplement,0.936
4,Credit_Score,KSComplement,0.915
5,Loan_Amount,KSComplement,0.916
6,Interest_Rate,KSComplement,0.934
7,Account_Type,TVComplement,0.985
8,Currency,TVComplement,1.0
9,Branch,TVComplement,0.589


In [49]:
# Single-table quality report: real data vs. the FAST_ML test dataset
quality_report_test = eqs(
    real_data=RD,
    synthetic_data=SD_FAST_ML_test,
    metadata=metadata_st
)

Creating report: 100%|██████████| 4/4 [00:14<00:00,  3.70s/it]



Overall Quality Score: 70.16%

Properties:
Column Shapes: 77.94%
Column Pair Trends: 62.38%


In [52]:
# Per-column scores behind the 'Column Shapes' property of the FAST_ML test report
quality_report_test.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,Balance,KSComplement,0.832
1,Last_Transaction_Date,KSComplement,0.929
2,Last_Transaction_Amount,KSComplement,0.916
3,Tot_Transaction_Amount,KSComplement,0.93
4,Credit_Score,KSComplement,0.875
5,Loan_Amount,KSComplement,0.921
6,Interest_Rate,KSComplement,0.936
7,Account_Type,TVComplement,0.937
8,Currency,TVComplement,1.0
9,Branch,TVComplement,0.032001


In [34]:
# Reference Diagnostic (requires Multi Table data/metadata):
# run_diagnostic is imported from sdv.evaluation.multi_table, so the single
# DataFrame is wrapped in a dict and the multi-table metadata is passed.
# NOTE(review): 'd1' presumably matches the table name in metadata_mt — confirm
real_data = {'d1': RD}
synthetic_data = {'d1': SD_FAST_ML}

diagnostic_report = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.68s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





In [39]:
# Diagnostic findings of the reference run as a dictionary
diagnostic_report.get_results()

{'SUCCESS': ['The synthetic data covers over 90% of the numerical ranges present in the real data',
  'Over 90% of the synthetic rows are not copies of the real data',
  'The synthetic data follows over 90% of the min/max boundaries set by the real data'],
 'DANGER': []}

In [40]:
# Test Diagnostic (requires Multi Table data/metadata):
# same dict wrapping as the reference diagnostic, with the FAST_ML test dataset.
# NOTE(review): 'd1' presumably matches the table name in metadata_mt — confirm
real_data = {'d1': RD}
synthetic_data = {'d1': SD_FAST_ML_test}

diagnostic_report_test = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.70s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





In [41]:
# Diagnostic findings of the test run as a dictionary
diagnostic_report_test.get_results()

{'SUCCESS': ['The synthetic data covers over 90% of the numerical ranges present in the real data',
  'Over 90% of the synthetic rows are not copies of the real data',
  'The synthetic data follows over 90% of the min/max boundaries set by the real data'],
 'DANGER': []}

##### Evaluation Quality for Multi Table models

HMA

In [42]:
# Data dictionaries for the HMA quality evaluation and diagnostics below.
# These reassign real_data / synthetic_data, which the FAST_ML diagnostic cells also set.
# NOTE(review): 'd1' presumably matches the table name in metadata_mt — confirm
real_data = {'d1': RD}
synthetic_data = {'d1': SD_HMA}
synthetic_data_test = {'d1': SD_HMA_test}

In [43]:
# Evaluation Quality Reference: multi-table quality report for the HMA synthetic data
# NOTE(review): reassigns quality_report, which the FAST_ML section also uses
quality_report = eqm(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 5/5 [00:03<00:00,  1.52it/s]



Overall Quality Score: 82.24%

Properties:
Column Shapes: 91.26%
Column Pair Trends: 73.21%
Parent Child Relationships: NaN


  self._property_breakdown[prop] = np.nanmean(prop_scores) if (


In [44]:
# Evaluation Quality Test: multi-table quality report for the HMA test dataset.
# Stored as quality_report_test (not quality_report) so the reference report from
# the previous cell is not overwritten; this matches the diagnostic_report /
# diagnostic_report_test naming used by the diagnostic cells below.
quality_report_test = eqm(
    real_data=real_data,
    synthetic_data=synthetic_data_test,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 5/5 [00:14<00:00,  2.92s/it]



Overall Quality Score: 75.38%

Properties:
Column Shapes: 82.11%
Column Pair Trends: 68.65%
Parent Child Relationships: NaN


  self._property_breakdown[prop] = np.nanmean(prop_scores) if (


In [45]:
# Diagnostic Reference: diagnostic for the HMA synthetic data
diagnostic_report = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.68s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





In [47]:
# Diagnostic Test: diagnostic for the HMA test dataset
diagnostic_report_test = run_diagnostic(
    real_data=real_data,
    synthetic_data=synthetic_data_test,
    metadata=metadata_mt
)

Creating report: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data





# To Do

CTGAN

In [None]:
# TODO: enable once the CTGAN output is loaded — SD_CTGAN is not defined anywhere in this notebook yet
# RD_stats = calculate_statistics(RD)
# SD_CTGAN_stats = calculate_statistics(SD_CTGAN)
# compare_statistics(RD_stats, SD_CTGAN_stats)