# Statistics and Metrics Evaluation

### Import

In [24]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.metadata import MultiTableMetadata
from sdv.evaluation.single_table import evaluate_quality as eqs
from sdv.evaluation.multi_table import evaluate_quality as eqm
from sdv.evaluation.multi_table import run_diagnostic

External Functions

In [25]:
# Execute the helper scripts with IPython's %run magic; names they define become
# available in this notebook's namespace once they finish.
# NOTE(review): calculate_statistics / compare_statistics used further down are not
# defined in this notebook and presumably come from utils/Statistics.py - confirm.
# What utils/Generator.py contributes is not visible in the cells shown here.
%run utils/Statistics.py
%run utils/Generator.py

File Paths

In [26]:
# Root of the data tree, relative to the notebook's working directory.
# Every path below is derived from it, so relocating the data only requires
# changing this one value.
DATA_DIR = '../data'
METADATA_DIR = f'{DATA_DIR}/metadata'
SYNTHETIC_DIR = f'{DATA_DIR}/synthetic_data'

# Real dataset (RD)
path_RD = f'{DATA_DIR}/real_data/REF.csv'

# Metadata based on the real dataset (single-table and multi-table variants)
path_metadata_st = f'{METADATA_DIR}/metadata_st_REF.json'
path_metadata_mt = f'{METADATA_DIR}/metadata_mt_REF.json'

# Reference synthetic datasets
path_FAST_ML = f'{SYNTHETIC_DIR}/FAST_ML_test.csv'
path_CTGAN = f'{SYNTHETIC_DIR}/CTGAN_test.csv'
path_HMA = f'{SYNTHETIC_DIR}/HMA_test.csv'

# Test synthetic datasets
path_FAST_ML_test = f'{SYNTHETIC_DIR}/FAST_ML_d_test.csv'
path_CTGAN_test = f'{SYNTHETIC_DIR}/CTGAN_d_test.csv'
path_HMA_test = f'{SYNTHETIC_DIR}/HMA_d_test.csv'

Real Data

In [27]:
# Read the real dataset (RD) from its CSV file.
RD = pd.read_csv(filepath_or_buffer=path_RD)

Metadata

In [28]:
# Metadata for the single-table evaluation, read from its JSON file
metadata_st = SingleTableMetadata.load_from_json(filepath=path_metadata_st)

# Metadata for the multi-table evaluation, read from its JSON file
metadata_mt = MultiTableMetadata.load_from_json(filepath=path_metadata_mt)

Synthetic Data

In [29]:
# Load the reference synthetic datasets (SD) from their CSV files: FAST_ML, CTGAN and HMA.
SD_FAST_ML = pd.read_csv(path_FAST_ML)
SD_CTGAN = pd.read_csv(path_CTGAN)
SD_HMA = pd.read_csv(path_HMA)

### Statistics

In this section, the statistics of each synthetic dataset (SD) are computed and compared with those of the real dataset (RD).

##### FAST_ML

In [31]:
# Compute statistics for the real data and for the FAST_ML synthetic data, then
# display the comparison (the output below has Mean and Std rows, with RD_* and
# SD_* columns). RD_stats defined here is also used by the CTGAN statistics cell.
RD_stats = calculate_statistics(RD)
SD_FAST_ML_stats = calculate_statistics(SD_FAST_ML)
compare_statistics(RD_stats, SD_FAST_ML_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,502181.747275,536612.451899,822.326603,825.50171
1,Std,472645.0992,414670.592861,429.086412,407.082121


##### CTGAN

In [33]:
# RD_stats is not recomputed in this cell: it is reused from the FAST_ML statistics
# cell above, which therefore has to run first.
# NOTE(review): the SD Std values in the output below are empty (NaN) - worth
# checking the contents of SD_CTGAN.
SD_CTGAN_stats = calculate_statistics(SD_CTGAN)
compare_statistics(RD_stats, SD_CTGAN_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,502181.747275,258423.760714,822.326603,577.940527
1,Std,472645.0992,,429.086412,


##### HMA

In [35]:
# RD_stats is recomputed here with the same call as in the FAST_ML cell, so this
# cell does not rely on that earlier cell for it.
RD_stats = calculate_statistics(RD)
SD_HMA_stats = calculate_statistics(SD_HMA)
# Display the RD vs. HMA synthetic data comparison.
compare_statistics(RD_stats, SD_HMA_stats)

Unnamed: 0,Statistic,RD_Balance,SD_Balance,RD_Credit_Score,SD_Credit_Score
0,Mean,502181.747275,501840.886799,822.326603,829.322619
1,Std,472645.0992,469844.973593,429.086412,449.160143


### Metrics

In this section, a reference SD and a new test SD are compared with the RD to evaluate the quality of the synthetic data and to provide diagnostic information.

##### FAST_ML

In [37]:
# Single-table quality report: real data vs. the FAST_ML synthetic data.
quality_report = eqs(real_data=RD,
                     synthetic_data=SD_FAST_ML,
                     metadata=metadata_st)

Creating report: 100%|██████████| 4/4 [00:25<00:00,  6.47s/it]



Overall Quality Score: 90.61%

Properties:
Column Shapes: 92.61%
Column Pair Trends: 88.6%


In [38]:
# Per-column breakdown of the 'Column Shapes' property for the report built in the
# previous cell (the output lists a metric and a quality score for each column).
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,Balance,KSComplement,0.856408
1,Last_Transaction_Date,KSComplement,0.941388
2,Last_Transaction_Amount,KSComplement,0.942081
3,Tot_Transaction_Amount,KSComplement,0.939183
4,Credit_Score,KSComplement,0.912097
5,Loan_Amount,KSComplement,0.942067
6,Interest_Rate,KSComplement,0.941535
7,Account_Type,TVComplement,0.999144
8,Currency,TVComplement,1.0
9,Region,TVComplement,0.866659


##### CTGAN

In [39]:
# Single-table quality report: real data vs. the CTGAN synthetic data.
# Rebinds quality_report, replacing the report from the FAST_ML section.
quality_report = eqs(real_data=RD,
                     synthetic_data=SD_CTGAN,
                     metadata=metadata_st)

Creating report: 100%|██████████| 4/4 [00:12<00:00,  3.09s/it]



Overall Quality Score: 23.38%

Properties:
Column Shapes: 38.52%
Column Pair Trends: 8.23%


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [40]:
# Per-column 'Column Shapes' scores for the CTGAN report built in the previous cell.
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,Balance,KSComplement,0.391832
1,Last_Transaction_Date,KSComplement,0.235284
2,Last_Transaction_Amount,KSComplement,0.391554
3,Tot_Transaction_Amount,KSComplement,0.409688
4,Credit_Score,KSComplement,0.295842
5,Loan_Amount,KSComplement,0.369
6,Interest_Rate,KSComplement,0.447279
7,Account_Type,TVComplement,0.499122
8,Currency,TVComplement,1.0
9,Region,TVComplement,0.049894


##### HMA

In [42]:
# Data dictionaries for quality evaluation
real_data = {'d1': RD}
synthetic_data = {'d1': SD_HMA}

In [43]:
# Multi-table quality report: real vs. HMA synthetic data, passed as table dictionaries.
# NOTE(review): the output reports "Parent Child Relationships: NaN" - presumably
# because only one table is supplied; confirm.
quality_report = eqm(real_data=real_data,
                     synthetic_data=synthetic_data,
                     metadata=metadata_mt)

Creating report: 100%|██████████| 5/5 [00:27<00:00,  5.43s/it]



Overall Quality Score: 97.8%

Properties:
Column Shapes: 98.9%
Column Pair Trends: 96.7%
Parent Child Relationships: NaN


  self._property_breakdown[prop] = np.nanmean(prop_scores) if (


In [44]:
# Per-column 'Column Shapes' scores for the multi-table report built in the previous
# cell; the output also carries a Table column.
quality_report.get_details(property_name='Column Shapes')

Unnamed: 0,Table,Column,Metric,Quality Score
0,d1,Balance,KSComplement,0.982929
1,d1,Last_Transaction_Date,KSComplement,0.997223
2,d1,Last_Transaction_Amount,KSComplement,0.998663
3,d1,Tot_Transaction_Amount,KSComplement,0.987003
4,d1,Credit_Score,KSComplement,0.910966
5,d1,Loan_Amount,KSComplement,0.99841
6,d1,Interest_Rate,KSComplement,0.998039
7,d1,Account_Type,TVComplement,0.999803
8,d1,Currency,TVComplement,1.0
9,d1,Region,TVComplement,0.997105
