### SDV 
The second package is the SDV , or Synthetic Data Vault, which serves to generate synthetic data based on the given dataset. The generated data can be a single table, multiple tables or time series depending on the need.

The generated data has the same properties and statistics as the original dataset.

SDV generates synthetic data by applying mathematical techniques and machine learning models, such as the deep learning model. Even if the data contains multiple data types and missing data, the SDV will handle them, so we only need to provide the data (and metadata when needed).

[package github](https://github.com/sdv-dev/SDV)

In [1]:
import pandas as pd
#from pathlib import Pathfrom 
#from gretel_synthetics.config import LocalConfig
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv("infringement_dataset_v2.csv")
array = df.to_numpy()

df["income_type"]

0                      Working
1                State servant
2                      Working
3                      Working
4                      Working
                  ...         
307506                 Working
307507               Pensioner
307508                 Working
307509    Commercial associate
307510    Commercial associate
Name: income_type, Length: 307511, dtype: object

Dataframe metadata:

In [11]:
type(df.columns)

dict = {}
it = 0
for col in df.columns:
    if type(array[0][it]) == float:
        dict[col] = {"type": "numerical", "subtype": "float"}
    elif type(array[0][it]) == int:
        dict[col] = {"type": "numerical", "subtype": "integer"}
    else:
        dict[col] = {"type": "categorical"}
    it+=1

dict["loan_id"] = {"type": "id", "subtype": "int"}
metadata = {"priamry_key": "loan_id", "fields": dict}

print(metadata)

{'priamry_key': 'loan_id', 'fields': {'loan_id': {'type': 'id', 'subtype': 'int'}, 'infringed': {'type': 'numerical', 'subtype': 'integer'}, 'contract_type': {'type': 'categorical'}, 'gender': {'type': 'categorical'}, 'has_own_car': {'type': 'categorical'}, 'has_own_realty': {'type': 'categorical'}, 'num_children': {'type': 'numerical', 'subtype': 'integer'}, 'annual_income': {'type': 'numerical', 'subtype': 'float'}, 'credit_amount': {'type': 'numerical', 'subtype': 'float'}, 'credit_annuity': {'type': 'numerical', 'subtype': 'float'}, 'goods_valuation': {'type': 'numerical', 'subtype': 'float'}, 'income_type': {'type': 'categorical'}, 'education': {'type': 'categorical'}, 'family_status': {'type': 'categorical'}, 'housing_type': {'type': 'categorical'}, 'age': {'type': 'numerical', 'subtype': 'integer'}, 'days_employed': {'type': 'numerical', 'subtype': 'integer'}, 'car_age': {'type': 'numerical', 'subtype': 'float'}, 'provided_mobilephone': {'type': 'numerical', 'subtype': 'integer'

## Trying with different models from de sdv module

In [4]:
from sdv.lite import TabularPreset

model = TabularPreset(name='FAST_ML')
model.fit(df[:100])
tabular_synth = model.sample(100)
tabular_synth

Unnamed: 0,loan_id,infringed,contract_type,gender,has_own_car,has_own_realty,num_children,annual_income,credit_amount,credit_annuity,...,first_name,last_name,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_approved,past_loans_refused,past_loans_canceled,past_loans_unused,past_loans_total
0,100093,0,Cash loans,M,Y,Y,1,306050.660706,726085.838219,28796.967683,...,Gerald,Gay,14617.604995,219850.262083,247453.619083,2.698322,0.040346,2.734833,0.000000,5.531634
1,100030,0,Cash loans,M,N,Y,1,38419.155000,881498.040228,35476.272937,...,Arthur,Stafford,2144.610000,23667.750000,36509.707098,2.498984,1.207580,,0.558449,2.305619
2,100109,0,Cash loans,F,N,Y,0,94683.829310,136219.105897,24387.659859,...,Robert,Holmes,7665.611152,77032.871747,78638.057354,1.368765,0.000000,,0.052579,2.165601
3,100102,0,Cash loans,F,N,Y,0,211254.156649,481795.269506,29509.522602,...,Donna,Martinez,2144.610000,23667.750000,20106.000000,4.232684,1.733439,1.294252,0.947104,8.092350
4,100050,0,Cash loans,F,N,Y,2,234624.472740,937278.316021,33626.458560,...,Juliette,Mccall,21332.487605,375843.643411,429920.815756,3.573030,,0.000000,,1.843422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,100034,0,Cash loans,F,N,Y,0,38419.155000,331137.647013,18933.083658,...,Clara,Cowett,15992.814876,167000.294774,180878.405827,3.423399,0.000000,0.423664,,2.462820
96,100002,0,Cash loans,F,N,Y,1,256307.184620,374753.091250,13931.275323,...,Andrew,Mccall,24490.664888,251122.080885,233316.909315,5.359316,1.655449,,0.579590,
97,100069,0,Cash loans,M,N,Y,0,356949.930556,734158.677425,44330.327003,...,Donna,Genova,24610.719594,,303972.195833,4.981340,2.623144,1.167962,0.123868,
98,100032,1,Cash loans,F,N,Y,0,119920.698975,651291.507936,28618.930242,...,Andrew,Karl,8007.238458,23667.750000,20106.000000,3.750863,,0.000000,0.060040,1.591954


In [5]:
from sdv.tabular import GaussianCopula

model = GaussianCopula()
model.fit(df[:100])

gcopula_synth = model.sample(100)
gcopula_synth

Unnamed: 0,loan_id,infringed,contract_type,gender,has_own_car,has_own_realty,num_children,annual_income,credit_amount,credit_annuity,...,first_name,last_name,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_approved,past_loans_refused,past_loans_canceled,past_loans_unused,past_loans_total
0,100058,0,Cash loans,F,N,Y,0,114980.332,692927.0,27558.9,...,Megan,Conners,3140.0,85008.0,105810.0,1.0,1.0,1.0,0.0,1.0
1,100076,0,Cash loans,M,N,Y,0,257381.957,460240.0,27610.1,...,Rolf,Roberts,6488.0,176151.0,194586.0,3.0,1.0,1.0,0.0,4.0
2,100071,0,Cash loans,F,N,Y,1,380033.218,1459949.0,71064.6,...,Maria,Brumfield,27975.0,565582.0,574981.0,4.0,3.0,4.0,0.0,11.0
3,100077,0,Cash loans,F,N,N,0,244609.846,852412.0,31218.7,...,Sharon,Davis,20627.0,237552.0,348377.0,7.0,4.0,6.0,0.0,18.0
4,100093,0,Cash loans,F,Y,Y,0,296984.525,679528.0,23114.9,...,Donna,Dungan,21325.0,249122.0,258251.0,2.0,0.0,1.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,100014,0,Cash loans,M,Y,Y,0,237781.104,962612.0,41460.2,...,Angela,Ryder,21056.0,298337.0,293450.0,3.0,1.0,5.0,0.0,8.0
96,100116,0,Cash loans,F,N,N,1,40079.291,184300.0,6354.5,...,Linda,Scherer,13124.0,312278.0,228541.0,5.0,2.0,4.0,0.0,11.0
97,100014,0,Cash loans,F,N,Y,0,80218.707,481759.0,19338.8,...,Teresa,Seger,6227.0,39520.0,28506.0,2.0,2.0,1.0,0.0,4.0
98,100086,0,Cash loans,F,N,Y,1,163183.513,551493.0,38416.2,...,Ralph,Tosten,,,,,,,,


# Settling in the TabularPreset

This preset is recommended for starting with syntehtic data for a large dataset given. It also optimizes for the modeling time. Sadly TabularPreset returns numbers that are very far away for the desired analysis

In [12]:
#model = TabularPreset(name='FAST_ML', metadata=metadata)
#model.fit(df)
#synth_data = model.sample(df.shape[0])

model = GaussianCopula()
model.fit(df)
model.sample(df.shape[0])

synth_data

Unnamed: 0,loan_id,infringed,contract_type,gender,has_own_car,has_own_realty,num_children,annual_income,credit_amount,credit_annuity,...,first_name,last_name,past_avg_amount_annuity,past_avg_amt_application,past_avg_amt_credit,past_loans_approved,past_loans_refused,past_loans_canceled,past_loans_unused,past_loans_total
0,0,0,Cash loans,F,N,Y,1,459743.997212,928568.236565,31657.589614,...,Beverly,Prince,9455.140244,97602.466454,93784.146968,1.577791,0.000000,0.000000,,1.000000
1,1,1,Cash loans,F,N,Y,1,25650.000000,220327.431817,2984.444261,...,Timothy,Smith,17755.507992,,324740.282237,1.425179,0.779633,3.824660,0.000000,5.679605
2,2,0,Cash loans,XNA,Y,Y,0,25650.000000,484289.623449,31831.700466,...,Julia,Chavez,25206.687424,187924.892756,183931.187631,3.086753,2.239439,0.783311,,5.973208
3,3,0,Cash loans,F,Y,Y,0,154883.870794,519346.734784,21525.414563,...,Rose,Greenlee,8805.616773,231181.750996,243677.011677,5.198752,0.000000,0.000000,0.333253,4.849862
4,4,0,Cash loans,F,Y,N,2,25650.000000,45000.000000,5280.358132,...,Frances,Burton,2526.806960,65459.504325,71922.582608,3.946758,0.000000,1.332231,0.000000,3.790726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,307506,0,Cash loans,F,N,Y,0,46290.878216,618315.053201,25380.640147,...,Glenda,Gaffney,15528.368414,69333.555204,103672.982477,1.642822,0.760827,0.000000,0.000000,
307507,307507,1,Cash loans,F,N,Y,1,25650.000000,381486.713048,26150.753743,...,Gina,Dommer,10532.862331,174650.981371,197760.955890,2.820287,0.391834,2.349107,0.148219,5.819858
307508,307508,0,Cash loans,F,N,Y,0,456673.220822,847801.918805,28294.040143,...,Juan,Damico,,283560.164036,344333.787112,1.047956,,1.256750,0.060086,3.177928
307509,307509,0,Cash loans,M,Y,Y,1,576906.186027,839673.284953,41146.993228,...,Linda,Purvis,19384.126627,237938.894720,,0.000000,0.000000,0.000000,0.253269,1.000000


## SDMetrics

Supported by SDV module to give usefull measures of the synthetic data generated

[link to the module](https://docs.sdv.dev/sdmetrics/)

### Quality Report

In [None]:
from sdmetrics.reports.single_table import QualityReport

quality_report = QualityReport()
quality_report.generate(df, synth_data, metadata) # quality report demora mais que a SD?

Creating report: 100%|██████████| 4/4 [1:55:01<00:00, 1725.29s/it]


KeyboardInterrupt: 

In [None]:
quality_report.get_score()
quality_report.get_properties()

quality_report.get_visualization(property_name='Column Pair Trends')

#fig = quality_report.get_visualization(property_name='age')
#fig.show()

UnboundLocalError: local variable 'fig' referenced before assignment

CorrelationSimilarity measures the correlation between a pair of numerical columns and computes the similarity between the real and synthetic data. As it is directed for numerical continuous values, it will be directed towards the columns that are subjected to the data analysis.

In [None]:
from sdmetrics.column_pairs import CorrelationSimilarity

CorrelationSimilarity.compute(
    real_data=df[['past_avg_amount_annuity', 'past_avg_amt_application']],
    synthetic_data=synth_data[['past_avg_amount_annuity', 'past_avg_amt_application']],
    coefficient='Pearson'
)

0.9637743530981878

### Diagnostic Report

In [None]:
from sdmetrics.reports.single_table import DiagnosticReport

diagnostic_report = DiagnosticReport()
diagnostic_report.generate(df, synth_data, metadata)

diagnostic_report.get_score()
diagnostic_report.get_results()

In [None]:
from sdmetrics.single_table import NewRowSynthesis

NewRowSynthesis.compute(
    real_data=df,
    synthetic_data=synth_data,
    metadata=metadata,
    numerical_match_tolerance=0.01,
    synthetic_sample_size=10_000
)

To save the model (if wanted):

In [None]:
model.save('sd_infringment_model.pkl')

## Executing the data analysis on the generated syntehtic dataset

In [15]:
original_count = []
original_count.append(len(df.query('age<30')))
original_count.append(len(df.query('age<40')) - original_count[-1])
original_count.append(len(df.query('age<50')) - original_count[-1])
original_count.append(len(df.query('age<60')) - original_count[-1])
original_count.append(len(df.query('age>=60')))
print(original_count)

[45000, 82299, 121543, 150373, 35595]


In [16]:
sel1 = df[df["infringed"] == 1]
sel2 = df[df["infringed"] == 0]

original_past = []
original_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
original_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
original_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
original_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
original_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
original_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
original_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
original_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(original_past)

[[14636.327058541274, 13364.183497179574], [155003.88215537314, 142817.25742247657], [170867.81323263212, 161129.7695755352], [3.0644432136281305, 2.8200041937513105], [0.8081672978758438, 1.2345565107989096], [0.8818503660015269, 0.9981128119102537], [4.832645988952592, 5.131474103585657], [0.0781851114470907, 0.07880058712518348]]


In [13]:
synth_count = []
synth_count.append(len(synth_data.query('age<30')))
synth_count.append(len(synth_data.query('age<40')) - synth_count[-1])
synth_count.append(len(synth_data.query('age<50')) - synth_count[-1])
synth_count.append(len(synth_data.query('age<60')) - synth_count[-1])
synth_count.append(len(synth_data.query('age>=60')))
print(synth_count)

[37635, 76412, 137436, 142835, 27240]


In [14]:
sel1 = synth_data[synth_data["infringed"] == 1]
sel2 = synth_data[synth_data["infringed"] == 0]

synth_past = []
synth_past.append([sel2["past_avg_amount_annuity"].mean(), sel1["past_avg_amount_annuity"].mean()])
synth_past.append([sel2["past_avg_amt_application"].mean(), sel1["past_avg_amt_application"].mean()])
synth_past.append([sel2["past_avg_amt_credit"].mean(), sel1["past_avg_amt_credit"].mean()])
synth_past.append([sel2["past_loans_approved"].mean(), sel1["past_loans_approved"].mean()])
synth_past.append([sel2["past_loans_canceled"].mean(), sel1["past_loans_canceled"].mean()])
synth_past.append([sel2["past_loans_refused"].mean(), sel1["past_loans_refused"].mean()])
synth_past.append([sel2["past_loans_total"].mean(), sel1["past_loans_total"].mean()])
synth_past.append([sel2["past_loans_unused"].mean(), sel1["past_loans_unused"].mean()])
print(synth_past)

[[14866.660345680353, 14037.132669524104], [166278.91349249645, 159516.63463607244], [182443.2340826047, 177200.9600774923], [3.112876194820398, 2.9491229142438664], [1.1921437892420812, 1.3488232726234304], [1.1870003858495184, 1.218165403069049], [5.205441940237427, 5.284548762607229], [0.16972044637956124, 0.1669293008177402]]
