In [1]:
from models.DECAF import DECAF
from data import DataModule, inject_synth_bias, load_credit, preprocess_credit
import numpy as np
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [2]:
# Load the raw credit data, then run the project's preprocessing step on it.
df = load_credit()
credit_data = preprocess_credit(df)

# Column labels of the preprocessed frame, in column order.
names = credit_data.columns.tolist()

In [3]:
# Directory holding pickled DECAF models; created on demand by train_decaf.
models_dir = './cache/'


def _generate_synthetic_frame(model, x, template, biased_edges):
    """Sample synthetic rows from ``model`` and frame them like ``template``.

    Parameters
    ----------
    model : object exposing ``gen_synthetic`` and ``get_gen_order``
    x : input handed to ``model.gen_synthetic`` as its first argument
    template : pandas.DataFrame
        Supplies the index and column labels of the returned frame.
    biased_edges : dict
        Forwarded unchanged to ``model.gen_synthetic``.

    Returns
    -------
    pandas.DataFrame
        Generated values with the last column cast through ``np.int8`` and the
        ``"ethnicity"`` column rounded to the nearest integer.
    """
    synth = (
        model.gen_synthetic(
            x,
            gen_order=model.get_gen_order(),
            biased_edges=biased_edges,
        )
        .detach()
        .cpu()  # no-op on CPU tensors; keeps .numpy() working if the model sits on an accelerator
        .numpy()
    )
    # NOTE(review): astype(np.int8) truncates toward zero (0.99 -> 0) instead of
    # rounding. Kept as in the original to leave results unchanged; confirm
    # whether rounding is what is actually wanted for the last column.
    synth[:, -1] = synth[:, -1].astype(np.int8)

    frame = pd.DataFrame(synth, index=template.index, columns=template.columns)
    # The column name is hard-coded, so the frame must contain "ethnicity".
    frame["ethnicity"] = np.round(frame["ethnicity"])
    return frame


def train_decaf(train_dataset, dag_seed, test_dataset, biased_edges=None,
                dataset="credit", label="approved", bias=0, h_dim=200,
                lr=0.5e-3, batch_size=64, lambda_privacy=0, lambda_gp=10,
                d_updates=10, alpha=2, rho=2, weight_decay=1e-2,
                grad_dag_loss=False, l1_g=0, l1_W=1e-4, p_gen=-1,
                use_mask=True, epochs=50, generate_test=False):
    """Train (or load from cache) a DECAF model and generate synthetic data.

    The model is cached at ``<models_dir>/decaf_<dataset><bias>.pkl``. If that
    file exists it is loaded and no training happens; otherwise the model is
    fitted for ``epochs`` epochs on ``train_dataset`` and then saved there.

    NOTE: the cache key only contains ``dataset`` and ``bias``. Changing the
    DAG, the hyper-parameters or the training split does NOT invalidate the
    cache; delete the file (or pick another ``bias`` tag) to force retraining.

    Parameters
    ----------
    train_dataset : pandas.DataFrame
        Training data; also the template (index/columns) of the synthetic
        training-sized frame. Must contain an ``"ethnicity"`` column.
    dag_seed : list
        Passed to ``DECAF`` as ``dag_seed``.
    test_dataset : pandas.DataFrame
        Only used for generation when ``generate_test`` is true.
    biased_edges : dict, optional
        Forwarded to ``model.gen_synthetic``. ``None`` (the default) is
        treated as an empty dict.
    dataset : str
        Part of the cache file name.
    label : str
        Currently unused inside this function; kept for interface
        compatibility.
    bias : int or str
        Part of the cache file name.
    h_dim, lr, batch_size, lambda_privacy, lambda_gp, d_updates, alpha, rho,
    weight_decay, grad_dag_loss, l1_g, l1_W, p_gen, use_mask
        Forwarded unchanged to the ``DECAF`` constructor.
    epochs : int
        ``max_epochs`` of the ``pl.Trainer`` used when no cached model exists.
    generate_test : bool
        When true, additionally generate a synthetic frame from
        ``test_dataset``.

    Returns
    -------
    pandas.DataFrame or tuple of two pandas.DataFrame
        The synthetic frame shaped like ``train_dataset``; when
        ``generate_test`` is true, a tuple ``(synthetic_train, synthetic_test)``.
    """
    # Avoid a shared mutable default argument.
    if biased_edges is None:
        biased_edges = {}

    model_filename = os.path.join(models_dir, 'decaf_'+dataset+str(bias)+'.pkl')

    dm = DataModule(train_dataset.values)
    dm_test = DataModule(test_dataset.values)

    model = DECAF(
        dm.dims[0],
        dag_seed=dag_seed,
        h_dim=h_dim,
        lr=lr,
        batch_size=batch_size,
        lambda_privacy=lambda_privacy,
        lambda_gp=lambda_gp,
        d_updates=d_updates,
        alpha=alpha,
        rho=rho,
        weight_decay=weight_decay,
        grad_dag_loss=grad_dag_loss,
        l1_g=l1_g,
        l1_W=l1_W,
        p_gen=p_gen,
        use_mask=use_mask,
    )
    print("model name: ",model_filename)
    if os.path.exists(model_filename):
        # SECURITY: torch.load unpickles arbitrary objects; only load cache
        # files that were produced locally by this notebook.
        model = torch.load(model_filename)
    else:
        # Create the cache directory before the (long) training run so that a
        # missing directory cannot make torch.save fail after training.
        os.makedirs(models_dir, exist_ok=True)
        trainer = pl.Trainer(max_epochs=epochs, logger=False)
        trainer.fit(model, dm)
        torch.save(model, model_filename)

    # Generate synthetic data shaped like the training set
    synth_dataset = _generate_synthetic_frame(
        model, dm.dataset.x, train_dataset, biased_edges
    )

    # Optionally generate synthetic data shaped like the test set
    if generate_test:
        synth_dataset_x_test = _generate_synthetic_frame(
            model, dm_test.dataset.x, test_dataset, biased_edges
        )
        return synth_dataset, synth_dataset_x_test

    return synth_dataset

In [4]:
# Causal graph for the Credit dataset, written as [source, target] column-name pairs.
credit_dag = [
    # age ->
    ['age', 'yearsemployed'],

    # ethnicity ->
    ['ethnicity', 'approved'],
    ['ethnicity', 'married'],

    # priordefault ->
    ['priordefault', 'creditscore'],
    ['priordefault', 'approved'],
    ['priordefault', 'employed'],

    # zip ->
    ['zip', 'married'],

    # citizen ->
    ['citizen', 'married'],

    # driverslicense ->
    ['driverslicense', 'employed'],

    # educationlevel ->
    ['educationlevel', 'employed'],
    ['educationlevel', 'married'],

    # yearsemployed ->
    ['yearsemployed', 'creditscore'],

    # creditscore ->
    ['creditscore', 'approved'],
    ['creditscore', 'debt'],

    # employed ->
    ['employed', 'bankcustomer'],
    ['employed', 'debt'],
    ['employed', 'citizen'],

    # debt ->
    ['debt', 'income'],

    # married ->
    ['married', 'approved'],

    # income ->
    ['income', 'approved'],
    ['income', 'married'],
]

def dag_to_idx(df, dag):
    """Translate name-based DAG edges into column-position edges.

    Each edge's first two entries are looked up in ``df.columns``; the result
    is a list of ``[source_position, target_position]`` pairs in the same
    order as ``dag``.
    """
    position_of = df.columns.get_loc
    return [[position_of(edge[0]), position_of(edge[1])] for edge in dag]

# Convert the name-based DAG into column-index pairs; this index form is what is
# later handed to train_decaf / DECAF as `dag_seed`.
dag_seed_paper = dag_to_idx(credit_data, credit_dag)
print("dag_seed of paper: ",dag_seed_paper)

dag_seed of paper:  [[1, 7], [6, 15], [6, 3], [8, 10], [8, 15], [8, 9], [13, 3], [12, 3], [11, 9], [5, 9], [5, 3], [7, 10], [10, 15], [10, 2], [9, 4], [9, 2], [9, 12], [2, 14], [3, 15], [14, 15], [14, 3]]


In [5]:
def create_bias_dict(df, edge_map):
    """
    Build the index-based bias dict used when generating debiased synthetic data.

    ``edge_map`` maps a column name to a list of column names. Every name is
    replaced by its position in ``df.columns``, giving
    ``{key_position: [value_position, ...]}``.
    """
    locate = df.columns.get_loc

    bias_dict = {}
    for target, sources in edge_map.items():
        source_positions = list(map(locate, sources))
        bias_dict[locate(target)] = source_positions

    return bias_dict

# Empty bias dict for the "ND" setting: no entries at all.
# NOTE(review): "ND" presumably stands for "no debiasing" — confirm.
bias_dict_nd = {}
print('Bias dict ND:', bias_dict_nd)

# Bias dictionary to satisfy FTU: maps the index of 'approved' to the index of
# 'ethnicity'.
# NOTE(review): presumably the ethnicity -> approved edge is what DECAF drops at
# generation time — confirm against gen_synthetic.
bias_dict_ftu = create_bias_dict(credit_data, {'approved': ['ethnicity']})
print('Bias dict FTU:', bias_dict_ftu)

# Bias dictionary to satisfy DP: maps the index of 'approved' to the indices of
# 'married' and 'ethnicity' (in that order).
bias_dict_dp = create_bias_dict(credit_data, {'approved': ['married','ethnicity']})
print('Bias dict DP:', bias_dict_dp)

Bias dict ND: {}
Bias dict FTU: {15: [6]}
Bias dict DP: {15: [3, 6]}


In [6]:
# Seed Python, NumPy and PyTorch so that Restart & Run All yields the same split
# and (as far as seeding allows) the same trained model and synthetic sample.
SEED = 42
pl.seed_everything(SEED)

# Split data into train and testing sets (fixed random_state => reproducible split)
X_train, X_test = train_test_split(credit_data, test_size=0.2, random_state=SEED)

# NOTE: train_decaf loads a cached model whenever its cache file already exists.
# A model cached from a *different* split may have seen rows that are now in
# X_test, so delete the cache file after changing SEED or the split.
# Only the test-sized synthetic frame is kept; the train-sized one is discarded.
_, X_synth = train_decaf(X_train, dag_seed_paper, X_test, bias=-1, epochs=250, generate_test=True)

Initialised adjacency matrix as parsed:
 Parameter containing:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 

  rank_zero_deprecation("DataModule property `dims` was deprecated in v1.5 and will be removed in v1.7.")
  rank_zero_deprecation("DataModule property `dims` was deprecated in v1.5 and will be removed in v1.7.")


In [7]:
# Show the synthetic rows whose 'approved' value is positive.
X_synth.loc[X_synth['approved'] > 0]

Unnamed: 0,male,age,debt,married,bankcustomer,educationlevel,ethnicity,yearsemployed,priordefault,employed,creditscore,driverslicense,citizen,zip,income,approved
619,0.998504,0.041671,0.16624,0.697011,0.687802,0.362353,1.0,0.203573,0.008763,1.0,0.003966,0.9961885,4.121422e-06,0.677733,0.002375,1.0
463,0.296814,0.403299,0.011599,0.710245,0.651135,0.798047,1.0,0.00022,0.008544,0.999984,0.000951,4.226015e-11,6.113227e-06,0.029042,0.000277,1.0
213,0.986046,0.044959,0.009857,0.68665,0.285029,0.242211,1.0,0.137373,0.008194,1.0,0.000153,0.9958941,1.074315e-12,0.047845,0.000284,1.0
165,0.291636,0.047369,0.005433,0.682305,0.276013,0.796807,1.0,0.060315,0.01013,1.0,0.000146,0.9955711,1.015533e-10,0.205406,0.001334,1.0
2,0.999999,0.296838,0.007145,0.711416,0.299072,0.798134,1.0,0.001347,0.00812,0.999233,0.000262,0.1849099,1.430713e-08,0.680122,0.000992,1.0
80,0.999961,0.352864,0.327708,0.681553,0.257903,0.351548,1.0,0.008755,0.06571,1.0,0.023908,0.9957348,0.03901292,0.228109,0.000182,1.0
34,0.474442,0.460031,0.083659,0.706829,0.636795,0.787074,1.0,0.369804,0.009402,0.06537,8.9e-05,5.154211e-08,0.005206937,0.583997,0.00028,1.0
42,0.999846,0.452056,0.108552,0.700555,0.296957,0.402053,1.0,0.007048,0.00982,1.0,0.00841,2.474727e-05,3.474187e-13,0.487198,0.000159,1.0
346,0.285789,0.403353,0.00394,0.673438,0.33843,0.795446,1.0,0.435052,0.063813,0.999207,0.000156,0.9956285,1.234888e-10,0.009736,0.000644,1.0
112,0.991927,0.178315,0.293606,0.707005,0.332864,0.798147,1.0,0.093287,0.008819,1.0,9.7e-05,0.6835418,0.9440049,0.002376,0.000106,1.0


In [9]:
# Persist the real test split and its synthetic counterpart as CSV (index included)
# so that the evaluation cells below can reload them from disk.
# NOTE(review): the original comment says this is "due to dependencies" — presumably
# the evaluation runs in a separate environment/kernel; confirm.
X_test.to_csv('x_test_credit.csv')
X_synth.to_csv('x_test_synth.csv')

#### Evaluate the synthetic data

In [1]:
# synthcity dataloader and evaluation
import pandas as pd
import numpy as np

from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.metrics import eval_detection, eval_performance, eval_statistical



In [2]:
def evaluate_synthetic(X_synth, X_test):
    """Score synthetic data against real data with synthcity evaluators.

    Every evaluator is invoked as ``evaluate(X_test, X_synth)``.

    Returns
    -------
    tuple
        ``(qual_score, (gt_perf, synth_perf), det_score)`` where

        * ``qual_score`` is the mean of the AlphaPrecision results whose key
          contains ``"naive"``;
        * ``gt_perf`` / ``synth_perf`` are the averages of the ``"gt"`` /
          ``"syn_ood"`` entries over the XGB, linear and MLP performance
          evaluators;
        * ``det_score`` is the average of the ``"mean"`` entries over the XGB,
          MLP and GMM synthetic-data detectors.
    """
    # --- quality: keep only the naive AlphaPrecision implementation ---------
    alpha_precision = eval_statistical.AlphaPrecision().evaluate(X_test, X_synth)
    naive_values = [value for key, value in alpha_precision.items() if "naive" in key]
    qual_score = np.mean(naive_values)

    # --- performance: instantiate all evaluators first, then run them -------
    performance_evaluators = [
        eval_performance.PerformanceEvaluatorXGB(),
        eval_performance.PerformanceEvaluatorLinear(),
        eval_performance.PerformanceEvaluatorMLP(),
    ]
    performance = [
        evaluator.evaluate(X_test, X_synth) for evaluator in performance_evaluators
    ]
    gt_perf = sum(score["gt"] for score in performance) / 3
    synth_perf = sum(score["syn_ood"] for score in performance) / 3

    # --- detection: same instantiate-then-run order --------------------------
    detectors = [
        eval_detection.SyntheticDetectionXGB(),
        eval_detection.SyntheticDetectionMLP(),
        eval_detection.SyntheticDetectionGMM(),
    ]
    detections = [detector.evaluate(X_test, X_synth) for detector in detectors]
    det_score = sum(result["mean"] for result in detections) / 3

    return qual_score, (gt_perf, synth_perf), det_score


In [3]:
# Reload the real test split and the synthetic test-sized frame from the CSV files
# written earlier in this notebook; column 0 of each file is used as the index.
# NOTE(review): the original comment attributes this round-trip to "dependency
# issues" — presumably the evaluation libraries live in a separate environment; confirm.
X_test = pd.read_csv('x_test_credit.csv', index_col = 0)
X_synth_test = pd.read_csv('x_test_synth.csv', index_col=0)

In [4]:
label = "approved"

# Wrap both frames in synthcity loaders sharing the same target column
# (synthetic loader is built first, then the real one).
X_synth_loader, X_test_loader = (
    GenericDataLoader(frame, target_column=label)
    for frame in (X_synth_test, X_test)
)

# res == (quality, (performance on real, performance on synth), detection)
res = evaluate_synthetic(X_synth_loader, X_test_loader)

print(f"Quality: {res[0]:.3f}")
print(f"Detection: {res[2]:.3f}")
print(
    f"Performance on real: {res[1][0]:.3f}, on synth: {res[1][1]:.3f}, diff: {(res[1][0] - res[1][1]):.3f}"
)

Quality: 0.532
Detection: 0.725
Performance on real: 0.887, on synth: 0.663, diff: 0.224
