### Import the Data

In [1]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# train_df = pd.read_csv('test_data/adults_train-test.csv')
train_df = pd.read_parquet('datasets/adults_train.parquet')

control_df = pd.read_csv('test_data/adults_control-test.csv')
synth_df = pd.read_parquet('test_data/adults_syn_ctgan.parquet')
synth_filepath = 'test_data/dataset_adults_train_ctgan_synthesizer.pkl'
synthesizer = CTGANSynthesizer.load(filepath=synth_filepath)

##make sure these libraries work
import pykeops 
#make sure omp.h is also working, may need to google if warning occurs or run this code block twice
#make sure all libraries are include, you may need to find the header files and move them locally so they can be imported
pykeops.clean_pykeops()   
pykeops.test_numpy_bindings() 
pykeops.test_torch_bindings() 


[KeOps] /Users/chhduong/.cache/keops2.1.2/Darwin_CHHDUONG-M-F2GZ_23.2.0_p3.9.6 has been cleaned.
[KeOps] Generating code for formula Sum_Reduction((Var(0,3,0)-Var(1,3,1))|(Var(0,3,0)-Var(1,3,1)),1) ... OK
[pyKeOps] Compiling pykeops cpp cb73cd1bce module ... 

<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.


OK
pyKeOps with numpy bindings is working!
pyKeOps with torch bindings is working!


### LabelOnlyDecisionBoundary?

In [3]:
# from art.attacks.inference.membership_inference import LabelOnlyDecisionBoundary


### DOMIAS Overfitting

In [2]:
#convert columns to numbers
# dummies = pd.get_dummies(
#     data=train_df, 
#     columns =['type_employer', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'country', 'income'], 
#     dtype=int
# )
# merged = pd.concat([train_df, dummies], axis='columns')
# merged.drop(
#     ['type_employer', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'country', 'income'], 
#     axis='columns', 
#     inplace=True
# )
# train_df = dummies
# train_df.to_parquet('test_data/temp_train.parquet')
# train_df = pd.read_parquet('test_data/temp_train.parquet')

## Easier to just drop all non-numerical columns
# train_df.drop(
#     ['type_employer', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'country', 'income'], 
#     axis='columns', 
#     inplace=True
# )

#smarter way to convert columns to numbers
transformed_df = train_df
for col in ['type_employer', 'education', 'marital', 'occupation', 'relationship', 'race', 'sex', 'country', 'income']:
    uniques = transformed_df[col].unique()
    for index, value in enumerate(uniques):
        transformed_df.loc[transformed_df[col] == value, col] = index+1

train_df = transformed_df
train_df.to_parquet('test_data/temp_train.parquet')
train_df = pd.read_parquet('test_data/temp_train.parquet')

## For some reason, country is causing the evaluator to run into an singular matrix exception.
train_df.drop(
    ['country'], 
    axis='columns', 
    inplace=True
)

  if _pandas_api.is_sparse(col):


In [3]:
from domias.evaluator import evaluate_performance
from domias.models.generator import GeneratorInterface
from sklearn.preprocessing import StandardScaler
import numpy as np

# DOMIAS Requires a generator derived from GeneratorInterface 
# The generator only requires 'fit' and 'generator' methods
# For testing, we set this up with CTGANSynthesizer
def get_generator(
    synthesizer,
    data
) -> GeneratorInterface:
    class LocalGenerator(GeneratorInterface):
        def __init__(self) -> None:
            self.model = synthesizer
            self.data = data
        def fit(self, data: pd.DataFrame) -> "LocalGenerator":
            self.model.fit(self.data)
            # self.model.fit(data)
            return self
        def generate(self, count: int) -> pd.DataFrame:
            return self.model.sample(count)
    return LocalGenerator()

mem_set_size = 30000
reference_set_size = 39000
training_epochs = 10
synthetic_sizes = [10000]
density_estimator = "prior"  # prior, kde, bnaf

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train_df)
# metadata_dict = metadata.to_dict()
# print(metadata_dict)
dataset = train_df.to_numpy()
# scaler = StandardScaler()
# dataset = scaler.fit_transform(train_df)

synthesizer = CTGANSynthesizer(
    metadata=metadata,
    epochs=training_epochs,
)
generator = get_generator(synthesizer, train_df)

perf = evaluate_performance(
    generator,
    dataset,
    mem_set_size,
    reference_set_size,
    training_epochs=training_epochs,
    synthetic_sizes=synthetic_sizes,
    density_estimator=density_estimator,
)



[KeOps] Generating code for formula Max_SumShiftExpWeight_Reduction(Concat(Var(2,1,1)-Var(3,1,2)*(((Var(0,1,0)-Var(1,1,1))*Sum(Var(0,1,0)-Var(1,1,1)))/2),1),0) ... OK
[pyKeOps] Compiling pykeops cpp c0c0296eaf module ... 

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


OK


  return 1 / (1 + np.exp(-x))


In [4]:
#DONE: Go through https://github.com/vanderschaarlab/DOMIAS/blob/main/tutorials/experiment_1_housing.ipynb

# print(perf)
# print(type(perf)

for key in perf[10000]:
    print(key)
    print(perf[10000][key])

# print(perf[10000]['data']['Xtest'][0])
# print(perf[10000]['data']['Ytest'][0])

# AUC by number of iterations
# output = pd.DataFrame([], columns=["epoch", "src", "aucroc"])
# for key in perf[10000]['MIA_performance']:
#     print("====" + str(key))
#     print(perf[10000]['MIA_performance'][key])
    # output = pd.concat(
    #     [
    #         output,
    #         pd.DataFrame(
    #             [
    #                 [training_epochs, key, perf[10000]['MIA_performance'][key]["aucroc"]],
    #             ],
    #             columns=["epoch", "src", "aucroc"],
    #         ),
    #     ]
    # )

MIA_performance
{'ablated_eq1': {'accuracy': 0.4964131994261119, 'aucroc': 0.4924055248007086}, 'ablated_eq2': {'accuracy': 0.49800163968026234, 'aucroc': 0.49265832964275175}, 'LOGAN_D1': {'accuracy': 0.5001024800163968, 'aucroc': 0.4996614020519634}, 'MC': {'accuracy': 0.49789915966386555, 'aucroc': 0.4978228391644523}, 'gan_leaks': {'accuracy': 0.3386452141832343, 'aucroc': 0.49597678255093}, 'gan_leaks_cal': {'accuracy': 0.3326501332240213, 'aucroc': 0.4958167478594627}, 'LOGAN_0': {'accuracy': 0.49897519983603195, 'aucroc': 0.4999799712134632}, 'eq1': {'accuracy': 0.4964131994261119, 'aucroc': 0.4924055248007086}, 'domias': {'accuracy': 0.4964131994261119, 'aucroc': 0.4924055248007086}}
MIA_scores
{'ablated_eq1': array([1.46462664e-23, 9.67326085e-23, 1.98645225e-21, ...,
       4.42223316e-23, 1.26233193e-21, 1.16925141e-22]), 'ablated_eq2': array([0.00134959, 0.00276494, 0.01685144, ..., 0.00313534, 0.02328711,
       0.00668818]), 'LOGAN_D1': array([19438.15 , 48468.234, 34145.

### Save and Load Results

In [11]:
results = perf[10000]['data']
# xtest = results['Xtest']
ytest = results['Ytest']
# type(xtest)
# xtest_df = pd.DataFrame(xtest)
# train_ndarray = train_df.to_numpy()

# #result: xtest and train data are the same, ytest is just whether the train data row exists in the synthetic dataset
# anonymeter uses: n_attacks and n_success

mia_scores = perf[10000]['MIA_scores']['domias']
y_pred = mia_scores > np.median(mia_scores)
n_attacks = y_pred.size
n_success = (y_pred == True).sum()
print(n_attacks)
print(n_success)

#need to save mia_scores, y_pred, and ytest
print(type(mia_scores))
print(type(y_pred))
print(type(ytest))

results_tosave = {
    'y_true': ytest,
    'y_pred': y_pred,
    'mia_scores': mia_scores
}
results_df = pd.DataFrame(data=results_tosave)
# results_df.to_parquet('test_data/temp_domias_attack.parquet')

39032
19516
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


  if _pandas_api.is_sparse(col):
