In [1]:
import copy
import json
import os
from os import mkdir, path
from numpy.random import choice, seed
from argparse import ArgumentParser
from pandas import DataFrame, concat

#import pickle
import pickle5 as pickle
from utils.datagen import load_s3_data_as_df, load_local_data_as_df
from utils.utils import json_numpy_serialzer
from utils.logging import LOGGER
from utils.constants import *

from feature_sets.independent_histograms import HistogramFeatureSet
from feature_sets.model_agnostic import NaiveFeatureSet, EnsembleFeatureSet
from feature_sets.bayes import CorrelationsFeatureSet

In [2]:
from generative_models.data_synthesiser import BayesianNet

In [3]:
from attack_models.mia_classifier import (MIAttackClassifierRandomForest,
                                          generate_mia_shadow_data,
                                          generate_mia_anon_data)

from warnings import simplefilter
# Silence FutureWarning and DeprecationWarning messages for the whole session.
for warningCategory in (FutureWarning, DeprecationWarning):
    simplefilter('ignore', category=warningCategory)

# Seed passed to numpy's global random generator by the cells below.
SEED = 42

In [4]:
# Root directory of the project checkout; every data/config/output path in this
# notebook is built by appending a relative path to it. The location can be
# overridden with the SYNTHETIC_DATA_RELEASE_DIR environment variable; the
# previous hard-coded location stays the default.
# os.path.join(..., '') guarantees the trailing separator that the plain string
# concatenations (cwd + relative_path) used by other cells rely on.
cwd = os.path.join(
    os.environ.get('SYNTHETIC_DATA_RELEASE_DIR',
                   "/home/cs/grad/sarminf/myProjects/synthetic_data_release/"),
    '',
)

In [None]:
# Path relative to cwd of the dataset
datapath = "data/texas"
# Path relative to cwd of runconfig file
runconfigPath = "tests/linkage/runconfig.json"
# Path relative to cwd for storing output files
outdir = "tests/linkage"

# Load runconfig: parse the JSON file into the `runconfig` dict used by all later cells
with open(cwd + runconfigPath) as f:
    runconfig = json.load(f)
print('Runconfig:')
print(runconfig)

In [6]:
# Load data: the population as a DataFrame plus its metadata description
rawPop, metadata = load_local_data_as_df(cwd+datapath)
dname = datapath.split('/')[-1]  # last path component, e.g. 'texas' for 'data/texas'

print(f'Loaded data {dname}:')
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
rawPop.info()

# Make sure outdir exists. The results are written below cwd, so the directory
# is created there (not relative to the kernel's working directory), and
# makedirs also creates missing parent directories.
os.makedirs(path.join(cwd, outdir), exist_ok=True)

# Seed numpy's global RNG so the random draws in the following cells are repeatable
seed(SEED)

Loaded data texas:
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, ID0 to ID99999
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   DISCHARGE                     100000 non-null  object 
 1   TYPE_OF_ADMISSION             100000 non-null  object 
 2   PAT_STATE                     100000 non-null  object 
 3   PAT_STATUS                    100000 non-null  object 
 4   SEX_CODE                      100000 non-null  object 
 5   RACE                          100000 non-null  object 
 6   ETHNICITY                     100000 non-null  object 
 7   ADMIT_WEEKDAY                 100000 non-null  object 
 8   PAT_AGE                       100000 non-null  object 
 9   RISK_MORTALITY                100000 non-null  object 
 10  ILLNESS_SEVERITY              100000 non-null  object 
 11  LENGTH_OF_STAY                100000 non-null  int64  
 12  TOTAL_CHARGES              

In [7]:
    ########################
    #### GAME INPUTS #######
    ########################
# Pick targets: draw runconfig['nTargets'] record IDs from the population,
# sampling without replacement.
candidateIDs = list(rawPop.index)
nRandomTargets = runconfig['nTargets']
targetIDs = choice(candidateIDs, size=nRandomTargets, replace=False).tolist()
targetIDs

[]

In [8]:
# If specified: Add specific target records, together with the records listed
# under 'TarAgainst' in the runconfig.
targetAgainst = []
if runconfig['Targets'] is not None:
    # Only add IDs that are not listed yet: re-running this cell must not
    # append the configured targets a second time.
    for configuredID in runconfig['Targets']:
        if configuredID not in targetIDs:
            targetIDs.append(configuredID)
    # 'TarAgainst' may be missing or null in the runconfig; treat that as "none".
    targetAgainst.extend(runconfig.get('TarAgainst') or [])
print(targetIDs)
print(targetAgainst)

['ID14086']
['ID80559']


In [9]:
# Records of every target and of every "target against" record, looked up from
# the configured ID lists instead of hard-coded IDs. dict.fromkeys removes
# repeated IDs while keeping their order, so each record appears exactly once.
targets = rawPop.loc[list(dict.fromkeys(targetIDs + targetAgainst))]

In [10]:
# Display the selected target records
targets

Unnamed: 0_level_0,DISCHARGE,TYPE_OF_ADMISSION,PAT_STATE,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,PAT_AGE,RISK_MORTALITY,ILLNESS_SEVERITY,LENGTH_OF_STAY,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ID14086,2013Q1,3,TX,3,F,4,2,1,18,4,4,16,145746.55,0.0,33915.0,0.0,116847.2265,0.0
ID80559,2014Q3,1,TX,20,M,4,2,5,14,4,4,16,145746.55,0.0,33915.0,0.0,116847.2265,0.0


In [11]:
# Drop targets from population: remove exactly the records held in `targets`
# (rather than repeating hard-coded IDs) so both stay in sync.
rawPopDropTargets = rawPop.drop(targets.index)
rawPopDropTargets

Unnamed: 0_level_0,DISCHARGE,TYPE_OF_ADMISSION,PAT_STATE,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,PAT_AGE,RISK_MORTALITY,ILLNESS_SEVERITY,LENGTH_OF_STAY,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ID0,2013Q4,3,TX,6,F,4,2,3,16,1,1,4,55076.06,0.0,5328.00,0.0,49748.06,0.0
ID1,2013Q1,4,TX,1,F,4,2,4,01,1,1,2,2119.00,0.0,1790.00,0.0,329.00,0.0
ID2,2013Q1,1,TX,1,F,4,2,1,11,1,1,1,9534.00,0.0,969.00,0.0,8565.00,0.0
ID3,2013Q3,2,TX,3,M,4,2,1,18,2,1,3,70710.30,0.0,2172.00,0.0,68538.30,0.0
ID4,2013Q4,2,TX,1,M,4,2,3,16,3,3,3,66653.04,0.0,10185.00,0.0,56468.04,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID99995,2014Q4,2,TX,1,F,4,2,2,01,2,3,2,23188.62,0.0,3443.06,0.0,19745.56,0.0
ID99996,2014Q2,3,TX,61,F,4,2,5,15,3,4,16,135871.32,0.0,33915.00,0.0,78730.32,0.0
ID99997,2014Q2,3,TX,1,F,4,2,2,18,2,2,16,28815.16,0.0,20614.00,0.0,8201.16,0.0
ID99998,2014Q3,1,TX,1,M,3,2,3,12,1,1,2,21885.58,0.0,1246.24,0.0,20639.34,0.0


In [12]:
# Sample, without replacement, runconfig['sizeRawA'] record IDs from the
# population that no longer contains the targets.
nonTargetIDs = list(rawPopDropTargets.index)
rawAidx = choice(nonTargetIDs, size=runconfig['sizeRawA'], replace=False).tolist()
print(len(rawAidx))

10000


In [13]:
# Rows of the population selected by the sampled IDs (all columns kept)
rawA = rawPop.loc[rawAidx]

In [14]:
from sklearn.model_selection import ShuffleSplit

# Re-seed numpy's global RNG before building the subsets
seed(SEED)
# 10 random splits whose "train" part holds 1000 rows of rawA each
kf = ShuffleSplit(n_splits=10, train_size=1000, random_state=0)
HundredIndex = []
for train_index, _ in kf.split(rawA):
    # Positional selection: .iloc for a DataFrame, plain 2-D indexing otherwise
    rawAout = rawA.iloc[train_index] if isinstance(rawA, DataFrame) else rawA[train_index, :]
    # Keep only the index labels of the selected rows
    HundredIndex.append(rawAout.index)

In [15]:
# Display the rows of rawA that belong to the first subset
rawA.loc[HundredIndex[0]]

Unnamed: 0_level_0,DISCHARGE,TYPE_OF_ADMISSION,PAT_STATE,PAT_STATUS,SEX_CODE,RACE,ETHNICITY,ADMIT_WEEKDAY,PAT_AGE,RISK_MORTALITY,ILLNESS_SEVERITY,LENGTH_OF_STAY,TOTAL_CHARGES,TOTAL_NON_COV_CHARGES,TOTAL_CHARGES_ACCOMM,TOTAL_NON_COV_CHARGES_ACCOMM,TOTAL_CHARGES_ANCIL,TOTAL_NON_COV_CHARGES_ANCIL
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ID71705,2014Q4,3,TX,6,F,4,2,6,17,2,2,10,15206.00,0.0,8380.0,0.0,6826.00,0.0
ID31142,2013Q3,2,TX,1,F,5,2,3,01,1,2,1,15561.64,0.0,1399.0,0.0,14162.64,0.0
ID27601,2013Q1,3,TX,6,M,4,2,3,14,1,3,16,45919.53,0.0,15840.0,0.0,30079.53,0.0
ID84590,2014Q3,1,TX,1,F,5,2,7,01,1,1,2,14297.10,0.0,1560.0,0.0,12737.10,0.0
ID16375,2013Q4,2,TX,1,M,4,2,6,01,1,1,6,9036.00,0.0,9036.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID2653,2013Q1,3,TX,1,F,5,1,4,01,1,2,3,32877.12,0.0,5792.5,0.0,27084.62,0.0
ID20522,2013Q4,1,TX,3,F,5,2,2,16,1,2,3,49846.00,0.0,6363.0,0.0,43483.00,0.0
ID89449,2014Q2,1,TX,1,F,3,2,2,01,1,3,9,13642.00,0.0,8118.0,0.0,5524.00,0.0
ID92528,2014Q2,4,TX,1,F,5,1,2,01,1,1,2,3433.65,0.0,1155.0,0.0,2278.65,0.0


In [16]:
# Show which settings the loaded runconfig provides
runconfig.keys()

dict_keys(['nIter', 'sizeRawA', 'nSynA', 'nShadows', 'sizeRawT', 'sizeSynT', 'nSynT', 'nSynToriginal', 'nTargets', 'Targets', 'TarAgainst', 'TarAgainstNewOutliers', 'TarRest', 'PrivBayes', 'CTGAN', 'PATEGAN', 'generativeModels', 'SanitiserNHS', 'sanitisationTechniques'])

In [17]:
# Show the configured generative models and their parameter lists
runconfig['generativeModels'].items()

dict_items([('BayesianNet', [[25, 1]])])

In [18]:
def _lookup_model_class(className):
    """Return the class bound to `className` in the notebook namespace.

    Raises:
        NameError: if the class has not been imported in this notebook. The
            message names the missing class so the failure points at the
            missing import rather than at the model-building loop.
    """
    try:
        return globals()[className]
    except KeyError:
        raise NameError(
            f'{className} is requested in the runconfig but has not been imported in this notebook'
        ) from None


# Names accepted under runconfig['generativeModels'] / runconfig['sanitisationTechniques']
knownGenerativeModels = ('CTGAN', 'BayesianNet', 'PrivBayes', 'PATEGAN')
knownSanitisers = ('SanitiserNHS',)

# List of candidate generative models to evaluate (needed by the attack
# training and the game loops below). One instance is built per parameter set.
gmList = []
if 'generativeModels' in runconfig.keys():
    for gm, paramsList in runconfig['generativeModels'].items():
        print(paramsList)
        print("\n\n")
        # Unknown names used to be skipped silently; fail loudly instead, the
        # same way unknown sanitisation techniques do.
        if gm not in knownGenerativeModels:
            raise ValueError(f'Unknown generative model {gm}')
        GenModelClass = _lookup_model_class(gm)
        for params in paramsList:
            gmList.append(GenModelClass(metadata, *params))

# List of sanitisation techniques to evaluate, built the same way
sanList = []
if 'sanitisationTechniques' in runconfig.keys():
    for name, paramsList in runconfig['sanitisationTechniques'].items():
        print(paramsList)
        print("\n\n")

        if name not in knownSanitisers:
            raise ValueError(f'Unknown sanitisation technique {name}')
        SanitiserClass = _lookup_model_class(name)
        for params in paramsList:
            sanList.append(SanitiserClass(metadata, *params))

[[25, 1]]





In [19]:
    ###################################
    #### ATTACK TRAINING #############
    ##################################
print('\n---- Attack training ----')
# Trained attacks; the training loop fills this as
# attacks[target ID][generative model name][feature set name] -> attack object
attacks = {}


---- Attack training ----


In [20]:
import copy
import pickle

synA = []
sanA = []
labelsA = []
for tid in targetIDs:
    print(f'\n--- Adversary picks target {tid} ---')
    # Single-row DataFrame holding the target record
    target = targets.loc[[tid]]
    attacks[tid] = {}
    for GenModel in gmList:
        modelName = GenModel.__name__
        LOGGER.info(f'Start: Attack training for {modelName}...')

        attacks[tid][modelName] = {}

        # Labelled synthetic training data for the attack, generated from rawA
        synA, labelsSA = generate_mia_shadow_data(
            GenModel, target, rawA,
            runconfig['sizeRawT'], runconfig['sizeSynT'],
            runconfig['nShadows'], runconfig['nSynA'],
        )

        # One random-forest attack is trained per feature set
        featureSets = [
            NaiveFeatureSet(GenModel.datatype),
            HistogramFeatureSet(GenModel.datatype, metadata),
            CorrelationsFeatureSet(GenModel.datatype, metadata),
        ]
        for Feature in featureSets:
            Attack = MIAttackClassifierRandomForest(metadata, Feature)
            Attack.train(synA, labelsSA)
            attacks[tid][modelName][f'{Feature.__name__}'] = Attack

        # Clean up the training data before moving on to the next model
        del synA, labelsSA

        LOGGER.info('Finished: Attack training.')


2024-12-11 11:44:39,836:root:INFO:Start: Attack training for BayesianNet...



--- Adversary picks target ID14086 ---
multiprocess


2024-12-11 11:45:37,765:root:INFO:Finished: Attack training.


In [21]:
    ##################################
    ######### EVALUATION #############
    ##################################
# Result store: for every target, one initially empty dict per generative
# model and per sanitisation technique, keyed by the model's __name__.
resultsTargetPrivacy = {
    tid: {model.__name__: {} for model in gmList + sanList}
    for tid in targetIDs
}

# Without non-member outlier

In [22]:
# Game without the non-member outlier.
# In every iteration a fresh raw dataset is sampled from the population without
# the targets. Each generative model is fitted once on that dataset (target
# OUT) and once per target on the dataset plus the target record (target IN).
# The trained attacks then guess, for every synthetic dataset, whether the
# target was part of the training data.
import pickle
non_sens_cols = ['DISCHARGE', 'TYPE_OF_ADMISSION', 'PAT_STATUS', 'ADMIT_WEEKDAY', 'RISK_MORTALITY','ILLNESS_SEVERITY', 'LENGTH_OF_STAY', 'TOTAL_CHARGES', 'TOTAL_NON_COV_CHARGES', 'TOTAL_CHARGES_ACCOMM', 'TOTAL_NON_COV_CHARGES_ACCOMM', 'TOTAL_CHARGES_ANCIL','TOTAL_NON_COV_CHARGES_ANCIL']

# Number of game iterations.
# NOTE(review): the runconfig has an 'nIter' key that presumably is meant to
# control this; the result-summary cells assume 15 iterations, so confirm both
# before replacing the literal.
nGames = 15

# Copies of every sampled raw dataset and of every synthetic dataset per target
raw_setsA = []
synth_sets = {tid: [] for tid in targetIDs}

print('\n---- Start the game ----')
for nr in range(nGames):
    print(f'\n--- Game iteration {nr + 1} ---')

    rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist()
    rawTout = rawPopDropTargets.loc[rIdx]

    rawToutB = copy.deepcopy(rawTout)
    raw_setsA.append(rawToutB)
    for GenModel in gmList:
        modelName = GenModel.__name__
        LOGGER.info(f'Start: Evaluation for model {modelName}...')
        # Train a generative model on the raw data that does NOT contain the target
        GenModel.fit(rawTout)
        synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])]
        synLabelsOut = [LABEL_OUT for _ in range(runconfig['nSynT'])]

        for tid in targetIDs:
            LOGGER.info(f'Target: {tid}')
            target = targets.loc[[tid]]
            resultsTargetPrivacy[tid][modelName][nr] = {}

            # Same raw data plus the target record. pandas.concat replaces
            # DataFrame.append, which was removed in pandas 2.0.
            rawTin = concat([rawTout, target])
            GenModel.fit(rawTin)
            synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])]
            synLabelsIn = [LABEL_IN for _ in range(runconfig['nSynT'])]

            # OUT datasets first, IN datasets second; labels are in the same order
            synT = synTwithoutTarget + synTwithTarget
            synTlabels = synLabelsOut + synLabelsIn

            synB = copy.deepcopy(synT)
            synth_sets[tid].extend(synB)

            # Run attacks
            for feature, Attack in attacks[tid][modelName].items():
                # Produce a guess for each synthetic dataset
                attackerGuesses = Attack.attack(synT)

                resDict = {
                    'Secret': synTlabels,
                    'AttackerGuess': attackerGuesses
                }
                resultsTargetPrivacy[tid][modelName][nr][feature] = resDict

        # Free the synthetic datasets before the next model. synT and
        # synTwithTarget only exist if at least one target was processed.
        del synTwithoutTarget
        if targetIDs:
            del synT, synTwithTarget

        LOGGER.info(f'Finished: Evaluation for model {modelName}.')

    

2024-12-11 11:45:37,917:root:INFO:Start: Evaluation for model BayesianNet...



---- Start the game ----

--- Game iteration 1 ---


2024-12-11 11:45:42,100:root:INFO:Target: ID14086
2024-12-11 11:45:48,050:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:45:48,067:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 2 ---


2024-12-11 11:45:51,898:root:INFO:Target: ID14086
2024-12-11 11:45:57,561:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:45:57,578:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 3 ---


2024-12-11 11:46:01,452:root:INFO:Target: ID14086
2024-12-11 11:46:07,117:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:07,135:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 4 ---


2024-12-11 11:46:11,337:root:INFO:Target: ID14086
2024-12-11 11:46:17,001:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:17,019:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 5 ---


2024-12-11 11:46:21,058:root:INFO:Target: ID14086
2024-12-11 11:46:26,722:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:26,739:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 6 ---


2024-12-11 11:46:30,703:root:INFO:Target: ID14086
2024-12-11 11:46:36,391:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:36,408:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 7 ---


2024-12-11 11:46:40,476:root:INFO:Target: ID14086
2024-12-11 11:46:46,166:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:46,184:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 8 ---


2024-12-11 11:46:50,164:root:INFO:Target: ID14086
2024-12-11 11:46:55,925:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:46:55,942:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 9 ---


2024-12-11 11:46:59,921:root:INFO:Target: ID14086
2024-12-11 11:47:05,899:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:05,916:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 10 ---


2024-12-11 11:47:09,840:root:INFO:Target: ID14086
2024-12-11 11:47:15,617:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:15,634:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 11 ---


2024-12-11 11:47:19,559:root:INFO:Target: ID14086
2024-12-11 11:47:25,228:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:25,245:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 12 ---


2024-12-11 11:47:29,201:root:INFO:Target: ID14086
2024-12-11 11:47:34,904:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:34,921:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 13 ---


2024-12-11 11:47:38,787:root:INFO:Target: ID14086
2024-12-11 11:47:44,503:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:44,520:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 14 ---


2024-12-11 11:47:48,546:root:INFO:Target: ID14086
2024-12-11 11:47:54,515:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:47:54,532:root:INFO:Start: Evaluation for model BayesianNet...



--- Game iteration 15 ---


2024-12-11 11:47:58,498:root:INFO:Target: ID14086
2024-12-11 11:48:04,189:root:INFO:Finished: Evaluation for model BayesianNet.


In [23]:
outfile = "ResultsMIA_texas"
# Build the destination from cwd/outdir/outfile so that the logged path is the
# path that is actually written (previously the log showed a relative path
# without extension while the file went to a hard-coded absolute path).
resultsPath = path.join(cwd, outdir, f'{outfile}.json')
LOGGER.info(f"Write results to {resultsPath}")

# json_numpy_serialzer is passed as `default` to serialise values the json
# module cannot handle natively.
with open(resultsPath, 'w') as f:
    json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer)
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Suppress all warnings for the analysis part of the notebook
from warnings import filterwarnings
filterwarnings('ignore')
import sys
# Add the parent directory to the module search path
sys.path.append('../')
# Wildcard import of the result-analysis helpers
from utils.analyse_results import *

2024-12-11 11:48:04,195:root:INFO:Write results to tests/linkage/ResultsMIA_texas


In [24]:
from utils.analyse_results import load_results_linkage
# Directory holding the result files: derived from cwd/outdir instead of
# repeating the absolute path, and actually passed to the loader.
dirname = path.join(cwd, outdir)
load_results_linkage(dirname)

/home/cs/grad/sarminf/myProjects/synthetic_data_release/tests/linkage/ResultsMIA_texas.json


Unnamed: 0,TargetID,TargetModel,FeatureSet,Run,TPSyn,FPSyn,AdvantageSyn,AdvantageRaw,PrivacyGain
0,ID14086,BayesianNet,Correlations,0,0.5,0.4,0.1,1,0.9
1,ID14086,BayesianNet,Correlations,1,0.9,0.6,0.3,1,0.7
2,ID14086,BayesianNet,Correlations,10,0.4,0.3,0.1,1,0.9
3,ID14086,BayesianNet,Correlations,11,0.8,0.4,0.4,1,0.6
4,ID14086,BayesianNet,Correlations,12,0.2,0.4,-0.2,1,1.2
5,ID14086,BayesianNet,Correlations,13,0.4,0.5,-0.1,1,1.1
6,ID14086,BayesianNet,Correlations,14,0.7,0.2,0.5,1,0.5
7,ID14086,BayesianNet,Correlations,2,0.4,0.2,0.2,1,0.8
8,ID14086,BayesianNet,Correlations,3,0.5,0.3,0.2,1,0.8
9,ID14086,BayesianNet,Correlations,4,0.6,0.7,-0.1,1,1.1


In [25]:
gg = load_results_linkage(dirname)
MYdf = []

# Scale factor that turns the per-run rates TPSyn/FPSyn into counts.
# NOTE(review): presumably this is runconfig['nSynT'] (synthetic datasets per
# label and run) - confirm before replacing the literal.
N_SYN_PER_LABEL = 10
# Number of game iterations; matches the `range(15)` of the game loops.
N_GAME_ITERATIONS = 15
# Total number of guesses per label over all runs (150)
nGuessesPerLabel = N_SYN_PER_LABEL * N_GAME_ITERATIONS

# The groupby already covers every target, so no loop over targetIDs is needed
# (the former loop stopped after its first pass and never used its variable).
# Results are only summarised when at least one target exists.
if targetIDs:
    df2 = gg.groupby(['TargetID', 'TargetModel', 'FeatureSet'])[['TPSyn', 'FPSyn']].sum()
    # Convert the summed rates into counts
    df2['TPSyn'] *= N_SYN_PER_LABEL
    df2['FPSyn'] *= N_SYN_PER_LABEL
    # Compute PrivacyGain = 1 - (overall TP rate - overall FP rate)
    df2['PrivacyGain'] = 1 - ((df2['TPSyn'] / nGuessesPerLabel) - (df2['FPSyn'] / nGuessesPerLabel))
    MYdf.append(df2)
    print(df2)

/home/cs/grad/sarminf/myProjects/synthetic_data_release/tests/linkage/ResultsMIA_texas.json
                                   TPSyn  FPSyn  PrivacyGain
TargetID TargetModel FeatureSet                             
ID14086  BayesianNet Correlations   87.0   75.0     0.920000
                     Histogram      95.0   94.0     0.993333
                     Naive          61.0   62.0     1.006667


# With non-member outlier

In [26]:
# Game with the non-member outlier.
# Same game as before, except that the "target OUT" training data additionally
# contains the record listed in targetAgainst for that target.
# Note: the results are stored under the same
# resultsTargetPrivacy[target][model][iteration] keys as the previous game and
# therefore replace its entries.
import pickle

# Number of game iterations.
# NOTE(review): the runconfig has an 'nIter' key that presumably is meant to
# control this; the result-summary cells assume 15 iterations, so confirm both
# before replacing the literal.
nGames = 15

print('\n---- Start the game ----')
for nr in range(nGames):
    print(f'\n--- Game iteration {nr + 1} ---')

    rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist()
    rawTout = rawPopDropTargets.loc[rIdx]

    for GenModel in gmList:
        modelName = GenModel.__name__
        LOGGER.info(f'Start: Evaluation for model {modelName}...')

        for ik, tid in enumerate(targetIDs):
            # targetIDs and targetAgainst are paired by position
            tidvs = targetAgainst[ik]
            LOGGER.info(f'Target: {tid}, TargetAgainst: {tidvs}')
            target = targets.loc[[tid]]
            targetvs = targets.loc[[tidvs]]
            resultsTargetPrivacy[tid][modelName][nr] = {}

            # Train a generative model with non-member outlier (target OUT).
            # pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
            rawTout2 = concat([rawTout, targetvs])
            GenModel.fit(rawTout2)
            synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])]
            synLabelsOut = [LABEL_OUT for _ in range(runconfig['nSynT'])]

            # Train a generative model on the raw data plus the target (target IN)
            GenModel.fit(concat([rawTout, target]))
            synTwithTarget2 = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])]
            synLabelsIn2 = [LABEL_IN for _ in range(runconfig['nSynT'])]

            # OUT datasets first, IN datasets second; labels are in the same order
            synT = synTwithoutTarget + synTwithTarget2
            synTlabels = synLabelsOut + synLabelsIn2

            # Run attacks
            for feature, Attack in attacks[tid][modelName].items():
                # Produce a guess for each synthetic dataset
                attackerGuesses = Attack.attack(synT)

                resDict = {
                    'Secret': synTlabels,
                    'AttackerGuess': attackerGuesses
                }
                resultsTargetPrivacy[tid][modelName][nr][feature] = resDict

        # Free the synthetic datasets before the next model; they only exist
        # if at least one target was processed.
        if targetIDs:
            del synT, synTwithoutTarget, synTwithTarget2

        LOGGER.info(f'Finished: Evaluation for model {modelName}.')
    

2024-12-11 11:49:11,389:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:49:11,389:root:INFO:Target: ID14086, TargetAgainst: ID80559



---- Start the game ----

--- Game iteration 1 ---


2024-12-11 11:49:21,452:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:49:21,470:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:49:21,470:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 2 ---


2024-12-11 11:49:31,236:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:49:31,254:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:49:31,254:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 3 ---


2024-12-11 11:49:40,820:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:49:40,837:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:49:40,838:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 4 ---


2024-12-11 11:49:50,319:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:49:50,337:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:49:50,337:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 5 ---


2024-12-11 11:50:00,125:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:00,142:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:00,143:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 6 ---


2024-12-11 11:50:09,842:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:09,859:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:09,860:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 7 ---


2024-12-11 11:50:19,425:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:19,443:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:19,443:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 8 ---


2024-12-11 11:50:28,966:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:28,983:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:28,984:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 9 ---


2024-12-11 11:50:38,466:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:38,483:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:38,484:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 10 ---


2024-12-11 11:50:48,322:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:48,339:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:48,339:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 11 ---


2024-12-11 11:50:58,205:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:50:58,222:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:50:58,223:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 12 ---


2024-12-11 11:51:07,800:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:51:07,817:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:51:07,817:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 13 ---


2024-12-11 11:51:17,655:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:51:17,673:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:51:17,673:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 14 ---


2024-12-11 11:51:27,436:root:INFO:Finished: Evaluation for model BayesianNet.
2024-12-11 11:51:27,454:root:INFO:Start: Evaluation for model BayesianNet...
2024-12-11 11:51:27,454:root:INFO:Target: ID14086, TargetAgainst: ID80559



--- Game iteration 15 ---


2024-12-11 11:51:36,933:root:INFO:Finished: Evaluation for model BayesianNet.


In [27]:
outfile = "ResultsMIA_texas"
# Build the destination from cwd/outdir/outfile so that the logged path is the
# path that is actually written (previously the log showed a relative path
# without extension while the file went to a hard-coded absolute path).
resultsPath = path.join(cwd, outdir, f'{outfile}.json')
LOGGER.info(f"Write results to {resultsPath}")

# json_numpy_serialzer is passed as `default` to serialise values the json
# module cannot handle natively.
with open(resultsPath, 'w') as f:
    json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer)

2024-12-11 11:51:36,948:root:INFO:Write results to tests/linkage/ResultsMIA_texas


In [29]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Suppress all warnings for the analysis part of the notebook
from warnings import filterwarnings
filterwarnings('ignore')

import sys
# Add the parent directory to the module search path
sys.path.append('../')
# Wildcard import of the result-analysis helpers
from utils.analyse_results import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from utils.analyse_results import load_results_linkage
# Directory holding the result files: derived from cwd/outdir instead of
# repeating the absolute path, and actually passed to the loader.
dirname = path.join(cwd, outdir)
load_results_linkage(dirname)

/home/cs/grad/sarminf/myProjects/synthetic_data_release/tests/linkage/ResultsMIA_texas.json


Unnamed: 0,TargetID,TargetModel,FeatureSet,Run,TPSyn,FPSyn,AdvantageSyn,AdvantageRaw,PrivacyGain
0,ID14086,BayesianNet,Correlations,0,0.7,0.7,0.0,1,1.0
1,ID14086,BayesianNet,Correlations,1,0.7,0.7,0.0,1,1.0
2,ID14086,BayesianNet,Correlations,10,0.8,0.6,0.2,1,0.8
3,ID14086,BayesianNet,Correlations,11,0.6,0.4,0.2,1,0.8
4,ID14086,BayesianNet,Correlations,12,0.8,0.7,0.1,1,0.9
5,ID14086,BayesianNet,Correlations,13,0.6,0.4,0.2,1,0.8
6,ID14086,BayesianNet,Correlations,14,0.4,0.5,-0.1,1,1.1
7,ID14086,BayesianNet,Correlations,2,0.3,0.6,-0.3,1,1.3
8,ID14086,BayesianNet,Correlations,3,0.5,0.6,-0.1,1,1.1
9,ID14086,BayesianNet,Correlations,4,0.3,0.1,0.2,1,0.8


In [31]:
gg = load_results_linkage(dirname)
MYdf = []

# Scale factor that turns the per-run rates TPSyn/FPSyn into counts.
# NOTE(review): presumably this is runconfig['nSynT'] (synthetic datasets per
# label and run) - confirm before replacing the literal.
N_SYN_PER_LABEL = 10
# Number of game iterations; matches the `range(15)` of the game loops.
N_GAME_ITERATIONS = 15
# Total number of guesses per label over all runs (150)
nGuessesPerLabel = N_SYN_PER_LABEL * N_GAME_ITERATIONS

# The groupby already covers every target, so no loop over targetIDs is needed
# (the former loop stopped after its first pass and never used its variable).
# Results are only summarised when at least one target exists.
if targetIDs:
    df2 = gg.groupby(['TargetID', 'TargetModel', 'FeatureSet'])[['TPSyn', 'FPSyn']].sum()
    # Convert the summed rates into counts
    df2['TPSyn'] *= N_SYN_PER_LABEL
    df2['FPSyn'] *= N_SYN_PER_LABEL
    # Compute PrivacyGain = 1 - (overall TP rate - overall FP rate)
    df2['PrivacyGain'] = 1 - ((df2['TPSyn'] / nGuessesPerLabel) - (df2['FPSyn'] / nGuessesPerLabel))
    MYdf.append(df2)
    print(df2)

/home/cs/grad/sarminf/myProjects/synthetic_data_release/tests/linkage/ResultsMIA_texas.json
                                   TPSyn  FPSyn  PrivacyGain
TargetID TargetModel FeatureSet                             
ID14086  BayesianNet Correlations   82.0   80.0     0.986667
                     Histogram      90.0   95.0     1.033333
                     Naive          87.0   88.0     1.006667
