# Load Library

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits made to imported
# modules are picked up when a cell runs instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import numpy as np
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', np.RankWarning)

In [3]:
from tqdm import tqdm
import os
import data_utils
import model_utils
from attack_utils import get_CSMIA_case_by_case_results, CSMIA_attack, LOMIA_attack, get_LOMIA_results
from data_utils import oneHotCatVars, filter_random_data_by_conf_score
from experiment_utils import MIAExperiment
from disparity_inference_utils import get_confidence_array, draw_confidence_array_scatter, get_indices_by_group_condition, get_corr_btn_sens_and_out_per_subgroup, get_slopes, get_angular_difference, calculate_stds, get_mutual_info_btn_sens_and_out_per_subgroup
from targeted_inference import get_angular_difference_range_for_subgroup,single_attribute_based_targeted_imputation, nested_attribute_based_targeted_imputation, single_attribute_based_targeted_ai, nested_attribute_based_targeted_ai
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network._base import ACTIVATIONS
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import tabulate
import pickle
# import utils
import copy
from scipy.stats import kendalltau, spearmanr

import matplotlib as mpl

# Global matplotlib text defaults (family, size, weight), applied in one update
# call rather than key by key.
mpl.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 8,
    'font.weight': 'light',
})

In [4]:
# Registry of every experiment used by the cells below. The two subgroup-sampled
# experiments are keyed by their `shortname`; the Adult experiment is keyed by
# the fixed label 'Adult'.
experiments = {}
for i in range(1):
    # Experiment on the default dataset, with subgroups drawn from the 'ST' column.
    # NOTE(review): 'n' is presumably the sample size per subgroup; confirm in MIAExperiment.
    census_sampling = {
        'subgroup_col_name': 'ST',
        'n': 1000,
    }
    experiment = MIAExperiment(
        sampling_condition_dict=census_sampling,
        random_state=i,
        shortname=f"Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_{i}",
    )
    experiments[experiment.shortname] = experiment

    # Texas100 experiment: subgroups are restricted to the listed PAT_STATUS codes
    # and SEX_CODE is the sensitive column.
    subgroup_vals = [1, 2, 3, 4, 6, 20, 50, 51, 62, 63]
    texas_sampling = {
        'subgroup_col_name': 'PAT_STATUS',
        'subgroup_values': subgroup_vals,
        'n': 5000,
    }
    experiment_texas = MIAExperiment(
        sampling_condition_dict=texas_sampling,
        random_state=i,
        shortname=f"Corr_btn_sens_and_output_for_PAT_STATUS_ranging_from_0_to_-0.5_random_state_{i}",
        name='Texas100',
        sensitive_column='SEX_CODE',
    )
    experiments[experiment_texas.shortname] = experiment_texas

    # Adult experiment: no sampling condition is passed, only the seed.
    experiment_adult = MIAExperiment(name='Adult', random_state=i)
    experiments['Adult'] = experiment_adult

In [5]:
# When True, a freshly trained target model is written to disk so later runs can
# load it instead of retraining.
save_model = True

for experiment_key in experiments:
    experiment = experiments[experiment_key]
    # The load and the save both go through the same per-dataset file.
    model_path = f'<PATH_TO_MODEL>/{experiment.ds.ds.filenameroot}_target_model.pkl'

    print(f"Training classifier for experiment: {experiment}")
    try:
        experiment.clf = model_utils.load_model(model_path)
        print(f"Loaded classifier for experiment from file: {experiment}")
    except Exception as load_error:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the notebook, and report why
        # the cached model could not be used before falling back to training.
        print(f"Could not load classifier from {model_path} ({load_error!r}); training a new one")
        experiment.clf = model_utils.get_model(max_iter=500)
        experiment.clf.fit(experiment.X_train, experiment.y_tr_onehot)

        if save_model:
            model_utils.save_model(experiment.clf, model_path)

Training classifier for experiment: Census19_subgroup_col_name_ST_n_1000_rs0
Loaded classifier for experiment from file: Census19_subgroup_col_name_ST_n_1000_rs0
Training classifier for experiment: Texas100_subgroup_col_name_PAT_STATUS_subgroup_values_[1, 2, 3, 4, 6, 20, 50, 51, 62, 63]_n_5000_rs0
Loaded classifier for experiment from file: Texas100_subgroup_col_name_PAT_STATUS_subgroup_values_[1, 2, 3, 4, 6, 20, 50, 51, 62, 63]_n_5000_rs0
Training classifier for experiment: Adult
Loaded classifier for experiment from file: Adult


In [6]:
# Number of rows in each auxiliary dataset built below.
n_aux_samples = 5000
for experiment_key in experiments:
    experiment = experiments[experiment_key]

    # "Same distribution" auxiliary data: a seeded sample of the training rows.
    aux_indices_same_distrib = experiment.X_train.sample(
        n=n_aux_samples, random_state=experiment.random_state
    ).index
    experiment.X_aux_same_distrib = (
        experiment.X_train.loc[aux_indices_same_distrib].copy().reset_index(drop=True)
    )
    experiment.y_aux_same_distrib = experiment.y_tr[aux_indices_same_distrib]

    if experiment.name != 'Adult':
        # "Different distribution" auxiliary data: a seeded sample of the test rows.
        aux_indices_diff_distrib = experiment.X_test.sample(
            n=n_aux_samples, random_state=experiment.random_state
        ).index
        experiment.X_aux_diff_distrib = (
            experiment.X_test.loc[aux_indices_diff_distrib].copy().reset_index(drop=True)
        )
        experiment.y_aux_diff_distrib = experiment.y_te[aux_indices_diff_distrib]
        continue

    # Adult only: draw the sample from the test split with a fixed mix of
    # married and single rows (pcnt_single of the rows are single).
    X_aux = experiment.X_test.copy()
    y_aux = experiment.y_te
    married_indices = X_aux[X_aux['marital_Married'] == True].index
    single_indices = X_aux[X_aux['marital_Married'] == False].index

    total_count, pcnt_single = n_aux_samples, 0.15
    n_single = int(total_count * pcnt_single)
    n_married = total_count - n_single
    married_sample_indices = X_aux.loc[married_indices].sample(
        n=n_married, replace=False, random_state=42
    ).index
    single_sample_indices = X_aux.loc[single_indices].sample(
        n=n_single, replace=False, random_state=42
    ).index

    # Married rows first, then single rows, matching the order of the label lookup.
    all_sample_indices = married_sample_indices.append(single_sample_indices)
    experiment.X_aux_diff_distrib = X_aux.loc[all_sample_indices].copy().reset_index(drop=True)
    experiment.y_aux_diff_distrib = y_aux[all_sample_indices]

In [7]:
# Run both attribute-inference attacks on every experiment's training records and
# store the predictions (and related arrays) on the experiment object.
for experiment_key in experiments:
    experiment = experiments[experiment_key]
    # The confidence array is computed for the training records, so it is paired
    # with the training labels (y_tr). The previous version passed the test labels
    # (y_te), which do not belong to the rows of X_train and disagreed with every
    # other call in this cell.
    experiment.confidence_array = get_confidence_array(experiment, experiment.X_train, experiment.y_tr, experiment.clf)
    # CSMIA_attack returns the sensitive-attribute predictions plus a per-case
    # collection of indices; only the entry for case 2 is kept separately.
    sens_pred, case_indices = CSMIA_attack(experiment.clf, experiment.X_train, experiment.y_tr, experiment.ds.ds.meta)
    case_2_indices = case_indices[2]
    experiment.case_2_indices = case_2_indices
    # Slices of the confidence array, features and labels restricted to case 2.
    # NOTE(review): mixes positional indexing (arrays) with label indexing (.loc);
    # this assumes X_train has a default 0..n-1 index. Confirm.
    experiment.confidence_array_case_2 = experiment.confidence_array[case_2_indices, :]
    experiment.X_case_2 = experiment.X_train.loc[case_2_indices].copy().reset_index(drop=True)
    experiment.y_case_2 = experiment.y_tr.ravel()[case_2_indices]
    experiment.sens_pred = sens_pred
    experiment.sens_pred_LOMIA = LOMIA_attack(experiment, experiment.clf, experiment.X_train, experiment.y_tr, experiment.ds.ds.meta)

In [8]:
# Per-dataset settings for the targeted attacks, looked up by `experiment.name`:
# the columns used for nested subgroups, the kappa values for the single- and
# nested-attribute runs, and the column used for single-attribute subgroups.
dataset_settings = {
    'Census19': {
        'nested_attrib_cols': ['ST', 'SCHL', 'RAC1P', 'SEX'],
        'single_kappas': [1, 0.75, 0.5, 0.375, 0.25, 0.1, 0.05],
        'nested_kappas': [0.5, 0.375, 0.25, 0.1],
        'subgroup_col_name': 'ST',
    },
    'Texas100': {
        'nested_attrib_cols': ['PAT_STATUS', 'RACE', 'ADMITTING_DIAGNOSIS',  'TYPE_OF_ADMISSION', 'SOURCE_OF_ADMISSION'],
        'single_kappas': [1, 0.75, 0.5, 0.375, 0.25, 0.1],
        'nested_kappas': [0.5, 0.25, 0.1, 0.05, 0.01],
        'subgroup_col_name': 'PAT_STATUS',
    },
    'Adult': {
        'nested_attrib_cols': ['occupation', 'work', 'race', 'sex'],
        'single_kappas': [1, 0.75, 0.5, 0.25, 0.1],
        'nested_kappas': [0.5, 0.375, 0.25, 0.1],
        'subgroup_col_name': 'occupation',
    },
}

for experiment_key in experiments:
    experiment = experiments[experiment_key]
    # An unknown dataset name raises KeyError here, before anything is assigned.
    settings = dataset_settings[experiment.name]
    # Copy the lists so each experiment owns its own list objects.
    experiment.nested_attrib_cols = list(settings['nested_attrib_cols'])
    experiment.single_kappas = list(settings['single_kappas'])
    experiment.nested_kappas = list(settings['nested_kappas'])
    experiment.subgroup_col_name = settings['subgroup_col_name']

# Single Attribute-based Targeted Attacks

In [9]:
# Single-attribute targeted attack scored with the CSMIA predictions
# (experiment.sens_pred); one table is printed per experiment.
print('Performance of Single Attribute Based Targeted AI: CSMIA')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    csmia_single_table = single_attribute_based_targeted_ai(
        experiment,
        experiment.sens_pred,
        subgroup_col_name=experiment.subgroup_col_name,
        kappas=experiment.single_kappas,
    )
    print(csmia_single_table)

Performance of Single Attribute Based Targeted AI: CSMIA

Census19

       attack_accuracy
0.050            71.57
0.100            69.59
0.250            67.27
0.375            65.87
0.500            64.55
0.750            62.14
1.000            60.24

Texas100

       attack_accuracy
0.100            69.76
0.250            71.19
0.375            67.04
0.500            67.11
0.750            63.68
1.000            62.10

Adult

      attack_accuracy
0.10            81.61
0.25            74.83
0.50            72.40
0.75            69.16
1.00            69.98


In [10]:
# Single-attribute targeted attack scored with the LOMIA predictions
# (experiment.sens_pred_LOMIA); one table is printed per experiment.
print('Performance of Single Attribute Based Targeted AI: LOMIA')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    lomia_single_table = single_attribute_based_targeted_ai(
        experiment,
        experiment.sens_pred_LOMIA,
        subgroup_col_name=experiment.subgroup_col_name,
        kappas=experiment.single_kappas,
    )
    print(lomia_single_table)

Performance of Single Attribute Based Targeted AI: LOMIA

Census19

       attack_accuracy
0.050            73.84
0.100            72.96
0.250            70.39
0.375            69.04
0.500            67.86
0.750            65.95
1.000            62.70

Texas100

       attack_accuracy
0.100            70.10
0.250            71.31
0.375            66.81
0.500            66.89
0.750            62.93
1.000            61.10

Adult

      attack_accuracy
0.10            81.68
0.25            74.86
0.50            73.36
0.75            69.79
1.00            70.61


In [11]:
# Single-attribute targeted imputation using the auxiliary data sampled from the
# training split (X_aux_same_distrib / y_aux_same_distrib).
print('Performance of Single Attribute Based Targeted Imputation: ImpI')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    impi_single_table = single_attribute_based_targeted_imputation(
        experiment,
        experiment.X_train,
        experiment.y_tr,
        experiment.X_aux_same_distrib,
        experiment.y_aux_same_distrib,
        subgroup_col_name=experiment.subgroup_col_name,
        kappas=experiment.single_kappas,
    )
    print(impi_single_table)

Performance of Single Attribute Based Targeted Imputation: ImpI

Census19

       imputation_attack_accuracy
0.050                       71.42
0.100                       68.80
0.250                       66.24
0.375                       65.83
0.500                       66.23
0.750                       65.16
1.000                       64.50

Texas100

       imputation_attack_accuracy
0.100                       68.08
0.250                       66.53
0.375                       65.75
0.500                       66.21
0.750                       62.41
1.000                       61.44

Adult

      imputation_attack_accuracy
0.10                       75.14
0.25                       78.29
0.50                       77.74
0.75                       76.81
1.00                       74.59


In [12]:
# Single-attribute targeted imputation using the auxiliary data sampled from the
# test split (X_aux_diff_distrib / y_aux_diff_distrib).
print('Performance of Single Attribute Based Targeted Imputation: ImpP')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    impp_single_table = single_attribute_based_targeted_imputation(
        experiment,
        experiment.X_train,
        experiment.y_tr,
        experiment.X_aux_diff_distrib,
        experiment.y_aux_diff_distrib,
        subgroup_col_name=experiment.subgroup_col_name,
        kappas=experiment.single_kappas,
    )
    print(impp_single_table)

Performance of Single Attribute Based Targeted Imputation: ImpP

Census19

       imputation_attack_accuracy
0.050                       58.97
0.100                       60.64
0.250                       60.90
0.375                       60.59
0.500                       60.94
0.750                       60.88
1.000                       61.01

Texas100

       imputation_attack_accuracy
0.100                       49.65
0.250                       49.40
0.375                       48.10
0.500                       49.48
0.750                       52.67
1.000                       52.50

Adult

      imputation_attack_accuracy
0.10                       65.06
0.25                       64.94
0.50                       65.93
0.75                       64.68
1.00                       62.88


# Nested Attribute-based Targeted Attacks

In [None]:
# Nested-attribute targeted attack scored with the CSMIA predictions
# (experiment.sens_pred), using each experiment's nested columns and kappas.
print('Performance of Nested Attribute Based Targeted AI: CSMIA')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    csmia_nested_table = nested_attribute_based_targeted_ai(
        experiment,
        experiment.sens_pred,
        subgroup_cols=experiment.nested_attrib_cols,
        kappas=experiment.nested_kappas,
    )
    print(csmia_nested_table)

Performance of Nested Attribute Based Targeted AI: CSMIA

Census19

       Depth  attack_accuracy
1.000      0            60.24
0.500      1            64.89
0.375      2            64.39
0.250      3            64.28
0.100      4            68.31

Texas100

      Depth  attack_accuracy
1.00      0            62.10
0.50      1            67.11
0.25      2            66.14
0.10      3            66.04
0.05      4            66.96
0.01      5            62.79

Adult

       Depth  attack_accuracy
1.000      0            69.98
0.500      1            71.17
0.375      2            73.73
0.250      3            77.52
0.100      4            86.74


In [14]:
# Nested-attribute targeted attack scored with the LOMIA predictions
# (experiment.sens_pred_LOMIA), using each experiment's nested columns and kappas.
print('Performance of Nested Attribute Based Targeted AI: LOMIA')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    lomia_nested_table = nested_attribute_based_targeted_ai(
        experiment,
        experiment.sens_pred_LOMIA,
        subgroup_cols=experiment.nested_attrib_cols,
        kappas=experiment.nested_kappas,
    )
    print(lomia_nested_table)

Performance of Nested Attribute Based Targeted AI: LOMIA

Census19

       Depth  attack_accuracy
1.000      0            62.70
0.500      1            68.06
0.375      2            67.98
0.250      3            67.95
0.100      4            71.24

Texas100

      Depth  attack_accuracy
1.00      0            61.10
0.50      1            66.89
0.25      2            65.91
0.10      3            65.71
0.05      4            66.30
0.01      5            61.24

Adult

       Depth  attack_accuracy
1.000      0            70.61
0.500      1            72.19
0.375      2            73.90
0.250      3            76.88
0.100      4            86.77


In [15]:
# Nested-attribute targeted imputation using the auxiliary data sampled from the
# training split (X_aux_same_distrib / y_aux_same_distrib).
print('Performance of Nested Attribute Based Targeted Imputation: ImpI')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    impi_nested_table = nested_attribute_based_targeted_imputation(
        experiment,
        experiment.X_train,
        experiment.y_tr,
        experiment.X_aux_same_distrib,
        experiment.y_aux_same_distrib,
        subgroup_cols=experiment.nested_attrib_cols,
        kappas=experiment.nested_kappas,
    )
    print(impi_nested_table)

Performance of Nested Attribute Based Targeted Imputation: ImpI

Census19

       Depth  attack_accuracy
1.000      0            64.50
0.500      1            66.08
0.375      2            66.17
0.250      3            66.17
0.100      4            69.00

Texas100

      Depth  attack_accuracy
1.00      0            61.44
0.50      1            64.51
0.25      2            64.51
0.10      3            72.49
0.05      4            79.03
0.01      5            83.18

Adult

       Depth  attack_accuracy
1.000      0            74.59
0.500      1            77.74
0.375      2            77.74
0.250      3            77.93
0.100      4            86.21


In [16]:
# Nested-attribute targeted imputation using the auxiliary data sampled from the
# test split (X_aux_diff_distrib / y_aux_diff_distrib).
print('Performance of Nested Attribute Based Targeted Imputation: ImpP')
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment.name}\n')
    impp_nested_table = nested_attribute_based_targeted_imputation(
        experiment,
        experiment.X_train,
        experiment.y_tr,
        experiment.X_aux_diff_distrib,
        experiment.y_aux_diff_distrib,
        subgroup_cols=experiment.nested_attrib_cols,
        kappas=experiment.nested_kappas,
    )
    print(impp_nested_table)

Performance of Nested Attribute Based Targeted Imputation: ImpP

Census19

       Depth  attack_accuracy
1.000      0            61.01
0.500      1            61.06
0.375      2            61.19
0.250      3            61.19
0.100      4            64.36

Texas100

      Depth  attack_accuracy
1.00      0            52.50
0.50      1            50.31
0.25      2            50.31
0.10      3            54.90
0.05      4            54.90
0.01      5            56.19

Adult

       Depth  attack_accuracy
1.000      0            62.88
0.500      1            65.93
0.375      2            65.73
0.250      3            66.50
0.100      4            70.30
