# Load Library

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import warnings

# Install a global 'ignore' filter covering all warning categories. This keeps
# cell output compact, but it also hides deprecation and numerical warnings
# raised by the libraries used in the rest of the notebook.
warnings.filterwarnings('ignore')

In [4]:
from tqdm import tqdm
import os
import data_utils
import model_utils
from attack_utils import get_CSMIA_case_by_case_results, CSMIA_attack, LOMIA_attack, get_LOMIA_results
from data_utils import oneHotCatVars, filter_random_data_by_conf_score
from vulnerability_score_utils import get_vulnerability_score, draw_hist_plot
from experiment_utils import MIAExperiment
from disparity_inference_utils import get_confidence_array, draw_confidence_array_scatter, get_indices_by_group_condition, get_corr_btn_sens_and_out_per_subgroup, get_slopes, get_angular_difference, calculate_stds, get_mutual_info_btn_sens_and_out_per_subgroup
from targeted_inference import get_angular_difference_range_for_subgroup,single_attribute_based_targeted_imputation, nested_attribute_based_targeted_imputation, single_attribute_based_targeted_ai, nested_attribute_based_targeted_ai
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network._base import ACTIVATIONS
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import tabulate
import pickle
# import utils
import copy
from scipy.stats import kendalltau, spearmanr

import matplotlib as mpl

# Global matplotlib text defaults (family, size, weight) applied to every figure.
mpl.rcParams.update({
    'font.family': 'DejaVu Sans',
    'font.size': 8,
    'font.weight': 'light',
})

In [13]:
# Build one MIAExperiment per random seed and register it under its shortname.
# Later cells iterate over this dict.
experiments = {}
for i in range(1):
    experiment = MIAExperiment(
        sampling_condition_dict={'subgroup_col_name': 'ST', 'n': 1000},
        random_state=i,
        shortname=f"Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_{i}",
    )
    experiments[experiment.shortname] = experiment

  2%|▏         | 1/51 [00:01<01:05,  1.30s/it]

before scaling: 248 125 1.984
after scaling: 125 125 1.0


 14%|█▎        | 7/51 [00:09<00:57,  1.31s/it]

before scaling: 233 219 1.0639269406392695
after scaling: 219 219 1.0


 24%|██▎       | 12/51 [00:15<00:51,  1.31s/it]

before scaling: 221 164 1.3475609756097562
after scaling: 164 164 1.0


 37%|███▋      | 19/51 [00:24<00:42,  1.33s/it]

before scaling: 204 163 1.2515337423312884
after scaling: 163 163 1.0


 51%|█████     | 26/51 [00:34<00:32,  1.31s/it]

before scaling: 187 126 1.4841269841269842
after scaling: 126 126 1.0


 67%|██████▋   | 34/51 [00:44<00:22,  1.31s/it]

before scaling: 167 120 1.3916666666666666
after scaling: 120 120 1.0


 80%|████████  | 41/51 [00:53<00:12,  1.30s/it]

before scaling: 150 76 1.9736842105263157
after scaling: 76 76 1.0


 88%|████████▊ | 45/51 [00:58<00:07,  1.30s/it]

before scaling: 361 325 1.1107692307692307
after scaling: 325 325 1.0
before scaling: 140 101 1.386138613861386
after scaling: 101 101 1.0


 98%|█████████▊| 50/51 [01:05<00:01,  1.29s/it]

before scaling: 373 326 1.1441717791411044
after scaling: 326 326 1.0
before scaling: 128 87 1.471264367816092
after scaling: 87 87 1.0


100%|██████████| 51/51 [01:06<00:00,  1.30s/it]


[500, 500, 500, 252, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 469, 500, 500, 500, 500, 500, 500, 500, 500, 500, 371, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 399, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 336, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 359, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 253, 500, 500, 500, 500, 500, 500, 450, 360, 500, 500, 500, 500, 500, 500, 500, 500, 436, 339]


In [14]:
# Obtain a target classifier for every experiment: reuse a pickled model when
# one can be loaded, otherwise train a fresh one (and optionally persist it).
save_model = False

for experiment_key, experiment in experiments.items():
    # One path shared by load and save. The load path previously carried a
    # stray trailing underscore ("..._target_model_.pkl") that did not match
    # the save path, so a model written by this cell was never picked up again.
    model_path = f'<PATH_TO_MODEL>/{experiment.ds.ds.filenameroot}_target_model.pkl'

    print(f"Training classifier for experiment: {experiment}")
    try:
        experiment.clf = model_utils.load_model(model_path)
        print(f"Loaded classifier for experiment from file: {experiment}")
    except Exception as load_error:
        # Loading stays best-effort (any failure falls back to training), but
        # the reason is reported instead of being silently swallowed, and
        # KeyboardInterrupt / SystemExit are no longer caught.
        print(f"Could not load {model_path} ({load_error!r}); training a new model")
        # clf = model_utils.get_model(max_iter=500, hidden_layer_sizes=(256, 256))
        experiment.clf = model_utils.get_model(max_iter=500)
        experiment.clf.fit(experiment.X_train, experiment.y_tr_onehot)

        if save_model:
            model_utils.save_model(experiment.clf, model_path)

Training classifier for experiment: Census19_subgroup_col_name_ST_n_1000_rs0


In [15]:
# For each experiment: compute the confidence array, run the CSMIA attack and
# keep its "case 2" subset, then run the LOMIA attack. All results are stored
# as attributes on the experiment object for the cells that follow.
for experiment_key, experiment in experiments.items():
    # NOTE(review): X_train is paired with y_te (not y_tr) in this call —
    # confirm this is what get_confidence_array expects.
    experiment.confidence_array = get_confidence_array(
        experiment, experiment.X_train, experiment.y_te, experiment.clf
    )

    sens_pred, case_indices = CSMIA_attack(
        experiment.clf, experiment.X_train, experiment.y_tr, experiment.ds.ds.meta
    )

    # Slice the training data and confidences down to the records CSMIA put in case 2.
    case_2_indices = case_indices[2]
    experiment.case_2_indices = case_2_indices
    experiment.confidence_array_case_2 = experiment.confidence_array[case_2_indices, :]
    experiment.X_case_2 = (
        experiment.X_train.loc[case_2_indices].copy().reset_index(drop=True)
    )
    experiment.y_case_2 = experiment.y_tr.ravel()[case_2_indices]

    experiment.sens_pred = sens_pred
    experiment.sens_pred_LOMIA = LOMIA_attack(
        experiment,
        experiment.clf,
        experiment.X_train,
        experiment.y_tr,
        experiment.ds.ds.meta,
    )

In [16]:
# Column list matching the `subgroup_cols` used by the nested-attribute cells below.
nested_attrib_cols = ['ST', 'SCHL', 'RAC1P', 'SEX']
# `kappas` passed to the single-attribute targeted attack / imputation helpers.
# NOTE(review): presumably target fractions of the records to attack (the result
# tables are indexed by values close to these) — confirm in targeted_inference.
single_kappas = [1, 0.75, 0.5, 0.375, 0.25, 0.1, 0.05]
# `kappas` passed to the nested-attribute targeted attack / imputation helpers.
nested_kappas = [0.5, 0.375, 0.25, 0.1, 0.05, 0.01]

In [17]:
# Targeted attribute inference using the CSMIA predictions, with subgroups
# taken from the 'ST' column and the single-attribute kappa list.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        single_attribute_based_targeted_ai(
            experiment,
            experiment.sens_pred,
            subgroup_col_name='ST',
            kappas=single_kappas,
        )
    )


Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_0

{'ST': [49, 50, 47]}
          attack_accuracy
0.056033            71.57
0.096418            69.59
0.250121            67.27
0.367438            65.87
0.505472            64.55
0.745174            62.14
1.000000            60.24


In [18]:
# Targeted attribute inference using the CSMIA predictions, with subgroups
# formed from several columns. The column list comes from `nested_attrib_cols`
# (defined next to the kappa lists) instead of a duplicated literal, so all
# nested cells stay in sync when the list is edited.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        nested_attribute_based_targeted_ai(
            experiment,
            experiment.sens_pred,
            subgroup_cols=nested_attrib_cols,
            kappas=nested_kappas,
        )
    )


Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_0



  return [np.polyfit(confidence_array[indices_by_y_values[y_value], 1], confidence_array[indices_by_y_values[y_value], 0], 1)[0] for y_value in y_values]
  return [np.polyfit(confidence_array[indices_by_y_values[y_value], 1], confidence_array[indices_by_y_values[y_value], 0], 1)[0] for y_value in y_values]
  return [np.polyfit(confidence_array[indices_by_y_values[y_value], 1], confidence_array[indices_by_y_values[y_value], 0], 1)[0] for y_value in y_values]
  return [np.polyfit(confidence_array[indices_by_y_values[y_value], 1], confidence_array[indices_by_y_values[y_value], 0], 1)[0] for y_value in y_values]


          i  attack_accuracy
1.000000  0            60.24
0.485280  1            64.89
0.365257  2            64.39
0.360391  3            64.28
0.219409  4            68.31


In [26]:
# Draw two 5000-row auxiliary datasets per experiment, seeded with the
# experiment's own random_state:
#   * "same_distrib" is sampled from the training split,
#   * "diff_distrib" is sampled from the test split.
for experiment_key, experiment in experiments.items():
    aux_indices_same_distrib = experiment.X_train.sample(
        n=5000, random_state=experiment.random_state
    ).index
    # NOTE(review): y_tr is indexed with X_train's index labels; this assumes
    # the labels are also valid positions in y_tr — confirm.
    experiment.X_aux_same_distrib, experiment.y_aux_same_distrib = (
        experiment.X_train.loc[aux_indices_same_distrib].copy().reset_index(drop=True),
        experiment.y_tr[aux_indices_same_distrib],
    )

    aux_indices_diff_distrib = experiment.X_test.sample(
        n=5000, random_state=experiment.random_state
    ).index
    experiment.X_aux_diff_distrib, experiment.y_aux_diff_distrib = (
        experiment.X_test.loc[aux_indices_diff_distrib].copy().reset_index(drop=True),
        experiment.y_te[aux_indices_diff_distrib],
    )

In [23]:
# Single-attribute targeted imputation against the training data, using the
# auxiliary set that was sampled from the training split.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        single_attribute_based_targeted_imputation(
            experiment,
            experiment.X_train,
            experiment.y_tr,
            experiment.X_aux_same_distrib,
            experiment.y_aux_same_distrib,
            subgroup_col_name='ST',
            kappas=single_kappas,
        )
    )

          imputation_attack_accuracy
0.056033                       71.42
0.096418                       68.80
0.246264                       66.24
0.382259                       65.83
0.498425                       66.23
0.740732                       65.16
1.000000                       64.50


In [24]:
# Single-attribute targeted imputation against the training data, using the
# auxiliary set that was sampled from the test split.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        single_attribute_based_targeted_imputation(
            experiment,
            experiment.X_train,
            experiment.y_tr,
            experiment.X_aux_diff_distrib,
            experiment.y_aux_diff_distrib,
            subgroup_col_name='ST',
            kappas=single_kappas,
        )
    )

          imputation_attack_accuracy
0.055407                       58.97
0.105989                       60.64
0.241984                       60.90
0.376888                       60.59
0.495194                       60.94
0.757693                       60.88
1.000000                       61.01


In [28]:
# Nested-attribute targeted imputation against the training data, using the
# auxiliary set that was sampled from the training split. The column list
# comes from `nested_attrib_cols` (defined next to the kappa lists) instead of
# a duplicated literal.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        nested_attribute_based_targeted_imputation(
            experiment,
            experiment.X_train,
            experiment.y_tr,
            experiment.X_aux_same_distrib,
            experiment.y_aux_same_distrib,
            subgroup_cols=nested_attrib_cols,
            kappas=nested_kappas,
        )
    )


Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_0

          i  attack_accuracy
1.000000  0            64.50
0.498425  1            66.23
0.318694  2            66.59
0.039456  3            65.71
0.383592  4            66.17


In [29]:
# Nested-attribute targeted imputation against the training data, using the
# auxiliary set that was sampled from the test split. The column list comes
# from `nested_attrib_cols` (defined next to the kappa lists) instead of a
# duplicated literal.
for experiment_key, experiment in experiments.items():
    print(f'\n{experiment_key}\n')
    print(
        nested_attribute_based_targeted_imputation(
            experiment,
            experiment.X_train,
            experiment.y_tr,
            experiment.X_aux_diff_distrib,
            experiment.y_aux_diff_distrib,
            subgroup_cols=nested_attrib_cols,
            kappas=nested_kappas,
        )
    )


Corr_btn_sens_and_output_for_ST_ranging_from_0_to_-0.5_random_state_0

          i  attack_accuracy
1.000000  0            61.01
0.495194  1            60.94
0.367135  2            61.48
0.044443  3            60.47
0.381411  4            61.19


In [10]:
# Build an auxiliary frame whose sensitive-attribute ratio is deliberately
# shifted: `total_count` training rows, of which a fraction `pcnt_single` has
# the sensitive indicator set to True.
# NOTE: `experiment` is whatever the most recently executed loop cell left
# bound; this cell does not select an experiment itself.
aux_df = experiment.ds.ds.df.copy()

sens_col_name = f'{experiment.sensitive_column}_{experiment.sensitive_positive}'

# Combine both conditions into one mask. The original chained form
# (df[mask_a][mask_b]) applied a full-length mask to an already filtered frame,
# which makes pandas reindex the mask and emit a warning.
is_train = aux_df["is_train"] == 1
# `== False` / `== True` are kept on purpose (rather than `~`) so the
# comparison behaves the same for bool and 0/1 integer columns.
married_indices = aux_df.index[is_train & (aux_df[sens_col_name] == False)]
single_indices = aux_df.index[is_train & (aux_df[sens_col_name] == True)]

pcnt_single = 0.15
total_count = 5000
single_count = int(total_count * pcnt_single)

# Seed both draws with the experiment's random_state so the drifted auxiliary
# set is reproducible across kernel restarts (the draws were unseeded before).
married_sample_indices = aux_df.loc[married_indices].sample(
    n=total_count - single_count, replace=False, random_state=experiment.random_state
).index
single_sample_indices = aux_df.loc[single_indices].sample(
    n=single_count, replace=False, random_state=experiment.random_state
).index

all_sample_indices = married_sample_indices.append(single_sample_indices)
aux_df_distrib_drift = aux_df.loc[all_sample_indices].copy().reset_index(drop=True)