In [1]:
import warnings
import time
warnings.filterwarnings('ignore')

import copy
import itertools

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler

from load_dataset import *
from savage import *
from pipelines import *

%reload_ext autoreload
%autoreload 2

## Import and Split data

In [2]:
# Dataset key handed to `load` (which comes from one of the star imports above).
dataset = 'adult'
# Alternative call kept for reference; it passes sample=True to `load`:
# X_train, X_test, y_train, y_test = load(dataset, sample=True)
X_train, X_test, y_train, y_test = load(dataset)

In [3]:
# for Adult: carve the validation set out of the loaded test split (half each, fixed seed).
# NOTE: this rebinds X_test / y_test, so re-running the cell without reloading the data
# splits the already-halved test set again.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [4]:
# for Employee, Insurance, Credit Card, Concrete
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [5]:
# use sampling for efficiency
def _subsample(split):
    """Return a 5% random sample of `split`, drawn with random_state=42."""
    return split.sample(frac=0.05, random_state=42)


# Features and labels of a split are sampled with the same seed; presumably this
# keeps them row-aligned as long as both have the same length -- verify if `load` changes.
X_train, y_train = _subsample(X_train), _subsample(y_train)
X_test, y_test = _subsample(X_test), _subsample(y_test)
X_val, y_val = _subsample(X_val), _subsample(y_val)

# Independent copies of the sampled features, re-indexed from 0.
X_train_orig = copy.deepcopy(X_train).reset_index(drop=True)
X_test_orig = copy.deepcopy(X_test).reset_index(drop=True)
X_val_orig = copy.deepcopy(X_val).reset_index(drop=True)

# Labels get the same fresh index so they line up with the *_orig frames.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [6]:
y_train

0       1
1       0
2       1
3       0
4       0
       ..
1503    1
1504    0
1505    0
1506    1
1507    0
Name: income, Length: 1508, dtype: int64

In [7]:
X_train

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours
217,0,7,11,0,0,1,1,1
24913,0,7,8,2,1,0,1,0
17780,1,7,8,2,1,1,1,1
12484,0,7,8,2,1,1,1,1
8890,0,3,10,2,0,1,0,0
...,...,...,...,...,...,...,...,...
26575,1,7,14,2,1,1,1,0
3707,1,3,8,1,0,1,0,0
963,0,7,9,0,0,1,0,1
28875,0,7,10,2,1,1,1,1


In [8]:
# Name of the column whose 0 and 1 groups are compared by the `eod` metric below.
sens_attr = 'gender'

## Run SAVAGE

**Load predefined pipeline.**

In [9]:
# NOTE: despite the variable name `rf`, this estimator is a logistic regression,
# not a random forest.
rf = LogisticRegression(random_state=42, max_iter=1000)
# 'h2o' selects a predefined pipeline -- presumably defined in pipelines.py; confirm.
loaded_pipeline = make_pipeline_func('h2o', rf)

**Customized ML Pipeline: Impute with Iterative Imputer and Standardize, then train a logistic regression classifier.**

In [10]:
def pipeline(X_train, y_train, X_test):
    """Impute, standardize, and fit a logistic regression; return test-set probabilities.

    The imputer and the scaler are fitted on the training features only and then
    applied to the test features, so no test-set statistics leak into training.

    Parameters
    ----------
    X_train : training features; may contain missing values (NaN).
    y_train : training labels.
    X_test : features to score. Missing values are filled with the imputer
        fitted on ``X_train``; previously a NaN here made ``predict_proba`` fail.

    Returns
    -------
    The result of ``model.predict_proba`` for ``X_test`` (one column per class).
    """
    # A cheaper alternative imputer would be SimpleImputer(strategy='mean').
    imputer = IterativeImputer(random_state=42)
    scaler = StandardScaler()
    model = LogisticRegression(random_state=42, max_iter=1000)

    X_train_prepared = scaler.fit_transform(imputer.fit_transform(X_train))
    model.fit(X_train_prepared, y_train)

    # Run the test features through the same fitted imputer and scaler.
    X_test_prepared = scaler.transform(imputer.transform(X_test))
    return model.predict_proba(X_test_prepared)

**Target Metric for Measuring Model Utility: AUC**

Design all metrics so that a lower value is worse (i.e., harm shows up as a drop in the metric).

In [11]:
def auc(X_test, y_test, y_pred):
    """ROC AUC of the second column of ``y_pred`` scored against ``y_test``.

    ``X_test`` is accepted so the signature matches the other metrics, but it is
    not used here.
    """
    second_column_scores = y_pred[:, 1]
    return roc_auc_score(y_test, second_column_scores)

**Target Metric for Measuring Model Unfairness: EOD**

In [12]:
# Equality of opportunity difference (EOD)
def eod(X_test, y_test, y_pred, attr=None):
    """Negated gap in mean predicted score between the two groups, on true positives.

    Among rows with ``y_test == 1``, the mean of ``y_pred[:, 1]`` is computed for
    rows where the sensitive column equals 1 and for rows where it equals 0; the
    negated absolute difference of the two means is returned.

    Rows are selected by position with boolean masks, so the result does not
    depend on the pandas index of ``X_test`` / ``y_test``. The previous
    implementation used index labels as positions into ``y_pred``, which is only
    correct for a default 0..n-1 index.

    Parameters
    ----------
    X_test : DataFrame holding the sensitive column.
    y_test : true labels, in the same row order as ``X_test``.
    y_pred : 2-D array of scores in the same row order; column 1 is used.
    attr : name of the sensitive column. Defaults to the module-level ``sens_attr``.

    Returns
    -------
    A non-positive float (0 means no gap). NaN if either group has no true positives.
    """
    if attr is None:
        attr = sens_attr

    group = np.asarray(X_test[attr])
    is_positive = np.asarray(y_test) == 1
    scores = np.asarray(y_pred)[:, 1]

    group_one_scores = scores[is_positive & (group == 1)]
    group_zero_scores = scores[is_positive & (group == 0)]

    # negative making sure lower the worse
    return -abs(np.mean(group_one_scores) - np.mean(group_zero_scores))

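**Task: what is the worst-case harm caused by systematic missing data affecting at most a `budget_pct` fraction of the training rows (set below; the logged runs use 10% and 30%)?**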

In [13]:
# Score the clean training data once and reuse the predictions for both metrics;
# the pipeline is seeded, so running it twice with identical arguments only
# repeated the imputation and model fit.
clean_pred = pipeline(X_train_orig, y_train, X_test_orig)

# clean AUC
clean_auc = auc(X_test_orig, y_test, clean_pred)
print(f'Clean AUC: {clean_auc}')

# clean EOD
clean_eod = eod(X_test_orig, y_test, clean_pred)
print(f'Clean EOD: {clean_eod}')

Clean AUC: 0.8537699615928845
Clean EOD: -0.2377067126602436


In [14]:
# maximum number of missing data: the budget is the `budget_pct` fraction of the
# training rows, truncated to an integer by int().
budget_pct = 0.3
budget = int(X_train_orig.shape[0] * budget_pct)

In [15]:
# take top-3 patterns
top_k = 3

In [16]:
top_results = run_beam_search(X_train_orig, X_test_orig, y_train, y_test, pipeline, auc, budget, clean_auc, top_k=top_k)

Start Beam Search...


Beam search rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Round 1 candidates:   0%|          | 0/1 [00:00<?, ?it/s]

Expanding candidate ['Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: age, id: 0, cols: ('age', 'Y')
Trial 0: Found pattern for 382 rows, injecting 382 errors.
Trial 1: Found pattern for 314 rows, injecting 314 errors.
Trial 3: Found pattern for 474 rows, injecting 452 errors.
Trial 5: Found pattern for 222 rows, injecting 222 errors.
Trial 7: Found pattern for 812 rows, injecting 452 errors.
Trial 12: Found pattern for 160 rows, injecting 160 errors.
Injected 160 errors (10.61%). Best value: 0.78448
target_col: workclass, id: 1, cols: ('workclass', 'Y')
Trial 0: Found pattern for 342 rows, injecting 342 errors.
Trial 1: Found pattern for 32 rows, injecting 32 errors.
Trial 2: Found pattern for 257 rows, injecting 257 errors.
Trial 3: Found pattern for 33 rows, injecting 33 errors.
Trial 4: Found pattern for 85 rows, injecting 85 errors.
Trial 5: Found pattern for 146 rows, injecting 146 errors.
Trial 7: Found pattern for 1355 rows, injecting 452 errors.
Trial 8: Found pattern for 32 rows, injecting 32 errors.
Trial 9: Found pattern for 24 ro

Round 2 candidates:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding candidate ['gender', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: gender, id: 6, cols: ('gender', 'age', 'Y')
Trial 0: Found pattern for 160 rows, injecting 160 errors.
Trial 1: Found pattern for 54 rows, injecting 54 errors.
Trial 2: Found pattern for 328 rows, injecting 328 errors.
Trial 3: Found pattern for 36 rows, injecting 36 errors.
Trial 4: Found pattern for 125 rows, injecting 125 errors.
Trial 5: Found pattern for 143 rows, injecting 143 errors.
Trial 6: Found pattern for 688 rows, injecting 452 errors.
Trial 7: Found pattern for 142 rows, injecting 142 errors.
Trial 14: Found pattern for 382 rows, injecting 382 errors.
Injected 328 errors (21.75%). Best value: 0.82082
target_col: gender, id: 6, cols: ('gender', 'workclass', 'Y')
Trial 0: Found pattern for 941 rows, injecting 452 errors.
Trial 1: Found pattern for 1295 rows, injecting 452 errors.
Trial 2: Found pattern for 39 rows, injecting 39 errors.
Trial 3: Found pattern for 46 rows, injecting 46 errors.
Trial 4: Found pattern for 6 rows, injecting 6 errors.
Trial 5: Found p

Expanding candidate ['marital', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: marital, id: 3, cols: ('marital', 'age', 'Y')
Trial 0: Found pattern for 152 rows, injecting 152 errors.
Trial 1: Found pattern for 24 rows, injecting 24 errors.
Trial 2: Found pattern for 326 rows, injecting 326 errors.
Trial 3: Found pattern for 16 rows, injecting 16 errors.
Trial 4: Found pattern for 135 rows, injecting 135 errors.
Trial 5: Found pattern for 151 rows, injecting 151 errors.
Trial 7: Found pattern for 136 rows, injecting 136 errors.
Trial 8: Found pattern for 25 rows, injecting 25 errors.
Trial 10: Found pattern for 742 rows, injecting 452 errors.
Trial 12: Found pattern for 289 rows, injecting 289 errors.
Trial 14: Found pattern for 474 rows, injecting 452 errors.
Trial 15: Found pattern for 154 rows, injecting 154 errors.
Trial 17: Found pattern for 1508 rows, injecting 452 errors.
Injected 289 errors (19.16%). Best value: 0.84993
target_col: marital, id: 3, cols: ('marital', 'workclass', 'Y')
Trial 0: Found pattern for 573 rows, injecting 452 errors.
Tr

Expanding candidate ['race', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: race, id: 5, cols: ('race', 'age', 'Y')
Trial 0: Found pattern for 160 rows, injecting 160 errors.
Trial 1: Found pattern for 28 rows, injecting 28 errors.
Trial 2: Found pattern for 354 rows, injecting 354 errors.
Trial 3: Found pattern for 19 rows, injecting 19 errors.
Trial 4: Found pattern for 48 rows, injecting 48 errors.
Trial 5: Found pattern for 57 rows, injecting 57 errors.
Trial 6: Found pattern for 955 rows, injecting 452 errors.
Trial 7: Found pattern for 151 rows, injecting 151 errors.
Trial 14: Found pattern for 382 rows, injecting 382 errors.
Injected 28 errors (1.86%). Best value: 0.83347
target_col: race, id: 5, cols: ('race', 'workclass', 'Y')
Trial 0: Found pattern for 941 rows, injecting 452 errors.
Trial 1: Found pattern for 1295 rows, injecting 452 errors.
Trial 2: Found pattern for 63 rows, injecting 63 errors.
Trial 3: Found pattern for 49 rows, injecting 49 errors.
Trial 4: Found pattern for 3 rows, injecting 3 errors.
Trial 5: Found pattern for 32 

In [17]:
r = top_results[0]
print(f'Missing data in column {r[0][0]} depending on columns {r[0]} could lead to an AUC drop of {clean_auc - r[1][0]}')

Missing data in column gender depending on columns ('gender', 'Y') could lead to an AUC drop of 0.25959167172023445


In [18]:
X_train_dirty = r[1][1]
X_train_dirty

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours
0,0,7,11,0,0,1,1.0,1
1,0,7,8,2,1,0,1.0,0
2,1,7,8,2,1,1,1.0,1
3,0,7,8,2,1,1,1.0,1
4,0,3,10,2,0,1,,0
...,...,...,...,...,...,...,...,...
1503,1,7,14,2,1,1,1.0,0
1504,1,3,8,1,0,1,,0
1505,0,7,9,0,0,1,,1
1506,0,7,10,2,1,1,1.0,1


In [19]:
y_train_dirty = r[1][2]
y_train_dirty

0       1
1       0
2       1
3       0
4       0
       ..
1503    1
1504    0
1505    0
1506    1
1507    0
Name: income, Length: 1508, dtype: int64

In [64]:
# Write every split to its own CSV file (row index omitted), in the original order.
csv_outputs = [
    (X_train_dirty, "X_train_dirty.csv"),
    (X_train_orig, "X_train_clean.csv"),
    (X_val_orig, "X_val.csv"),
    (X_test_orig, "X_test.csv"),
    (y_train_dirty, "y_train.csv"),
    (y_val, "y_val.csv"),
    (y_test, "y_test.csv"),
]
for table, filename in csv_outputs:
    table.to_csv(filename, index=False)

In [65]:
X_full = pd.concat([X_train_orig, X_val_orig, X_test_orig], axis=0)
X_full.to_csv("X_full.csv", index=False)

In [66]:
y_full = pd.concat([y_train, y_val, y_test], axis=0)
y_full.to_csv("y_full.csv", index=False)

In [67]:
y_val.shape

(376,)

In [68]:
y_test.shape

(376,)

In [69]:
print(X_train_dirty.isna().any(axis=1).sum())

171


In [70]:
clean_auc

0.8537699615928845

In [71]:
# Repeat the custom pipeline's steps by hand on the dirty training data and
# report the resulting test AUC.
imputer = IterativeImputer(random_state=42)
ss = StandardScaler()
model = LogisticRegression(random_state=42, max_iter=1000)

X_train_imputed = imputer.fit_transform(X_train_dirty)
X_train_scaled = ss.fit_transform(X_train_imputed)
model.fit(X_train_scaled, y_train_dirty)

X_test_scaled = ss.transform(X_test_orig)
y_pred = model.predict_proba(X_test_scaled)
dirty_auc = roc_auc_score(y_test, y_pred[:, 1])
print(dirty_auc)

0.7353547604608854


In [72]:
if y_train.equals(y_train_dirty):
    print("The Series are identical.")
else:
    print("The Series are NOT identical. Proceeding to find differences...")

The Series are identical.


In [13]:
# Beam search for the missingness pattern that lowers the (negated) EOD metric the most.
# Rebinds `top_results`, replacing the results of the earlier AUC search.
# NOTE(review): the AUC search earlier passes the clean metric value (clean_auc) right
# after `budget`; this call passes no clean_eod -- confirm that argument is optional.
top_results = run_beam_search(X_train_orig, X_test_orig, y_train, y_test, pipeline, eod, budget, top_k=top_k)

Start Beam Search...


Beam search rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Round 1 candidates:   0%|          | 0/1 [00:00<?, ?it/s]

Expanding candidate ['Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: age, id: 0, cols: ('age', 'Y')
Injected 150 errors (9.95%). Best value: -0.19563
target_col: workclass, id: 1, cols: ('workclass', 'Y')
Injected 146 errors (9.68%). Best value: -0.20542
target_col: education, id: 2, cols: ('education', 'Y')
Injected 99 errors (6.56%). Best value: -0.21720
target_col: marital, id: 3, cols: ('marital', 'Y')
Injected 24 errors (1.59%). Best value: -0.19975
target_col: relationship, id: 4, cols: ('relationship', 'Y')
Injected 150 errors (9.95%). Best value: -0.20096
target_col: race, id: 5, cols: ('race', 'Y')
Injected 150 errors (9.95%). Best value: -0.19724
target_col: gender, id: 6, cols: ('gender', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: hours, id: 7, cols: ('hours', 'Y')
Injected 150 errors (9.95%). Best value: -0.22095
----------- ROUND BEST -----------
[(['gender', 'Y'], -0.4281736355969331), (['hours', 'Y'], -0.2209541035039611), (['education', 'Y'], -0.21719703263877826)]
----------------------------------


Round 2 candidates:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding candidate ['gender', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: gender, id: 6, cols: ('gender', 'age', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: gender, id: 6, cols: ('gender', 'workclass', 'Y')
Injected 33 errors (2.19%). Best value: -0.31642
target_col: gender, id: 6, cols: ('gender', 'education', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: gender, id: 6, cols: ('gender', 'marital', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: gender, id: 6, cols: ('gender', 'relationship', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: gender, id: 6, cols: ('gender', 'race', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817
target_col: gender, id: 6, cols: ('gender', 'hours', 'Y')
Injected 54 errors (3.58%). Best value: -0.42817


Expanding candidate ['hours', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: hours, id: 7, cols: ('hours', 'age', 'Y')
Injected 150 errors (9.95%). Best value: -0.22095
target_col: hours, id: 7, cols: ('hours', 'workclass', 'Y')
Injected 138 errors (9.15%). Best value: -0.21652
target_col: hours, id: 7, cols: ('hours', 'education', 'Y')
Injected 150 errors (9.95%). Best value: -0.21682
target_col: hours, id: 7, cols: ('hours', 'marital', 'Y')
Injected 139 errors (9.22%). Best value: -0.25618
target_col: hours, id: 7, cols: ('hours', 'relationship', 'Y')
Injected 150 errors (9.95%). Best value: -0.22095
target_col: hours, id: 7, cols: ('hours', 'race', 'Y')
Injected 150 errors (9.95%). Best value: -0.22095
target_col: hours, id: 7, cols: ('hours', 'gender', 'Y')
Injected 150 errors (9.95%). Best value: -0.22095


Expanding candidate ['education', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: education, id: 2, cols: ('education', 'age', 'Y')
Injected 150 errors (9.95%). Best value: -0.21673
target_col: education, id: 2, cols: ('education', 'workclass', 'Y')
Injected 113 errors (7.49%). Best value: -0.20555
target_col: education, id: 2, cols: ('education', 'marital', 'Y')
Injected 150 errors (9.95%). Best value: -0.21657
target_col: education, id: 2, cols: ('education', 'relationship', 'Y')
Injected 150 errors (9.95%). Best value: -0.21673
target_col: education, id: 2, cols: ('education', 'race', 'Y')
Injected 150 errors (9.95%). Best value: -0.21673
target_col: education, id: 2, cols: ('education', 'gender', 'Y')
Injected 84 errors (5.57%). Best value: -0.22375
target_col: education, id: 2, cols: ('education', 'hours', 'Y')
Injected 150 errors (9.95%). Best value: -0.21673
----------- ROUND BEST -----------
[(['gender', 'age', 'Y'], -0.4281736355969331), (['gender', 'education', 'Y'], -0.4281736355969331), (['gender', 'marital', 'Y'], -0.4281736355969331)]
-----

In [14]:
r = top_results[0]
# r[0] holds the pattern's columns; r[1] is indexed elsewhere in this notebook as
# (metric value, dirty X_train, dirty y_train), so the metric value is r[1][0].
# Subtracting r[1] itself would subtract a tuple from a float.
print(f'Missing data in column {r[0][0]} depending on columns {r[0]} could lead to an unfairness increase of {clean_eod - r[1][0]}')

Missing data in column gender depending on columns ('gender', 'Y') could lead to an unfairness increase of 0.23652730330363736


**Evaluation for loaded pipeline**

In [11]:
# Same AUC-targeted beam search, but run against the predefined `loaded_pipeline`.
# Rebinds `top_results`, replacing the results of the previous search.
# NOTE(review): the earlier AUC search passes a clean metric value right after `budget`;
# none is passed here -- confirm that argument is optional in run_beam_search.
top_results = run_beam_search(X_train_orig, X_test_orig, y_train, y_test, loaded_pipeline, auc, budget, top_k=top_k)

Start Beam Search...


Beam search rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Round 1 candidates:   0%|          | 0/1 [00:00<?, ?it/s]

Expanding candidate ['Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: age, id: 0, cols: ('age', 'Y')
Injected 150 errors (9.95%). Best value: 0.80716
target_col: workclass, id: 1, cols: ('workclass', 'Y')
Injected 109 errors (7.23%). Best value: 0.75343
target_col: education, id: 2, cols: ('education', 'Y')
Injected 150 errors (9.95%). Best value: 0.76626
target_col: marital, id: 3, cols: ('marital', 'Y')
Injected 150 errors (9.95%). Best value: 0.80883
target_col: relationship, id: 4, cols: ('relationship', 'Y')
Injected 150 errors (9.95%). Best value: 0.80735
target_col: race, id: 5, cols: ('race', 'Y')
Injected 28 errors (1.86%). Best value: 0.80663
target_col: gender, id: 6, cols: ('gender', 'Y')
Injected 150 errors (9.95%). Best value: 0.81241
target_col: hours, id: 7, cols: ('hours', 'Y')
Injected 150 errors (9.95%). Best value: 0.80263
----------- ROUND BEST -----------
[(['workclass', 'Y'], 0.7534277478380749), (['education', 'Y'], 0.766258018206884), (['hours', 'Y'], 0.8026250726338456)]
----------------------------------


Round 2 candidates:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding candidate ['workclass', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: workclass, id: 1, cols: ('workclass', 'age', 'Y')
Injected 143 errors (9.48%). Best value: 0.73349
target_col: workclass, id: 1, cols: ('workclass', 'education', 'Y')
Injected 147 errors (9.75%). Best value: 0.73582
target_col: workclass, id: 1, cols: ('workclass', 'marital', 'Y')
Injected 84 errors (5.57%). Best value: 0.77963
target_col: workclass, id: 1, cols: ('workclass', 'relationship', 'Y')
Injected 125 errors (8.29%). Best value: 0.72810
target_col: workclass, id: 1, cols: ('workclass', 'race', 'Y')
Injected 125 errors (8.29%). Best value: 0.72810
target_col: workclass, id: 1, cols: ('workclass', 'gender', 'Y')
Injected 150 errors (9.95%). Best value: 0.80474
target_col: workclass, id: 1, cols: ('workclass', 'hours', 'Y')
Injected 139 errors (9.22%). Best value: 0.72703


Expanding candidate ['education', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: education, id: 2, cols: ('education', 'age', 'Y')
Injected 118 errors (7.82%). Best value: 0.73701
target_col: education, id: 2, cols: ('education', 'workclass', 'Y')
Injected 70 errors (4.64%). Best value: 0.78648
target_col: education, id: 2, cols: ('education', 'marital', 'Y')
Injected 45 errors (2.98%). Best value: 0.80133
target_col: education, id: 2, cols: ('education', 'relationship', 'Y')
Injected 150 errors (9.95%). Best value: 0.77698
target_col: education, id: 2, cols: ('education', 'race', 'Y')
Injected 57 errors (3.78%). Best value: 0.79954
target_col: education, id: 2, cols: ('education', 'gender', 'Y')
Injected 84 errors (5.57%). Best value: 0.76932
target_col: education, id: 2, cols: ('education', 'hours', 'Y')
Injected 84 errors (5.57%). Best value: 0.76932


Expanding candidate ['hours', 'Y']:   0%|          | 0/8 [00:00<?, ?it/s]

target_col: hours, id: 7, cols: ('hours', 'age', 'Y')
Injected 150 errors (9.95%). Best value: 0.76325
target_col: hours, id: 7, cols: ('hours', 'workclass', 'Y')
Injected 118 errors (7.82%). Best value: 0.76924
target_col: hours, id: 7, cols: ('hours', 'education', 'Y')
Injected 150 errors (9.95%). Best value: 0.73168
target_col: hours, id: 7, cols: ('hours', 'marital', 'Y')
Injected 150 errors (9.95%). Best value: 0.73345
target_col: hours, id: 7, cols: ('hours', 'relationship', 'Y')
Injected 150 errors (9.95%). Best value: 0.68493
target_col: hours, id: 7, cols: ('hours', 'race', 'Y')
Injected 150 errors (9.95%). Best value: 0.77500
target_col: hours, id: 7, cols: ('hours', 'gender', 'Y')
Injected 150 errors (9.95%). Best value: 0.77481
----------- ROUND BEST -----------
[(['hours', 'relationship', 'Y'], 0.6849320375075483), (['workclass', 'hours', 'Y'], 0.727027766067746), (['workclass', 'relationship', 'Y'], 0.7281044560152218)]
----------------------------------
Beam Search execu

In [12]:
r = top_results[0]
# The baseline must come from the pipeline that was searched. `clean_auc` belongs
# to the custom `pipeline`, so compute the clean AUC of `loaded_pipeline` here.
# Assumes loaded_pipeline takes (X_train, y_train, X_test) like `pipeline`, since
# run_beam_search receives it in the same argument position -- confirm.
loaded_clean_auc = auc(X_test_orig, y_test, loaded_pipeline(X_train_orig, y_train, X_test_orig))
# r[1] is indexed elsewhere in this notebook as (metric value, dirty X_train,
# dirty y_train), so the metric value is r[1][0].
print(f'Missing data in column {r[0][0]} depending on columns {r[0]} could lead to an AUC drop of {loaded_clean_auc - r[1][0]}')

Missing data in column hours depending on columns ('hours', 'relationship', 'Y') could lead to an AUC drop of 0.13995602091854753
