In [15]:
import copy
import tqdm
import optuna
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load_dataset import load
from classifier import *
from utils import *
from metrics import *  # include fairness and corresponding derivatives
from API_Design_a import MissingValueError, SamplingError, Injector, DuplicateError, LabelError, OutlierError
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import mutual_info_score, auc, roc_curve, roc_auc_score, f1_score, accuracy_score
from scipy.stats import wasserstein_distance
from optuna.samplers import *
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.experimental import enable_iterative_imputer

In [16]:
# ignore all the warnings
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Experiment configuration: the dataset key handed to load() and the name of
# the sensitive-attribute column.
dataset, sens_attr = 'adult', 'gender'

In [18]:
# Fetch the train/test features and targets for the configured dataset.
(X_train, X_test, y_train, y_test) = load(dataset)

In [19]:
# Work on deep copies so the frames returned by load() stay unmodified.
X_train_orig, X_test_orig = copy.deepcopy(X_train), copy.deepcopy(X_test)

# Hold out 25% of the training rows (features and targets) as a validation set.
X_train_orig, X_val_orig, y_train, y_val = train_test_split(
    X_train_orig,
    y_train,
    test_size=0.25,
    random_state=seed,
)

In [20]:
# train_test_split keeps the original (shuffled) row labels. Renumber every
# feature frame and every target series from 0 so that label-based and
# positional indexing agree within each split.
X_train_orig, X_val_orig, X_test_orig = (
    X_train_orig.reset_index(drop=True),
    X_val_orig.reset_index(drop=True),
    X_test_orig.reset_index(drop=True),
)
# y_val is reset together with y_train/y_test; leaving it untouched would keep
# its shuffled labels while X_val_orig is renumbered, so any label-aligned
# operation between the two would pair the wrong rows.
y_train, y_val, y_test = (
    y_train.reset_index(drop=True),
    y_val.reset_index(drop=True),
    y_test.reset_index(drop=True),
)

## Default pipeline

In [21]:
# Display the feature column names of the training frame.
X_train_orig.columns

Index(['age', 'workclass', 'education', 'marital', 'relationship', 'race',
       'gender', 'hours'],
      dtype='object')

In [22]:
# clean data
clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=50)
clf.fit(X_train_orig, y_train)
baseline_auc = roc_auc_score(y_test, clf.predict_proba(X_test_orig)[:, 1])
baseline_auc

0.8523087647506661

In [None]:
# Imputation study: for one column at a time, blank out an increasing number
# of training values, repair them with each imputer, train a decision tree on
# the repaired frame, and record its accuracy/AUC on the test set together
# with the Wasserstein distance between the original and the imputed column.
X_train, X_test, y_train, y_test = load(dataset)
X_train_orig, X_test_orig = X_train.copy(), X_test.copy()

imputers = {
    'IterativeImputerAsc': IterativeImputer(max_iter=1, random_state=42, imputation_order='ascending'),
    'IterativeImputerDesc': IterativeImputer(max_iter=1, random_state=42, imputation_order='descending'),
    'MeanImputer': SimpleImputer(strategy='mean'),
    'MedianImputer': SimpleImputer(strategy='median'),
    'KNN5Imputer': KNNImputer(n_neighbors=5),
    'KNN10Imputer': KNNImputer(n_neighbors=10)
}

columns_to_nan = ['education', 'race', 'marital']
nan_counts = range(1000, 10001, 1000)

# results[column][imputer][metric] holds one value per entry of nan_counts,
# appended in the order the counts are iterated.
results = {col: {imputer: {'ACC': [], 'AUC': [], 'w_distance': []} for imputer in imputers}
           for col in columns_to_nan}

# Seeded generator: without it the set of blanked rows (and therefore every
# recorded metric) changes on each run, making the comparison irreproducible.
nan_rng = np.random.default_rng(42)

for col in tqdm.tqdm(columns_to_nan):
    for count in nan_counts:
        # Pick `count` distinct row labels and blank out `col` on a fresh copy,
        # so corruption never accumulates across iterations.
        nan_indices = nan_rng.choice(X_train.index.to_numpy(), size=count, replace=False)
        X_train_naned = X_train.copy()
        X_train_naned.loc[nan_indices, col] = np.nan

        for name, imputer in imputers.items():

            imputed_train = imputer.fit_transform(X_train_naned)
            # Rebuild a DataFrame from the imputer's array output, keeping the
            # original column names and row labels.
            X_train_imputed = pd.DataFrame(imputed_train,
                                           columns=X_train_naned.columns,
                                           index=X_train_naned.index)

            clf = DecisionTreeClassifier(random_state=42)
            clf.fit(X_train_imputed, y_train)

            # Evaluate on the test features as loaded (no values are blanked there).
            y_pred_proba = clf.predict_proba(X_test)[:, 1]
            auc_score = roc_auc_score(y_test, y_pred_proba)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            # How far the imputed column's distribution drifted from the original one.
            w_distance = wasserstein_distance(X_train_orig[col].to_numpy(),
                                              X_train_imputed[col].to_numpy())

            results[col][name]['ACC'].append(accuracy)
            results[col][name]['AUC'].append(auc_score)
            results[col][name]['w_distance'].append(w_distance)

Now we use the learned injector to verify