This notebook runs all the functions from notebooks 0 to 2 (e.g. 0_generate_datasets.ipynb). For additional information, please refer to the repository documentation.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a global filter with no category, so warnings raised
# by the modelling libraries are hidden too -- confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

# numpy / pandas are imported here but not referenced in the visible cells.
import numpy as np
import pandas as pd
# Put the sibling `modules` directory on the import path; the path is relative,
# so it resolves against the notebook's working directory.
import sys
sys.path.append('../modules')
# Project-local module; the cells below call sb.create_dataset, sb.flip_labels,
# sb.create_fair_impact and sb.run_predictions.
import aftSandBox as sb

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# ---------------------------------------------------------------------------
# Classifier imports, gathered in one place for this cell.
# ---------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
import sklearn.svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ---------------------------------------------------------------------------
# Candidate estimators. Every model uses its library defaults except the SVC,
# which is configured explicitly below.
# ---------------------------------------------------------------------------

# Linear models
logr = LogisticRegression()
ridge = RidgeClassifier()

# Discriminant analysis
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()

# Naive Bayes
gnb = GaussianNB()

# Support vector machine: linear kernel with probability estimates enabled
svm = sklearn.svm.SVC(kernel='linear', C=1.0, probability=True)

# Tree ensembles (scikit-learn)
rforest = RandomForestClassifier()
adaboost = AdaBoostClassifier()
gradboost = GradientBoostingClassifier()

# Neural network
mlp = MLPClassifier()

# Nearest neighbours
knn = KNeighborsClassifier()

# Gradient-boosting libraries
xgb = XGBClassifier()
lgbm = LGBMClassifier()

# Models handed to the experiment runner, in evaluation order.
# NOTE(review): `ridge`, `svm` and `lgbm` are instantiated above but are not
# part of this list -- confirm the omission is deliberate.
algos = [
    logr,
    rforest,
    gnb,
    qda,
    gradboost,
    mlp,
    knn,
    xgb,
    lda,
    adaboost,
]

In [3]:
from tqdm import tqdm

# how many times to run the experiment
n_experiments = 3

# Single source of truth for the results file shared by every prediction run.
# Previously this literal was repeated in each run_predictions call.
RESULTS_FILE = '../results/results_110224.csv'


def _prediction_experiments(study_dir):
    """Return the prediction experiments for one study, in execution order.

    Each dict holds only the keyword arguments that differ between the
    `sb.run_predictions` calls; the arguments common to all experiments are
    supplied at the call site. Fresh list objects are built on every call so
    no argument list is shared between experiments or studies.

    Parameters
    ----------
    study_dir : str
        Output directory of the study, e.g. '../output/study0'.

    Returns
    -------
    list[dict]
        One keyword-argument dict per experiment.
    """
    flipped_dir = f'{study_dir}/flipped'
    direm_dir = f'{study_dir}/direm'
    return [
        # unfair predictions (see repo notes for more info)
        dict(
            experiment_name='unfair',
            data_loc=flipped_dir,
            X_features_drop=['sample_id', 'gender', 'accept', 'protected_id', 'flipped_accept'],
        ),
        # fair predictions (see repo notes for more info): same inputs, with oversampling enabled
        dict(
            experiment_name='fair_oversample',
            data_loc=flipped_dir,
            X_features_drop=['sample_id', 'accept', 'gender', 'protected_id', 'flipped_accept'],
            oversample=True,
        ),
        # reads the output written by sb.create_fair_impact instead of the flipped data
        dict(
            experiment_name='fair_dir',
            data_loc=direm_dir,
            X_features_drop=['sample_id', 'accept', 'gender', 'protected_id', 'flipped_accept'],
        ),
        # fair predictions (see repo notes for more info): additionally drops proc_feature_0..2
        # NOTE(review): presumably these are the protected-attribute features -- confirm
        dict(
            experiment_name='fair_ftu',
            data_loc=flipped_dir,
            X_features_drop=['sample_id', 'accept', 'gender', 'protected_id', 'flipped_accept',
                             'proc_feature_0', 'proc_feature_1', 'proc_feature_2'],
        ),
    ]


# run entire study n_experiments times
for num in tqdm(range(n_experiments), desc="Working", ascii=True, position=0, leave=True):

    # every artefact of this study lives under one directory
    study_dir = f'../output/study{num}'

    # generate dataset
    print(f'Generating datasets for study {num}')
    sb.create_dataset(sample_sizes=[2000, 5000, 10000, 15000, 20000],
                      n_features=10,
                      n_informative=10,
                      n_classes=2,
                      class_separations=[0.05, 0.1, 0.5, 1, 5, 10, 15, 20],
                      p_weights=[0.55, 0.35, 0.10],
                      p_feature_size=3,
                      p_informative=3,
                      p_label='gender',
                      save_path=study_dir,
                      log=True
                      )

    # flip labels (see repo notes for more info)
    print(f'Flipping labels for study {num}')
    sb.flip_labels(target_col='accept',
                   protected_col='gender',
                   protected_classes_to_flip=[1, 2],
                   flip_percentages=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                   data_loc=f'{study_dir}/',  # trailing slash kept exactly as before
                   save_path=f'{study_dir}/flipped',
                   log=True
                   )

    # remove disparate impact
    print(f'Removing disparate impact for study {num}')
    sb.create_fair_impact(data_loc=f'{study_dir}/flipped',
                          save_path=f'{study_dir}/direm',
                          drop_cols=['accept', 'sample_id', 'protected_id'],
                          maj_group=0,
                          min_group=[1, 2],
                          protected_class='gender',
                          label=['flipped_accept'],
                          disparate_repair_level=1.0,
                          )

    # run every prediction experiment with the shared settings; only the
    # per-experiment differences come from the spec dict
    for experiment in _prediction_experiments(study_dir):
        print(f"Running {experiment['experiment_name']} predictions for study {num}")
        sb.run_predictions(y_target=['flipped_accept'],
                           scimodels=algos,
                           protected_col='gender',
                           ground_truth_label='accept',
                           results_file=RESULTS_FILE,
                           study_num=f'study{num}',
                           log=True,
                           **experiment
                           )

Working:   0%|          | 0/3 [00:00<?, ?it/s]

Generating datasets for study 0
Flipping labels for study 0
Removing disparate impact for study 0
Running unfair predictions for study 0


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_oversample predictions for study 0


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_dir predictions for study 0


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_ftu predictions for study 0


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Working:  33%|###3      | 1/3 [7:34:45<15:09:31, 27285.86s/it]

Generating datasets for study 1
Flipping labels for study 1
Removing disparate impact for study 1
Running unfair predictions for study 1


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_oversample predictions for study 1


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_dir predictions for study 1


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_ftu predictions for study 1


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Working:  67%|######6   | 2/3 [14:41:49<7:18:28, 26308.30s/it]

Generating datasets for study 2
Flipping labels for study 2
Removing disparate impact for study 2
Running unfair predictions for study 2


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_oversample predictions for study 2


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_dir predictions for study 2


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Running fair_ftu predictions for study 2


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Working: 100%|##########| 3/3 [21:43:47<00:00, 26075.69s/it]  
