In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
tf.logging.set_verbosity(tf.logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta, FairAdapt, LearnedFairRepresentations
from aif360.sklearn.inprocessing import AdversarialDebiasing, ExponentiatedGradientReduction, GridSearchReduction
from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, RejectOptionClassifier, PostProcessingMeta, RejectOptionClassifierCV
from aif360.sklearn.datasets import fetch_adult
from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr
from aif360.sklearn.metrics import generalized_fnr, difference

# from glob import glob
# import json,os

2023-06-09 22:58:59.326622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-09 22:58:59.443763: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-09 22:59:00.337308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-09 22:59:00.337395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [2]:
import json,os
from glob import glob
import traceback


def writejson(path,arr,name):
    """Serialize predictions to ``<path>/preds/<name>`` as a JSON list.

    Args:
        path: Dataset directory. A trailing slash is optional.
        arr: Array-like of predictions. Objects exposing ``tolist()`` (e.g.
            NumPy arrays) are converted with it; anything else is passed
            through ``list``.
        name: File name to create inside the ``preds`` sub-directory.

    Raises:
        OSError: If the output directory cannot be created or the file cannot
            be written. Only the "directory already exists" case is tolerated.
    """
    out_dir = os.path.join(path, 'preds')
    # exist_ok=True tolerates an existing directory while still surfacing
    # real failures (permissions, invalid path) instead of hiding them.
    os.makedirs(out_dir, exist_ok=True)

    values = arr.tolist() if hasattr(arr, 'tolist') else list(arr)

    # The context manager closes the handle even if serialization fails.
    with open(os.path.join(out_dir, name), 'w') as f:
        json.dump(values, f)

In [3]:
# Dataset directories; each is expected to hold a train.csv and a test.csv.
paths = [
    'data/datasets/publiccov_ca/',
    'data/datasets/employment_ca/',
    'data/datasets/law_school/',
    'data/datasets/diabetes/',
]

# Column names for the header-less CSVs, aligned position-by-position with
# `paths`. The last name in each list is used as the prediction target.
cnames = [
    # publiccov_ca
    [
        'AGEP', 'SCHL', 'MAR', 'SEX', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL',
        'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'PINCP', 'ESR', 'FER',
        'RAC1P', 'PUBCOV',
    ],
    # employment_ca
    [
        'AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL',
        'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P', 'ESR',
    ],
    # law_school
    [
        'zfygpa', 'zgpa', 'DOB_yr', 'weighted_lsat_ugpa', 'cluster_tier',
        'family_income', 'lsat', 'ugpa', 'isPartTime', 'sex', 'race',
        'pass_bar',
    ],
    # diabetes
    [
        'race', 'sex', 'age', 'admissiontypeid', 'dischargedispositionid',
        'admissionsourceid', 'timeinhospital', 'numlabprocedures',
        'numprocedures', 'nummedications', 'numberoutpatient',
        'numberemergency', 'numberinpatient', 'diag1', 'diag2', 'diag3',
        'numberdiagnoses', 'maxgluserum', 'A1Cresult', 'metformin',
        'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
        'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted',
    ],
]

In [4]:
def _read_split(csv_path, cols, index_col):
    """Read one header-less CSV split, name its columns and index it by `index_col`."""
    df = pd.read_csv(csv_path, header=None)
    df.columns = cols
    # The index is taken from the raw column before whitespace is stripped.
    df.index = df[index_col]
    return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)


def _fit_score_save(model, X_train, y_train, X_test, y_test, path, const, filename):
    """Fit `model`, print its test score, then write its test predictions to JSON."""
    model.fit(X_train, y_train)
    # Score is evaluated before predict(), in the same order as the original code.
    print(const, model.score(X_test, y_test))
    writejson(path, model.predict(X_test), filename)


def task(path,cols,num,vi):
    """Train two fairness-constrained in-processing models on one dataset.

    Loads ``train.csv`` / ``test.csv`` from `path`, one-hot encodes the
    object-typed columns, fits ExponentiatedGradientReduction and
    GridSearchReduction (both wrapping a liblinear LogisticRegression), prints
    each test score and stores the test predictions through `writejson`.

    Args:
        path: Dataset directory, concatenated directly with ``'train.csv'`` /
            ``'test.csv'`` (so it needs a trailing slash).
        cols: Column names for the header-less CSV files; the last one is the
            target column.
        num: Run identifier, used only in the output file names.
        vi: Passed as ``eps`` to ExponentiatedGradientReduction and as
            ``constraint_weight`` to GridSearchReduction; also part of the
            output file names.

    Returns:
        The tuple ``(path, "Done")``.
    """
    print(path)

    # The sex column is upper-case in some datasets and lower-case in others.
    ss = 'SEX' if 'SEX' in cols else 'sex'
    tgt = cols[-1]

    train_df = _read_split(path+'train.csv', cols, ss)
    test_df = _read_split(path+'test.csv', cols, ss)

    X_train = train_df.drop([tgt],axis=1)
    X_test = test_df.drop([tgt],axis=1)

    # Encode the labels of both splits together so a given class always maps
    # to the same integer, even when one split is missing a class.
    codes, _ = pd.concat([train_df[tgt], test_df[tgt]]).factorize(sort=True)
    n_train = len(train_df)
    y_train = pd.Series(codes[:n_train], index=train_df.index)
    y_test = pd.Series(codes[n_train:], index=test_df.index)

    # Fit the encoder on train+test so categories seen only in the test split
    # do not make transform() fail.
    X_merged = pd.concat([X_train,X_test])

    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output` -- confirm against the installed version before upgrading.
    ohe = make_column_transformer(
        (OneHotEncoder(sparse=False), X_merged.dtypes == 'object'),
        remainder='passthrough', verbose_feature_names_out=False)
    ohe.fit(X_merged)
    feature_names = ohe.get_feature_names_out()

    X_train = pd.DataFrame(ohe.transform(X_train), columns=feature_names, index=X_train.index)
    X_test = pd.DataFrame(ohe.transform(X_test), columns=feature_names, index=X_test.index)

    # One-hot column names of the protected attribute.
    prot_attr = [ss+'_Female',ss+'_Male']
    # Other constraint names previously listed here: 'DemographicParity',
    # 'TruePositiveRateParity', 'FalsePositiveRateParity', 'ErrorRateParity'.
    consts = ['EqualizedOdds']
    suffix = '_'+str(num)+'_violation_'+str(vi)+'.json'

    #### ExponentiatedGradientReduction. (Inproc.)
    try:
        for const in consts:
            EGR = ExponentiatedGradientReduction(prot_attr=prot_attr, estimator=LogisticRegression(solver='liblinear'), constraints = const, eps = vi)
            _fit_score_save(EGR, X_train, y_train, X_test, y_test, path, const, 'egr_pred_'+const+suffix)
    except Exception:
        # Best effort: report the failure and continue with the next method.
        # KeyboardInterrupt is deliberately not caught so the run can be stopped.
        traceback.print_exc()

    #### GridSearchReduction. (Inproc.)
    # NOTE(review): `vi` is passed as `constraint_weight`, and the recorded
    # output shows the same GSR score for every `vi` -- confirm this is the
    # intended parameter.
    try:
        for const in consts:
            GSR = GridSearchReduction(prot_attr=prot_attr, estimator=LogisticRegression(solver='liblinear'), constraints = const, constraint_weight = vi)
            _fit_score_save(GSR, X_train, y_train, X_test, y_test, path, const, 'gsr_pred_'+const+suffix)
    except Exception:
        traceback.print_exc()

    return path,"Done"


In [5]:
# Values swept as the `vi` argument of task().
violations = [
    0.01,
    0.05,
    0.10,
    0.15,
    0.20,
]

In [6]:
# Number of repeated runs per (dataset, violation) pair; the run index is
# forwarded to task() and ends up in the output file names.
N_RUNS = 10

for path,cols in zip(paths,cnames):
    for i in range(N_RUNS):
        for vi in violations:
            try:
                task(path,cols,i,vi)
            except Exception:
                # Log and move on to the next configuration. Catching only
                # Exception lets KeyboardInterrupt stop the whole sweep.
                traceback.print_exc()

data/datasets/publiccov_ca/
EqualizedOdds 0.6952834614413049
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6938760780917326
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6962578037602396
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.698711702933853
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6951030276785392
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6950308541734329
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6938760780917326
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6963299772653458
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.698459095665981
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6954278084515174
EqualizedOdds 0.6923965212370539
data/datasets/publiccov_ca/
EqualizedOdds 0.6949225939157735
E

In [None]:
# from multiprocessing.pool import ThreadPool as Pool

# pool = Pool(10)
# for result in pool.imap_unordered(task, paths):
#     print(result)