In [1]:
import os,sys,json
from glob import glob

# Work from the project root so the relative 'data/...' and 'SavedShap/' paths
# used further down resolve, and put the robustfairnesscode directory first on
# the import path.
PROJECT_ROOT = '/home/avijit/projects/Awareness_vs_Unawareness'
os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT + "/robustfairnesscode")

In [2]:
import warnings
warnings.filterwarnings('ignore')

from robustfairnesscode import data, losses, optimization, model, utils, dro_training, softweights_training
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
import shap
import traceback




In [3]:
def writejson(path,arr,name):
    """Serialize `arr` as JSON to ``<path>preds/<name>``.

    Args:
        path: Directory prefix. It is concatenated as-is with ``'preds/'``, so it
            must already end with a path separator.
        arr: Any JSON-serializable object.
        name: File name to create inside the ``preds/`` directory.

    Raises:
        OSError: If the output directory cannot be created or the file cannot
            be opened for writing.
        TypeError: If `arr` is not JSON-serializable.
    """
    out_dir = path+'preds/'
    # exist_ok tolerates only the "already there" case; real failures such as
    # permission errors surface here instead of being silently swallowed.
    os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the handle even when json.dump raises.
    with open(out_dir+name,'w') as out_file:
        json.dump(arr,out_file)

In [4]:
# Dataset directories. The key 0 is the base credit_tw directory; every other
# key equals the number embedded in its 'gender_flip_labels<key>_version0'
# directory name.
_CREDIT_TW_DIR = 'data/datasets/credit_tw/'
paths = {0: _CREDIT_TW_DIR}
paths.update({
    flip: _CREDIT_TW_DIR + 'synthetic/gender_flip_labels' + str(flip) + '_version0/'
    for flip in (0.2, 0.4, 0.6, 0.8)
})

In [5]:
def _sum_shap_by_feature_group(values, feature_names):
    """Collapse per-column SHAP values into one column per feature group.

    A column's group is the part of its name before the first underscore, so
    one-hot columns such as ``SEX_Female`` and ``SEX_Male`` are summed into a
    single ``SEX`` column. Groups keep the order in which they first appear.

    Args:
        values: 2-D array-like of SHAP values, one row per explained sample.
            Only the first ``len(feature_names)`` columns are read; any extra
            trailing columns are dropped from the result.
        feature_names: Name of column ``i`` of `values`, for each ``i``.

    Returns:
        ``(grouped_values, group_names)`` where ``grouped_values`` has shape
        ``(n_samples, n_groups)``.
    """
    groups = {}
    for col, fname in enumerate(feature_names):
        groups.setdefault(fname.split('_')[0], []).append(col)

    values = np.asarray(values)
    if not groups:
        return np.empty((values.shape[0], 0)), []
    grouped = np.column_stack([values[:, cols].sum(axis=1) for cols in groups.values()])
    return grouped, list(groups)


def dothings(model,name,data=None,feature_names=None,sample_size=500):
    """Explain `model` with SHAP, pickle the grouped explanation and plot it.

    The model's first prediction output is thresholded at 0 to a 0/1 label, and
    SHAP values for that label are computed on a random sample of `data`.
    Per-column values are then summed per feature group (name prefix before the
    first underscore), written to ``SavedShap/<name>.pkl`` and shown as a bar
    plot.

    Args:
        model: Object exposing ``predictions_tensor`` and
            ``feed_dict_helper(dataframe)``.
        name: Base name (without extension) of the pickle file.
        data: DataFrame used both as SHAP background and as the pool the
            explained rows are sampled from. Defaults to the notebook-level
            ``test_df``.
        feature_names: Column names used for grouping. Defaults to the
            notebook-level ``FEATURE_NAMES``.
        sample_size: Maximum number of rows to explain; capped at ``len(data)``.
    """
    if data is None:
        data = test_df
    if feature_names is None:
        feature_names = FEATURE_NAMES

    # The session is only needed while SHAP queries the model; the context
    # manager releases it afterwards instead of leaking one session per call.
    with tf.Session() as session:
        # NOTE(review): running the initializers in a brand-new session assigns
        # every graph variable its initial value. Confirm that `model` keeps its
        # trained weights some other way; otherwise this explains an untrained
        # network.
        session.run((tf.global_variables_initializer(),tf.local_variables_initializer()))

        def predict_labels(frame):
            scores = session.run(model.predictions_tensor,feed_dict=model.feed_dict_helper(frame))
            return [0 if float(score[0]) < 0 else 1 for score in scores]

        explainer = shap.Explainer(predict_labels, data)
        # Cap the sample so frames with fewer rows than `sample_size` still work.
        shap_values = explainer(data.sample(n=min(sample_size, len(data))))

    grouped_values, group_names = _sum_shap_by_feature_group(shap_values.values, feature_names)
    shap_values.values = grouped_values
    shap_values.feature_names = group_names

    os.makedirs('SavedShap', exist_ok=True)
    with open('SavedShap/'+name+'.pkl','wb') as out_file:
        pickle.dump(shap_values,out_file)

    shap.plots.bar(shap_values,max_display=18)

In [6]:
# Column names for the header-less train.csv / test.csv files; the last entry
# is the label column.
RAW_COLUMNS = ['LIMITBAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY0', 'PAY2',
               'PAY3', 'PAY4', 'PAY5', 'PAY6', 'BILLAMT1', 'BILLAMT2',
               'BILLAMT3', 'BILLAMT4', 'BILLAMT5', 'BILLAMT6', 'PAYAMT1',
               'PAYAMT2', 'PAYAMT3', 'PAYAMT4', 'PAYAMT5', 'PAYAMT6',
               'DEFAULT_PAY']
LABEL_COLUMN = "DEFAULT_PAY"
PROTECTED_COLUMNS = ['SEX_Female','SEX_Male']
PROXY_COLUMNS = PROTECTED_COLUMNS


def _strip_object_columns(frame):
    """Return `frame` with surrounding whitespace removed from object-dtype columns."""
    return frame.apply(lambda col: col.str.strip() if col.dtype == "object" else col)


# For every dataset variant: load and one-hot encode the splits, then train a
# soft-weights model and a DRO model and save/plot a SHAP explanation of each.
for frac in paths:
    path = paths[frac]

    # Both splits get the same whitespace clean-up so the one-hot encoder sees
    # identical category spellings in train and test.
    train_df = pd.read_csv(path+'train.csv',header=None)
    train_df.columns = RAW_COLUMNS
    train_df = _strip_object_columns(train_df)

    test_df = pd.read_csv(path+'test.csv',header=None)
    test_df.columns = RAW_COLUMNS
    test_df = _strip_object_columns(test_df)

    X_train = train_df.drop([LABEL_COLUMN],axis=1)
    X_test = test_df.drop([LABEL_COLUMN],axis=1)

    # Labels become integer codes. Each split is factorized on its own, so the
    # codes only line up when both splits contain the same set of label values.
    Y_train = pd.Series(train_df[LABEL_COLUMN].factorize(sort=True)[0], index=train_df.index)
    Y_test = pd.Series(test_df[LABEL_COLUMN].factorize(sort=True)[0], index=test_df.index)

    # Fit the encoder on train+test together so both splits end up with the
    # same one-hot columns.
    X_merged = pd.concat([X_train,X_test])

    ohe = make_column_transformer(
        (OneHotEncoder(sparse=False), X_merged.dtypes == 'object'),
        remainder='passthrough', verbose_feature_names_out=False)
    ohe.fit(X_merged)

    encoded_columns = ohe.get_feature_names_out()
    # `train_df` / `test_df` are rebound to the encoded frames; `dothings`
    # reads `test_df` and `FEATURE_NAMES` from the notebook namespace.
    train_df = pd.DataFrame(ohe.transform(X_train), columns=encoded_columns, index=X_train.index)
    test_df = pd.DataFrame(ohe.transform(X_test), columns=encoded_columns, index=X_test.index)

    train_df[LABEL_COLUMN] = Y_train
    test_df[LABEL_COLUMN] = Y_test

    print(train_df.columns)

    FEATURE_NAMES = list(train_df.columns)
    FEATURE_NAMES.remove(LABEL_COLUMN)

    # --- soft-weights model ---
    valmain2,best_learning_rate_theta2,best_learning_rate_lambda2,best_learning_rate_W2 = softweights_training.get_results_for_learning_rates(
        train_df, test_df, FEATURE_NAMES,
        PROTECTED_COLUMNS, PROXY_COLUMNS, LABEL_COLUMN, num_loops = 1, constraint='tpr_and_fpr')

    swmodel = softweights_training.get_model_for_learning_rates(
        train_df, test_df,  FEATURE_NAMES, PROTECTED_COLUMNS, PROXY_COLUMNS, LABEL_COLUMN,
        learning_rate_theta = best_learning_rate_theta2,
        learning_rate_lambda = best_learning_rate_lambda2,
        learning_rate_W = best_learning_rate_W2, num_loops = 1,constraint='tpr_and_fpr')

    name = 'credit_tw_soft_'+str(frac)

    dothings(swmodel,name)

    # --- DRO model ---
    valmain,best_learning_rate_theta,best_learning_rate_lambda,best_learning_rate_p_list = dro_training.get_results_for_learning_rates(
        train_df, test_df, FEATURE_NAMES,
        PROTECTED_COLUMNS, PROXY_COLUMNS, LABEL_COLUMN, num_loops = 1,constraint='tpr_and_fpr')

    # Named `dro_model` rather than `model` so the `model` module imported from
    # robustfairnesscode is not overwritten.
    dro_model = dro_training.get_model_for_learning_rates(
        train_df, test_df,  FEATURE_NAMES, PROTECTED_COLUMNS, PROXY_COLUMNS, LABEL_COLUMN,
        learning_rate_theta = best_learning_rate_theta,
        learning_rate_lambda = best_learning_rate_lambda,
        learning_rate_p_list = best_learning_rate_p_list, num_loops = 1,constraint='tpr_and_fpr')

    name = 'credit_tw_mdro_'+str(frac)

    dothings(dro_model,name)

Index(['SEX_Female', 'SEX_Male', 'LIMITBAL', 'EDUCATION', 'MARRIAGE', 'AGE',
       'PAY0', 'PAY2', 'PAY3', 'PAY4', 'PAY5', 'PAY6', 'BILLAMT1', 'BILLAMT2',
       'BILLAMT3', 'BILLAMT4', 'BILLAMT5', 'BILLAMT6', 'PAYAMT1', 'PAYAMT2',
       'PAYAMT3', 'PAYAMT4', 'PAYAMT5', 'PAYAMT6', 'DEFAULT_PAY'],
      dtype='object')
Split 0 of 10
Time since start: 0.005113124847412109
Starting optimizing learning rate theta: 0.100, learning rate lambda: 1.000, learning rate W: 0.100
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where







KeyboardInterrupt: 