In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
tf.logging.set_verbosity(tf.logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta, FairAdapt, LearnedFairRepresentations
from aif360.sklearn.inprocessing import AdversarialDebiasing, ExponentiatedGradientReduction, GridSearchReduction
from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, RejectOptionClassifier, PostProcessingMeta, RejectOptionClassifierCV
from aif360.sklearn.datasets import fetch_adult
from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr
from aif360.sklearn.metrics import generalized_fnr, difference

# from glob import glob
# import json,os

2023-01-19 11:12:54.795386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-19 11:12:54.934637: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-19 11:12:59.075887: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-19 11:12:59.076003: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [2]:
import json,os
from glob import glob


def writejson(path,arr,name):
    try:
        os.mkdir(path+'preds/')
    except:
        pass
    f = open(path+'preds/'+name,'w')
    json.dump(arr.tolist(),f)
    f.close()

In [3]:
path = 'data/datasets/uci_adult/'
synthfols = glob("data/datasets/uci_adult/synthetic/*_version0/")
paths = [path]
paths.extend(synthfols)

In [4]:
def task(path):
    
#     if os.path.exists(path+'preds/baseline_pred.json') == False:
    print(path)

    train_df = pd.read_csv(path+'train.csv',header=None)

    train_df.columns = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain',
           'capital-loss', 'hours-per-week', 'native-country','income']

    train_df.index = train_df['sex']

    train_df = train_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    test_df = pd.read_csv(path+'test.csv',header=None)

    test_df.columns = ['age', 'workclass', 'fnlwgt','education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain',
           'capital-loss', 'hours-per-week', 'native-country','income']

    test_df.index = test_df['sex']

    test_df = test_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)


    X_train = train_df.drop(['income'],axis=1)
    X_test = test_df.drop(['income'],axis=1)

    y_train = pd.Series(train_df['income'])
    y_test = pd.Series(test_df['income'])

    y_train = pd.Series(y_train.factorize(sort=True)[0], index=y_train.index)
    y_test = pd.Series(y_test.factorize(sort=True)[0], index=y_test.index)

    X_merged = pd.concat([X_train,X_test])

    ohe = make_column_transformer(
        (OneHotEncoder(sparse=False), X_merged.dtypes == 'object'),
        remainder='passthrough', verbose_feature_names_out=False)

    X_merged_temp  = pd.DataFrame(ohe.fit_transform(X_merged), columns=ohe.get_feature_names_out(), index=X_merged.index)

    X_train  = pd.DataFrame(ohe.transform(X_train), columns=ohe.get_feature_names_out(), index=X_train.index)
    X_test = pd.DataFrame(ohe.transform(X_test), columns=ohe.get_feature_names_out(), index=X_test.index)


    #### BASELINE
    y_pred_baseline = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)
    writejson(path,y_pred_baseline,'baseline_pred.json')

    #### Adv. Reweighting (Preproc.)

    rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'),
                     reweigher=Reweighing('sex'))

    params = {'estimator__C': [1, 10]}

    clf = GridSearchCV(rew, params, scoring='accuracy', cv=5)
    clf.fit(X_train, y_train)
    y_pred_advrew = clf.predict(X_test)
    writejson(path,y_pred_advrew,'adv_rew_pred.json')

    #### LearnedFairRepresentations (Preproc.)

    LFR = LearnedFairRepresentations(prot_attr='sex')
    LFR.fit(X_train, y_train)

    y_pred_lfr = LFR.predict(X_test)
    writejson(path,y_pred_lfr,'lfr_pred.json')

    #### Adv Deb. (Inproc.)

    adv_deb = AdversarialDebiasing(prot_attr='sex')
    adv_deb.fit(X_train, y_train)
    adv_deb.score(X_test, y_test)

    y_pred_adv_deb = adv_deb.predict(X_test)
    writejson(path,y_pred_adv_deb,'adv_deb_pred.json')

    adv_deb.sess_.close()

    #### ExponentiatedGradientReduction. (Inproc.)

    consts = ['ErrorRateParity']#['DemographicParity','EqualizedOdds','TruePositiveRateParity','FalsePositiveRateParity','ErrorRateParity']
    for const in consts:
        EGR = ExponentiatedGradientReduction(prot_attr=['sex_Female','sex_Male'], estimator=LogisticRegression(solver='liblinear'), constraints = const)
        EGR.fit(X_train, y_train)
        print(const,EGR.score(X_test, y_test))
        y_pred_egr = EGR.predict(X_test)
        writejson(path,y_pred_egr,'egr_pred_'+const+'.json')

    #### GridSearchReduction. (Inproc.)

    consts = ['ErrorRateParity']#['DemographicParity','EqualizedOdds','TruePositiveRateParity','FalsePositiveRateParity','ErrorRateParity']
    for const in consts:
        GSR = GridSearchReduction(prot_attr=['sex_Female','sex_Male'], estimator=LogisticRegression(solver='liblinear'), constraints = const)
        GSR.fit(X_train, y_train)
        print(const,GSR.score(X_test, y_test))
        y_pred_gsr = GSR.predict(X_test)
        writejson(path,y_pred_gsr,'gsr_pred_'+const+'.json')



    X_train_temp = X_train.set_index(['sex_Male'], drop = False)
    y_train_temp = y_train.copy()
    y_train_temp.index = X_train_temp.index

    X_test_temp = X_test.set_index(['sex_Male'], drop = False)
    y_test_temp = y_test.copy()
    y_test_temp.index = X_test_temp.index

    #### Cal. Eq.Odds (Postproc.)

    consts = ['weighted']#['fnr','fpr','weighted']
    for const in consts:
        cal_eq_odds = CalibratedEqualizedOdds(prot_attr='sex_Male', cost_constraint=const)
        postproc = PostProcessingMeta(estimator=LogisticRegression(solver='liblinear'), postprocessor=cal_eq_odds)
        postproc.fit(X_train_temp, y_train_temp)
        print(accuracy_score(y_test_temp, postproc.predict(X_test_temp)))
        y_pred_caleq = postproc.predict(X_test_temp)
        writejson(path,y_pred_caleq,'caleq_pred_'+const+'.json')

    #### RejectOptionClassifier (Postproc.)

    consts = ['average_odds']#['statistical_parity', 'average_odds', 'equal_opportunity']
    for const in consts:
        rocv = PostProcessingMeta(LogisticRegression(solver='liblinear'), RejectOptionClassifierCV('sex_Male', scoring=const))
        rocv.fit(X_train_temp, y_train_temp)
        print(accuracy_score(y_test_temp, rocv.predict(X_test_temp)))
        y_pred_rocv = rocv.predict(X_test_temp)
        writejson(path,y_pred_rocv,'rocv_pred_'+const+'.json')
        
    
    return path,"Done"


In [5]:
#task(paths[0])

In [6]:
from multiprocessing.pool import ThreadPool as Pool

pool = Pool(10)
for result in pool.imap_unordered(task, paths):
    print(result)

2023-01-19 11:13:58.082877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

2023-01-19 11:14:02.022987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 45677 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b1:00.0, compute capability: 8.6
2023-01-19 11:14:02.058489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46704 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:d9:00.0, compute capability: 8.6
2023-01-19 11:14:02.060508: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 45677 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b1:00.0, compute capability: 8.6
2023-01-19 11:14:02.060725: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46704 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:d9:00.0, 

#### FairAdapt (Preproc.)

In [None]:
# XY_df = pd.concat([X_train, y_train], axis=1)
# adj_mat = pd.DataFrame(
#     np.zeros((len(train_df.columns), len(train_df.columns)), dtype=int),
#     index=train_df.columns.values,
#     columns=train_df.columns.values
# )

# # Construct the adjacency matrix of the causal graph
# adj_mat.at[
#     ["sex", "age", "native-country"],
#     ["marital-status", "education-num", "workclass", "hours-per-week",
#      "occupation", "annual-income"]
# ] = 1
# adj_mat.at[
#     "marital-status",
#     ["education-num", "workclass", "hours-per-week", "occupation",
#      "annual-income"]
# ] = 1
# adj_mat.at[
#     "education-num",
#     ["workclass", "hours-per-week", "occupation", "annual-income"]
# ] = 1
# adj_mat.at[
#     ["workclass", "hours-per-week", "occupation"],
#     "annual-income"
# ] = 1

# FA = FairAdapt(prot_attr='sex', adj_mat = adj_mat)


# Xf_train, yf_train, Xf_test = FA.fit_transform(X_train, y_train, X_test)
# # y_pred_advrew = clf.predict(X_test)
# # writejson(y_pred_advrew,'adv_rew_pred.json')


#### TAKING TOO LONG TO PROCESS #####

In [139]:
set(X_train.columns).intersection(set(X_test.columns))

{'age',
 'capital-gain',
 'capital-loss',
 'education-num',
 'fnlwgt',
 'hours-per-week'}

In [136]:
X_test.columns

Index(['workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_10th',
       ...
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia', 'age', 'fnlwgt',
       'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'],
      dtype='object', length=107)

In [None]:
print('still runnung')

In [2]:
import aif360

In [3]:
aif360.__file__

'/home/avijit/.local/lib/python3.7/site-packages/aif360/__init__.py'