In [1]:
import pandas as pd 
import numpy as np

In [2]:
# load data
compas_scores = pd.read_csv('compas/cox-violent-parsed_filt.csv')

In [3]:
compas_scores.columns

Index(['id', 'name', 'first', 'last', 'sex', 'dob', 'age', 'age_cat', 'race',
       'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
       'c_days_from_compas', 'c_charge_degree', 'c_charge_desc', 'is_recid',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'violent_recid', 'is_violent_recid',
       'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc',
       'type_of_assessment', 'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'priors_count.1', 'event'],
      dtype='object')

In [4]:
# clean null
compas_scores = compas_scores.dropna(subset=['race', 'age','juv_fel_count','juv_misd_count','priors_count','is_recid','c_charge_degree'])



In [5]:
#extract type of crime
compas_scores["type"] = compas_scores["c_charge_degree"].map(lambda x: "fel" if "F" in x else "other" if "MO" in x else "misd" if "M" in x else "other")


In [9]:
compas_scores['race'].unique()

array(['Other', 'African-American', 'Caucasian', 'Hispanic', 'Asian',
       'Native American'], dtype=object)

In [None]:
# #simplify race to binary case
compas = compas_scores[compas_scores['race'].isin(['African-American','Caucasian'])]
# compas_scores['race'] = compas_scores['race'].map(lambda x: "African-American" if x == "African-American" else "Other")




In [14]:
#simplify label
compas['score_text'] = compas['score_text'].map(lambda x: 0 if x == "Low" else 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compas['score_text'] = compas['score_text'].map(lambda x: 0 if x == "Low" else 1)


In [33]:
# reset index 
compas = compas.reset_index().drop(columns=['index','id'])

In [34]:
# extract dataframes

# features 
features = compas[['juv_fel_count','juv_misd_count','priors_count','type','age']]

# sensitive attribute
sensitive = compas['race']

#predicted (O)
y = compas['score_text']

#actual outcome (Y)
ground_truth = compas['is_recid']

## Fairness through Unawareness

In [36]:
# for logistic regression: one-hot encode type variable
X = pd.concat([features,pd.get_dummies(features['type'])],axis=1).drop(columns='type')

In [37]:
#scale/normalize values 

from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33)


In [39]:
# actual recividism outcome for test dataset 
y_truth = ground_truth[y_test.index]

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [44]:
# model predictions 
y_pred = clf.predict(X_test)

In [109]:
# add index 
y_pred = pd.Series(y_pred,index=y_test.index)

In [120]:
from sklearn.metrics import accuracy_score, f1_score
accuracy_score(y_test, y_pred)

0.7438561438561438

In [121]:
f1_score(y_test, y_pred, average='macro')

0.7336659142047282

In [141]:
def data_parser(y_pred, y_truth, sensitive):
    
    # test dataset index for African Americans   
    protected = sensitive[sensitive=='African-American'].index.intersection(y_truth.index)
    # test dataset index for Caucasians
    unprotected = sensitive[sensitive=='Caucasian'].index.intersection(y_truth.index)
    #actual recividism outcome + model predictions for African Americans
    y_truth_protected = y_truth[protected]
    y_pred_protected = y_pred[protected]
    #actual recividism outcome + model predictions for Caucasians
    y_truth_unprotected = y_truth[unprotected]
    y_pred_unprotected = y_pred[unprotected]
    return y_truth_protected, y_pred_protected, y_truth_unprotected, y_pred_unprotected

In [139]:
def ppv_diff(y_pred, y_truth, sensitive):
    
    #extract relevant data
    y_truth_protected, y_pred_protected, y_truth_unprotected, y_pred_unprotected = data_parser(y_pred, y_truth, sensitive)
    # index for African Americans predicted to recidivate 
    p_predicted_true = y_pred_protected[y_pred_protected == 1].index
    # portion of African Americans predicted to recidivate who actually recidivated 
    p_ppv = y_truth_protected[p_predicted_true].sum()/len(y_truth_protected[p_predicted_true])
    # index for Caucasians predicted to recidivate
    up_predicted_value = y_pred_unprotected[y_pred_unprotected == 1].index
    # portion of Caucasians predicted to recidivate who actually recidivated
    up_ppv = y_truth_unprotected[up_predicted_value].sum()/len(y_truth_unprotected[up_predicted_value])
    
    return abs(p_ppv-up_ppv)

In [142]:
ppv_diff(y_pred,y_truth, sensitive)

0.06416249726251244

In [164]:
def eo_diff(y_pred,y_truth,sensitive):
    y_truth_protected, y_pred_protected, y_truth_unprotected, y_pred_unprotected = data_parser(y_pred, y_truth, sensitive)
    total_eo_diff = 0
    # add FNR and FPR
    for i in range(0,2):
        #index for African Americans who actually didn't/did't recidivated
        p_truth_value = y_truth_protected[y_truth_protected == i].index
        #portion of African Americans who didn't/did recidivated who were predicted to/not to 
        p_eo = (y_pred_protected[p_truth_value] == 1-i).sum()/len(y_pred_protected[p_truth_value])
        #index for Caucasians who actually didn't/did't recidivated 
        up_truth_value = y_truth_unprotected[y_truth_unprotected == i].index
        #portion of Caucasians who didn't/did recidivated who were predicted to/not to 
        up_eo = (y_pred_unprotected[up_truth_value] == 1-i).sum()/len(y_pred_unprotected[up_truth_value])
        total_eo_diff+=abs(p_eo-up_eo)
    return total_eo_diff/2
    

In [165]:
eo_diff(y_pred,y_truth,sensitive)

0.24027512690644837

## Multi-Objective Optimization

In [None]:
def custom_loss(y_pred, y_true, y_outcome, sample_weights=None):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    assert len(y_true) == len(y_pred)
    
    if np.any(y_true==0):
        print("Found zeroes in y_true. MAPE undefined. Removing from set...")
        idx = np.where(y_true==0)
        y_true = np.delete(y_true, idx)
        y_pred = np.delete(y_pred, idx)
        if type(sample_weights) != type(None):
            sample_weights = np.array(sample_weights)
            sample_weights = np.delete(sample_weights, idx)
        
    if type(sample_weights) == type(None):
        return(np.mean(np.abs((y_true - y_pred) / y_true)) * 100)
    else:
        sample_weights = np.array(sample_weights)
        assert len(sample_weights) == len(y_true)
        return(100/sum(sample_weights)*np.dot(
                sample_weights, (np.abs((y_true - y_pred) / y_true))
        ))


In [None]:
class CustomLogisticClassifier:
    def __init__(self, loss_function=custom_loss, X=None, Y=None, sample_weights=None, 
                 beta_init=None, regularization=0.00012):
        self.regularization = regularization
        self.beta = None 
        self.loss_function = loss_function
        self.sample_weights = sample_weights
        self.beta_init = beta_init
        
        self.X = X
        self.Y = Y
        
    def predict(self, X):
        x_dot_weights = np.matmul(x, self.sample_weights.transpose()) + self.bias
        probabilities = self._sigmoid(x_dot_weights)
        return probabilities
    
    def model_error(self):
        error = self.loss_function(
            self.predict(self.X), self.Y, sample_weights=self.sample_weights
        )
        return(error)



        
        