In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

# Using Two Year Data

In [10]:
df = pd.read_csv("../data/compas-scores-two-years.csv")

In [11]:
df = df[["decile_score","race","two_year_recid"]]

In [12]:
df.head()

Unnamed: 0,decile_score,race,two_year_recid
0,1,Other,0
1,3,African-American,1
2,4,African-American,1
3,8,African-American,0
4,1,Other,0


In [13]:
#only looking at African American vs Caucasian
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

## Feature Engineering

In [14]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [15]:
#taking care of nan in numerical columns
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)

In [16]:
#dealing with categorical columns to prepare for sklearn
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,race_African-American,race_Caucasian,race_Other
0,0,0,1
1,1,0,0
2,1,0,0
3,1,0,0
4,0,0,1


In [17]:
#combining categorical and numerical columns to make final dataframe
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,decile_score,two_year_recid,race_African-American,race_Caucasian,race_Other
0,1.0,0.0,0,0,1
1,3.0,1.0,1,0,0
2,4.0,1.0,1,0,0
3,8.0,0.0,1,0,0
4,1.0,0.0,0,0,1


# Logistic Regression

In [18]:
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [19]:
model = LogisticRegression(class_weight = "balanced")

In [20]:
cross_val_score(
    model, X, y, scoring = "accuracy", cv = 5
).mean()

0.6537256236285363

In [21]:
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.7027428852441769

In [22]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [23]:
#removing the race columns before predicting and fitting
X_test_addback = X_test[['race_African-American','race_Caucasian','race_Other']]

drops = ['race_African-American','race_Caucasian','race_Other']
X_train = X_train.drop(drops, axis=1)
X_test = X_test.drop(drops, axis=1)

In [24]:
#added target and prediction to existing df
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

X_test["target"] = y_test.tolist()
X_test["prediction"] = predictions

In [25]:
X_rejoin = pd.concat([X_test, X_test_addback], axis = 1)

## False Positives and False Negatives as bias

In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[550, 273],
       [224, 396]], dtype=int64)

In [27]:
X_rejoin['race_African-American'].sum()

731

In [28]:
X_rejoin['race_Caucasian'].sum()

505

In [29]:
#manipulate the existing df to only show false positive rows
false_positives = X_rejoin[(X_rejoin.target == 0.0) & (X_rejoin.prediction == 1.0)]
false_positives

Unnamed: 0,decile_score,target,prediction,race_African-American,race_Caucasian,race_Other
5952,5.0,0.0,1.0,0,1,0
676,9.0,0.0,1.0,1,0,0
6240,8.0,0.0,1.0,0,1,0
5227,6.0,0.0,1.0,1,0,0
5926,5.0,0.0,1.0,0,1,0
...,...,...,...,...,...,...
2440,9.0,0.0,1.0,1,0,0
864,6.0,0.0,1.0,0,1,0
2818,8.0,0.0,1.0,1,0,0
371,8.0,0.0,1.0,1,0,0


In [30]:

print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false positive count: 159
African American false positive rate is : 0.21751025991792067


Caucasian false positive count: 80
Caucasian false positive rate is : 0.15841584158415842


In [31]:
#manipulating data to show false negative rates
false_negatives = X_rejoin[(X_rejoin.target == 1.0) & (X_rejoin.prediction == 0.0)]
false_negatives

Unnamed: 0,decile_score,target,prediction,race_African-American,race_Caucasian,race_Other
3238,4.0,1.0,0.0,1,0,0
3858,1.0,1.0,0.0,1,0,0
132,2.0,1.0,0.0,0,1,0
4985,4.0,1.0,0.0,0,0,1
1042,2.0,1.0,0.0,0,1,0
...,...,...,...,...,...,...
2609,3.0,1.0,0.0,1,0,0
730,4.0,1.0,0.0,1,0,0
290,2.0,1.0,0.0,0,1,0
7088,4.0,1.0,0.0,1,0,0


In [32]:
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false negative count: 96
African American false negative rate is : 0.13132694938440492


Caucasian false negative count: 88
Caucasian false negative rate is : 0.17425742574257425


# Key Takeaways

- African Americans have a higher false positive rate whereas Caucasians have a higher false negative rate
- This means that low risk African Americans are more likely to be classified as high risk, and high risk Caucasians are more likely to be classified as low risk.