In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

# Using Two Year Data

In [116]:
df = pd.read_csv("data/compas-scores-two-years.csv")

In [117]:
#dropping unecessary columns
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment','is_recid','is_violent_recid','event',
               'decile_score.1','priors_count.1', 'age_cat']
df = df.drop(drop_columns,axis=1)

In [118]:
df.head()

Unnamed: 0,sex,age,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Other,1,0,1,0
1,Male,34,African-American,3,0,1,1
2,Male,24,African-American,4,4,3,1
3,Male,23,African-American,8,1,6,0
4,Male,43,Other,1,2,1,0


In [138]:
#only looking at African American vs Caucasian
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

## Feature Engineering

In [120]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [121]:
#taking care of nan in numerical columns
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)

In [122]:
#dealing with categorical columns to prepare for sklearn
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,race_African-American,race_Caucasian,race_Other
0,0,1,0,0,1
1,0,1,1,0,0
2,0,1,1,0,0
3,0,1,1,0,0
4,0,1,0,0,1


In [123]:
#combining categorical and numerical columns to make final dataframe
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,0,0,1


# Logistic Regression

In [124]:
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [125]:
model = LogisticRegression(class_weight = "balanced")

In [139]:
cross_val_score(
    model, X, y, scoring = "accuracy", cv = 5
).mean()

0.6754905551022056

In [126]:
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.7322736346654095

In [127]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [128]:
#removing the race columns before predicting and fitting
X_test_addback = X_test[['race_African-American','race_Caucasian','race_Other']]

drops = ['race_African-American','race_Caucasian','race_Other']
X_train = X_train.drop(drops, axis=1)
X_test = X_test.drop(drops, axis=1)

In [129]:
#added target and prediction to existing df
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

X_test["target"] = y_test.tolist()
X_test["prediction"] = predictions

In [130]:
X_rejoin = pd.concat([X_test, X_test_addback], axis = 1)

## False Positives and False Negatives as bias

In [131]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[570, 253],
       [216, 404]], dtype=int64)

In [132]:
X_rejoin['race_African-American'].sum()

731

In [133]:
X_rejoin['race_Caucasian'].sum()

505

In [134]:
#manipulate the existing df to only show false positive rows
false_positives = X_rejoin[(X_rejoin.target == 0.0) & (X_rejoin.prediction == 1.0)]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,target,prediction,race_African-American,race_Caucasian,race_Other
5952,26.0,5.0,2.0,4.0,0,1,0.0,1.0,0,1,0
509,35.0,2.0,12.0,1.0,1,0,0.0,1.0,1,0,0
676,22.0,9.0,3.0,8.0,0,1,0.0,1.0,1,0,0
6240,21.0,8.0,1.0,9.0,1,0,0.0,1.0,0,1,0
5321,21.0,4.0,1.0,6.0,0,1,0.0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
864,22.0,6.0,1.0,7.0,1,0,0.0,1.0,0,1,0
2818,21.0,8.0,1.0,8.0,0,1,0.0,1.0,1,0,0
4105,33.0,4.0,5.0,2.0,0,1,0.0,1.0,1,0,0
371,57.0,8.0,18.0,7.0,0,1,0.0,1.0,1,0,0


In [135]:

print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false positive count: 151
African American false positive rate is : 0.20656634746922026


Caucasian false positive count: 68
Caucasian false positive rate is : 0.13465346534653466


In [136]:
#manipulating data to show false negative rates
false_negatives = X_rejoin[(X_rejoin.target == 1.0) & (X_rejoin.prediction == 0.0)]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,target,prediction,race_African-American,race_Caucasian,race_Other
3238,57.0,4.0,8.0,2.0,0,1,1.0,0.0,1,0,0
3858,52.0,1.0,2.0,1.0,1,0,1.0,0.0,1,0,0
132,28.0,2.0,0.0,3.0,0,1,1.0,0.0,0,1,0
1042,32.0,2.0,2.0,2.0,0,1,1.0,0.0,0,1,0
3353,37.0,5.0,1.0,3.0,0,1,1.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2609,32.0,3.0,0.0,2.0,1,0,1.0,0.0,1,0,0
730,45.0,4.0,0.0,2.0,0,1,1.0,0.0,1,0,0
290,54.0,2.0,6.0,1.0,0,1,1.0,0.0,0,1,0
3786,36.0,6.0,4.0,3.0,1,0,1.0,0.0,1,0,0


In [137]:
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false negative count: 92
African American false negative rate is : 0.12585499316005472


Caucasian false negative count: 87
Caucasian false negative rate is : 0.17227722772277226


# Key Takeaways

- African Americans have a higher false positive rate whereas Caucasians have a higher false negative rate
- This means that low risk African Americans are more likely to be classified as high risk, and high risk Caucasians are more likely to be classified as low risk.