In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.ensemble import BaggingRegressor # default classifier is a DT
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from mlxtend.regressor import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

In [63]:
df = pd.read_csv("data/compas-scores-two-years.csv")

In [64]:
#dropping unecessary columns
drop_columns = ['compas_screening_date',
                'c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment','is_recid','is_violent_recid','event',
               'decile_score.1','priors_count.1']
df = df.drop(drop_columns,axis=1)

In [65]:
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

In [66]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [67]:
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)

In [68]:
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


In [69]:
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,1,0,1,0,0,0,1
1,34.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,0.0,4.0,0.0,1.0,4.0,3.0,1.0,0,1,0,0,1,1,0,0
3,23.0,0.0,8.0,1.0,0.0,1.0,6.0,0.0,0,1,0,0,1,1,0,0
4,43.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0,1,1,0,0,0,0,1


In [70]:
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [71]:
model = LogisticRegression(class_weight = "balanced")

In [72]:
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.7332810022091639

In [73]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [74]:
X_test_addback = X_test[['race_African-American','race_Caucasian','race_Other']]

drops = ['race_African-American','race_Caucasian','race_Other']
X_train = X_train.drop(drops, axis=1)
X_test = X_test.drop(drops, axis=1)

In [75]:
#added target and prediction to existing df
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

X_test["target"] = y_test.tolist()
X_test["prediction"] = predictions

In [76]:
X_rejoin = pd.concat([X_test, X_test_addback], axis = 1)

In [77]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[569, 254],
       [216, 404]], dtype=int64)

In [78]:
X_rejoin['race_African-American'].sum()

731

In [79]:
X_rejoin['race_Caucasian'].sum()

505

In [80]:
#manipulate the existing df to only show false positive rows
false_positives = X_rejoin[(X_rejoin.target == 0.0) & (X_rejoin.prediction == 1.0)]
false_positives

Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
5952,26.0,0.0,5.0,0.0,0.0,2.0,4.0,0,1,1,0,0,0.0,1.0,0,1,0
509,35.0,0.0,2.0,0.0,0.0,12.0,1.0,1,0,1,0,0,0.0,1.0,1,0,0
676,22.0,0.0,9.0,0.0,1.0,3.0,8.0,0,1,0,0,1,0.0,1.0,1,0,0
6240,21.0,0.0,8.0,0.0,0.0,1.0,9.0,1,0,0,0,1,0.0,1.0,0,1,0
5321,21.0,0.0,4.0,0.0,0.0,1.0,6.0,0,1,0,0,1,0.0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864,22.0,0.0,6.0,0.0,0.0,1.0,7.0,1,0,0,0,1,0.0,1.0,0,1,0
2818,21.0,0.0,8.0,0.0,0.0,1.0,8.0,0,1,0,0,1,0.0,1.0,1,0,0
4105,33.0,0.0,4.0,0.0,0.0,5.0,2.0,0,1,1,0,0,0.0,1.0,1,0,0
371,57.0,0.0,8.0,0.0,0.0,18.0,7.0,0,1,0,1,0,0.0,1.0,1,0,0


In [81]:

print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false positive count: 149
African American false positive rate is : 0.20383036935704515


Caucasian false positive count: 70
Caucasian false positive rate is : 0.13861386138613863


In [82]:
#manipulating data to show false negative rates
false_negatives = X_rejoin[(X_rejoin.target == 1.0) & (X_rejoin.prediction == 0.0)]
false_negatives

Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
3238,57.0,0.0,4.0,0.0,0.0,8.0,2.0,0,1,0,1,0,1.0,0.0,1,0,0
3858,52.0,0.0,1.0,0.0,0.0,2.0,1.0,1,0,0,1,0,1.0,0.0,1,0,0
132,28.0,0.0,2.0,0.0,0.0,0.0,3.0,0,1,1,0,0,1.0,0.0,0,1,0
1042,32.0,0.0,2.0,0.0,0.0,2.0,2.0,0,1,1,0,0,1.0,0.0,0,1,0
3353,37.0,0.0,5.0,0.0,0.0,1.0,3.0,0,1,1,0,0,1.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2533,38.0,0.0,3.0,0.0,0.0,2.0,3.0,0,1,1,0,0,1.0,0.0,0,0,1
2609,32.0,0.0,3.0,0.0,0.0,0.0,2.0,1,0,1,0,0,1.0,0.0,1,0,0
730,45.0,0.0,4.0,0.0,0.0,0.0,2.0,0,1,0,1,0,1.0,0.0,1,0,0
290,54.0,0.0,2.0,0.0,0.0,6.0,1.0,0,1,0,1,0,1.0,0.0,0,1,0


In [83]:
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false negative count: 89
African American false negative rate is : 0.12175102599179206


Caucasian false negative count: 90
Caucasian false negative rate is : 0.1782178217821782
