In [89]:
from IPython.display import Image
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 

# Using Two Year Data

In [90]:
df = pd.read_csv("data/compas-scores-two-years.csv")

In [91]:
#drop unecessary columns
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment']
df = df.drop(drop_columns,axis=1)

In [92]:
df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid
0,Male,69,Greater than 45,Other,1,0,0,0,1,1,0,0,0
1,Male,34,25 - 45,African-American,3,0,1,1,3,1,0,1,1
2,Male,24,Less than 25,African-American,4,4,1,0,4,3,4,0,1
3,Male,23,Less than 25,African-American,8,1,0,0,8,6,1,0,0
4,Male,43,25 - 45,Other,1,2,0,0,1,1,2,0,0


In [93]:
#only looking at african american vs caucasian, make every other race = other
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

In [94]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [95]:
#taking care of nan in numerical columns
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)
numerical.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,0.0,1.0
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,0.0,0.0
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0


In [96]:
#dealing with categorical columns to prepare for sklearn
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


In [97]:
#combining categorical and numerical columns to make final dataframe
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,0.0,1.0,0,1,0,0,1,1,0,0
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,0.0,0.0,0,1,0,0,1,1,0,0
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0,1,1,0,0,0,0,1


# Model with Decision Tree using the clean data

Because we are looking for bias in the race column, we drop it and add it back later after our model fits and predicts the data

In [98]:
df_addback = df_new[['race_African-American','race_Caucasian','race_Other']]
drops = ['race_African-American','race_Caucasian','race_Other']
df_new = df_new.drop(drops, axis=1)

#define independent and dependent variables
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [99]:
df_new.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1,0,1,0
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,0,1,1,0,0
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,0.0,1.0,0,1,0,0,1
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,0.0,0.0,0,1,0,0,1
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0,1,1,0,0


In [100]:
#instantiate tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier()

In [101]:
#high auc, thus the model is accurate
cross_val_score(
    tree, X, y, scoring = "roc_auc", cv = 5
).mean()

0.934939431597915

In [102]:
tree.fit(X, y)
tree.predict(X)[:10]

array([0., 1., 1., 0., 0., 0., 1., 0., 0., 1.])

In [103]:
#check which features are most important: age and is_recid are 2 high impacting factors
dict(zip(
    independent_variables,
    tree.feature_importances_
))

{'age': 0.03324380733343089,
 'decile_score': 0.009030112621378891,
 'priors_count': 0.015054366808005294,
 'is_recid': 0.8966083585913821,
 'is_violent_recid': 0.003898576126788215,
 'decile_score.1': 0.01078921586457576,
 'v_decile_score': 0.011186661393923818,
 'priors_count.1': 0.01138853594385733,
 'event': 0.0008495984874522706,
 'sex_Female': 0.0024843443666651673,
 'sex_Male': 0.003469396667039193,
 'age_cat_25 - 45': 0.0009359845073588398,
 'age_cat_Greater than 45': 0.0008298301828746098,
 'age_cat_Less than 25': 0.00023121110526767331}

In [104]:
#testing for different depths to see which one will give the best accuracy
depths = np.arange(2,10) # define the depths
results = [] # create an empty data frame for our results

for depth in depths:
    best_depth_tree = DecisionTreeClassifier(max_depth = depth) # creating an instance of a decision tree
    results.append(cross_val_score(best_depth_tree, X, 
                                   y, scoring="roc_auc", # getting the cv accuracy metric for the tree at each depth
                cv=3).mean())


In [105]:
# a depth of 5 will give the best accuracy
test = pd.DataFrame({'depths':depths, 'mean_roc_auc':results})
test.sort_values("mean_roc_auc", ascending=False)

Unnamed: 0,depths,mean_roc_auc
3,5,0.978097
2,4,0.978029
1,3,0.977966
0,2,0.976109
4,6,0.974839
5,7,0.97309
6,8,0.97152
7,9,0.967386


In [106]:
# running a model on a tree with max depth 5 for best accuracy
simple_tree = DecisionTreeClassifier(max_depth=5)
simple_tree.fit(X, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [107]:
simple_tree.fit(X, y)
predictions = simple_tree.predict(X)

In [108]:
# add predictions and race back to the dataframe
df_new['predictions'] = predictions
df_rejoin = pd.concat([df_new, df_addback], axis = 1)

In [109]:
df_rejoin.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1,0,1,0,0.0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,0,1,1,0,0,1.0,1,0,0
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,0.0,1.0,0,1,0,0,1,1.0,1,0,0
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,0.0,0.0,0,1,0,0,1,0.0,1,0,0
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0,1,1,0,0,0.0,0,0,1


In [110]:
#searching for false positives as an indicator of bias
false_positives = df_rejoin[(df_rejoin.two_year_recid == 0.0) & (df_rejoin.predictions == 1.0)]
false_positives

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,event,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
54,31.0,5.0,15.0,1.0,0.0,5.0,7.0,15.0,1.0,0.0,0,1,1,0,0,1.0,1,0,0
139,31.0,2.0,1.0,1.0,0.0,2.0,3.0,1.0,1.0,0.0,0,1,1,0,0,1.0,1,0,0
188,66.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,0,1,0,1,0,1.0,1,0,0
197,34.0,2.0,2.0,1.0,0.0,2.0,1.0,2.0,1.0,0.0,0,1,1,0,0,1.0,1,0,0
227,37.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0,1,1,0,0,1.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7070,47.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,0,1,0,1,0,1.0,1,0,0
7074,45.0,7.0,9.0,1.0,0.0,7.0,9.0,9.0,1.0,0.0,1,0,0,1,0,1.0,1,0,0
7131,28.0,6.0,11.0,1.0,0.0,6.0,5.0,11.0,1.0,0.0,0,1,1,0,0,1.0,0,1,0
7140,25.0,8.0,5.0,1.0,1.0,8.0,7.0,5.0,1.0,0.0,0,1,1,0,0,1.0,0,1,0


In [111]:
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Other false positive count: ' + str(false_positives['race_Other'].sum()))

African American false positive count: 135
Caucasian false positive count: 59
Other false positive count: 26


## With a whopping 135 counts of false positive towards African Americans, we can question the validity of this dataset