In [52]:
from IPython.display import Image
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 

# Using Two Year Data

In [53]:
df = pd.read_csv("data/compas-scores-two-years.csv")

In [54]:
#dropping unecessary columns
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment','is_recid','is_violent_recid','event',
               'decile_score.1','priors_count.1']
df = df.drop(drop_columns,axis=1)

In [55]:
df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Greater than 45,Other,1,0,1,0
1,Male,34,25 - 45,African-American,3,0,1,1
2,Male,24,Less than 25,African-American,4,4,3,1
3,Male,23,Less than 25,African-American,8,1,6,0
4,Male,43,25 - 45,Other,1,2,1,0


In [56]:
#only looking at african american vs caucasian, make every other race = other
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

In [57]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [58]:
#taking care of nan in numerical columns
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)
numerical.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid
0,69.0,1.0,0.0,1.0,0.0
1,34.0,3.0,0.0,1.0,1.0
2,24.0,4.0,4.0,3.0,1.0
3,23.0,8.0,1.0,6.0,0.0
4,43.0,1.0,2.0,1.0,0.0


In [59]:
#dealing with categorical columns to prepare for sklearn
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


In [60]:
#combining categorical and numerical columns to make final dataframe
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0,0,1


In [61]:
df_new.to_csv('data/useful-two-year.csv')

# Model with Decision Tree using the clean data

#### Because we are looking for bias in the race column, we drop it and add it back later after our model fits and predicts the data

In [62]:
df_addback = df_new[['race_African-American','race_Caucasian','race_Other']]
drops = ['race_African-American','race_Caucasian','race_Other']
df_new = df_new.drop(drops, axis=1)

#define independent and dependent variables
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [63]:
df_new.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0


#### Model starts

In [64]:
#instantiate tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier()

In [65]:
#high auc, thus the model is accurate
cross_val_score(
    tree, X, y, scoring = "roc_auc", cv = 5
).mean()

0.5957674312242001

In [66]:
tree.fit(X, y)
tree.predict(X)[:10]

array([0., 1., 1., 0., 0., 0., 1., 0., 0., 0.])

In [67]:
#check which features are most important: age and is_recid are 2 high impacting factors
dict(zip(
    independent_variables,
    tree.feature_importances_
))

{'age': 0.31911100120855435,
 'decile_score': 0.26157133397763827,
 'priors_count': 0.1946871333381646,
 'v_decile_score': 0.17086815885931203,
 'sex_Female': 0.0188426306176195,
 'sex_Male': 0.018397856923480928,
 'age_cat_25 - 45': 0.005453866790148945,
 'age_cat_Greater than 45': 0.0026012853564588495,
 'age_cat_Less than 25': 0.0084667329286224}

In [68]:
#testing for different depths to see which one will give the best accuracy
depths = np.arange(2,10) # define the depths
results = [] # create an empty data frame for our results

for depth in depths:
    best_depth_tree = DecisionTreeClassifier(max_depth = depth) # creating an instance of a decision tree
    results.append(cross_val_score(best_depth_tree, X, 
                                   y, scoring="roc_auc", # getting the cv accuracy metric for the tree at each depth
                cv=3).mean())


In [69]:
# a depth of 5 will give the best accuracy
test = pd.DataFrame({'depths':depths, 'mean_roc_auc':results})
test.sort_values("mean_roc_auc", ascending=False)

Unnamed: 0,depths,mean_roc_auc
3,5,0.723732
4,6,0.720956
2,4,0.718966
5,7,0.716018
1,3,0.710308
6,8,0.701224
0,2,0.687591
7,9,0.684569


#### Running a different model based on the best depth

In [70]:
# running a model on a tree with max depth 5 for best accuracy
simple_tree = DecisionTreeClassifier(max_depth=5)
simple_tree.fit(X, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [71]:
simple_tree.fit(X, y)
predictions = simple_tree.predict(X)

In [72]:
import graphviz
from sklearn.tree import export_graphviz

def draw_tree(tree):
    dot_data = export_graphviz(tree, out_file=None, 
                         feature_names=independent_variables, 
                               class_names=['did recidivate', 'did not recidivate'],
                         filled=True, 
                         #impurity=True,
                         rounded=True,  
                         special_characters=True,
                              proportion = True)  #trying changing proportion = False
    
    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    graph.render('tree',view=True)

In [73]:
draw_tree(simple_tree)

In [74]:
# add predictions and race back to the dataframe
df_new['predictions'] = predictions
df_rejoin = pd.concat([df_new, df_addback], axis = 1)

In [75]:
df_rejoin.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0.0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,0.0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1.0,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,0.0,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0.0,0,0,1


#### False Positives and False Negatives

In [76]:
#searching for false positives as an indicator of bias
false_positives = df_rejoin[(df_rejoin.two_year_recid == 0.0) & (df_rejoin.predictions == 1.0)]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
17,25.0,10.0,3.0,9.0,0.0,0,1,1,0,0,1.0,1,0,0
35,26.0,8.0,6.0,8.0,0.0,0,1,1,0,0,1.0,0,1,0
40,21.0,8.0,2.0,8.0,0.0,1,0,0,0,1,1.0,1,0,0
54,31.0,5.0,15.0,7.0,0.0,0,1,1,0,0,1.0,1,0,0
58,53.0,5.0,8.0,2.0,0.0,0,1,0,1,0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7158,21.0,6.0,1.0,5.0,0.0,0,1,0,0,1,1.0,1,0,0
7167,30.0,9.0,2.0,7.0,0.0,0,1,1,0,0,1.0,0,1,0
7169,22.0,5.0,0.0,7.0,0.0,0,1,0,0,1,1.0,1,0,0
7198,32.0,5.0,4.0,3.0,0.0,0,1,1,0,0,1.0,1,0,0


In [77]:
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / df_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / df_rejoin['race_Caucasian'].sum()))

African American false positive count: 538
African American false positive rate is : 0.14556277056277056


Caucasian false positive count: 210
African American false positive rate is : 0.08557457212713937


In [78]:
#searching for false negatives as an indicator of bias
false_negatives = df_rejoin[(df_rejoin.two_year_recid == 1.0) & (df_rejoin.predictions == 0.0)]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,0.0,1,0,0
9,21.0,3.0,1.0,5.0,1.0,0,1,0,0,1,0.0,0,1,0
14,47.0,1.0,1.0,1.0,1.0,1,0,0,1,0,0.0,0,1,0
22,27.0,2.0,0.0,3.0,1.0,0,1,1,0,0,0.0,0,1,0
24,24.0,4.0,1.0,5.0,1.0,0,1,0,0,1,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189,34.0,2.0,3.0,2.0,1.0,0,1,1,0,0,0.0,1,0,0
7190,24.0,3.0,2.0,4.0,1.0,0,1,0,0,1,0.0,1,0,0
7194,30.0,1.0,2.0,1.0,1.0,1,0,1,0,0,0.0,0,1,0
7207,30.0,2.0,0.0,2.0,1.0,0,1,1,0,0,0.0,1,0,0


In [79]:
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / df_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / df_rejoin['race_Caucasian'].sum()))

African American false negative count: 615
African American false negative rate is : 0.1663961038961039


Caucasian false negative count: 535
Caucasian false negative rate is : 0.2180114099429503


# Key Takeaway from Data

- African Americans have a higher false positive rate (__14.5% vs 8.5%__) and Caucasians have a higher false negative rate (__22% vs 16.6%__).
***
- This means that the COMPAS algorithm is more likely to identify an African American as higher risk for recidivism and Caucasians as lower risk for recidivism