In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.ensemble import BaggingRegressor # default classifier is a DT
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from mlxtend.regressor import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
# Default every figure drawn through seaborn/matplotlib to a 6x6 inch canvas.
sns.set(rc={'figure.figsize':(6,6)}) 
import warnings
# Suppress all Python warnings from here on. This also hides deprecation
# notices raised by the libraries imported above.
warnings.simplefilter("ignore")

%matplotlib inline

# Using Two Year Data

In [126]:
# Load the COMPAS two-year recidivism scores from the local data folder.
df = pd.read_csv(filepath_or_buffer="data/compas-scores-two-years.csv")

In [127]:
# Show up to 53 columns when a DataFrame is displayed. The fully qualified
# key is used because the short form 'max_columns' can match more than one
# pandas option, in which case set_option raises an OptionError.
pd.set_option('display.max_columns', 53)

In [128]:
# Drop the columns that are not needed for the analysis below.
drop_columns = [
    'compas_screening_date', 'juv_fel_count', 'juv_misd_count',
    'c_case_number', 'vr_charge_degree', 'vr_offense_date',
    'vr_charge_desc', 'c_arrest_date', 'r_case_number',
    'vr_case_number', 'start', 'juv_other_count',
    'days_b_screening_arrest', 'c_days_from_compas',
    'first', 'last', 'name', 'dob',
    'c_jail_in', 'c_jail_out', 'c_offense_date',
    'c_charge_degree', 'c_charge_desc', 'r_charge_degree',
    'r_days_from_arrest', 'r_offense_date', 'r_charge_desc',
    'r_jail_in', 'r_jail_out', 'violent_recid',
    'score_text', 'screening_date', 'v_score_text',
    'v_screening_date', 'in_custody', 'out_custody',
    'id', 'end', 'type_of_assessment', 'v_type_of_assessment',
    'is_recid', 'is_violent_recid', 'event',
    'decile_score.1', 'priors_count.1',
]
df = df.drop(columns=drop_columns)

In [129]:
# Preview the first five remaining rows.
df.head(n=5)

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Greater than 45,Other,1,0,1,0
1,Male,34,25 - 45,African-American,3,0,1,1
2,Male,24,Less than 25,African-American,4,4,3,1
3,Male,23,Less than 25,African-American,8,1,6,0
4,Male,43,25 - 45,Other,1,2,1,0


In [130]:
# Non-null entries per column; equal counts mean no column has gaps.
df.count(axis=0)

sex               7214
age               7214
age_cat           7214
race              7214
decile_score      7214
priors_count      7214
v_decile_score    7214
two_year_recid    7214
dtype: int64

In [131]:
# The comparison is African-American vs Caucasian, so the remaining listed
# race labels are folded into the existing 'Other' category.
df["race"] = df["race"].replace(
    dict.fromkeys(['Asian', 'Hispanic', 'Native American'], 'Other')
)

In [132]:
# Split the column labels by dtype: numeric ones vs everything else.
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

In [133]:
# Replace missing values in the numeric columns using the median strategy.
median_imputer = SimpleImputer(strategy="median")
numerical = pd.DataFrame(
    median_imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
)
numerical.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid
0,69.0,1.0,0.0,1.0,0.0
1,34.0,3.0,0.0,1.0,1.0
2,24.0,4.0,4.0,3.0,1.0
3,23.0,8.0,1.0,6.0,0.0
4,43.0,1.0,2.0,1.0,0.0


In [134]:
# One-hot encode the non-numeric columns so sklearn estimators can use them.
categorical = pd.get_dummies(data=df[categorical_cols])
categorical.head(n=5)

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


In [135]:
# Join the imputed numeric frame and the one-hot frame on their row index
# to build the final modelling table.
df_new = numerical.merge(categorical, left_index=True, right_index=True)
df_new.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0,0,1


# Now that the dataframe is clean, use ensemble methods

#### Identifying variables

In [136]:
# The label to predict, and every other column as a candidate feature.
target_variable = 'two_year_recid'
independent_variables = df_new.columns.drop(target_variable)
y = df_new[target_variable]
X = df_new.loc[:, independent_variables]

#### Adding a variety of models to the results dictionary to pick out the most accurate

In [137]:
def evaluate_model(estimator):
    """Cross-validate ``estimator`` on the notebook-level ``X`` and ``y``.

    Runs 10-fold cross-validation scored by accuracy on all available cores,
    recording train scores alongside test scores.

    Returns a dict mapping each column of the ``cross_validate`` output to
    the mean of its absolute values across the folds.
    """
    fold_metrics = pd.DataFrame(
        cross_validate(
            estimator,
            X,
            y,
            cv=10,
            scoring='accuracy',
            return_train_score=True,
            n_jobs=-1,
        )
    )
    averaged = fold_metrics.abs().mean()
    return averaged.to_dict()

def display_results(results):
    """Turn the accumulated results mapping into a model-by-metric table.

    Parameters
    ----------
    results : dict
        Maps a model label to a dict of metric name -> value. A value may be
        a single number or a sequence of numbers.

    Returns
    -------
    pandas.DataFrame
        One row per model label and one column per metric, with every cell
        passed through ``np.mean``.
    """
    # The outer keys arrive as columns; transpose so each model is a row.
    results_df = pd.DataFrame(results).T
    for col in results_df.columns:
        # np.mean keeps the value of a single number and averages a sequence.
        results_df[col] = results_df[col].apply(np.mean)
    return results_df

In [138]:
# Registry of cross-validation summaries, keyed by model label.
RESULTS = dict()

In [139]:
# Baselines: a single decision tree and a logistic regression, both with
# default settings, recorded in RESULTS under their labels.
for label, estimator in (
    ("tree", DecisionTreeClassifier()),
    ("log_reg", LogisticRegression()),
):
    RESULTS[label] = evaluate_model(estimator)

pd.DataFrame(RESULTS).transpose()

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298


In [140]:
# Bagging ensemble built from 10 base estimators.
from sklearn.ensemble import BaggingRegressor, BaggingClassifier

estimator_bagging_10 = BaggingClassifier(n_estimators=10)
RESULTS.update(bagging_tree_10=evaluate_model(estimator_bagging_10))
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567


In [141]:
# Same bagging ensemble, scaled up to 100 base estimators.
estimator_bagging_100 = BaggingClassifier(n_estimators=100)
RESULTS.update(bagging_tree_100=evaluate_model(estimator_bagging_100))
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567
bagging_tree_100,1.686889,0.042389,0.635567,0.932462


In [142]:
# Random forest with 100 trees.
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

estimator_randomforest = RandomForestClassifier(n_estimators=100)
RESULTS.update(randomforest_100=evaluate_model(estimator_randomforest))
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567
bagging_tree_100,1.686889,0.042389,0.635567,0.932462
randomforest_100,1.183834,0.047674,0.646099,0.932477


In [143]:
# Bagging ensemble of 100 ExtraTreeClassifier base learners.
from sklearn.tree import ExtraTreeClassifier

# The base learner is passed positionally. Its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional parameter has remained the base learner, so this form
# runs on both old and new versions.
estimator_bagging_random_tree = BaggingClassifier(ExtraTreeClassifier(),
                                                  n_estimators=100)
RESULTS["bagging_random_tree"] = evaluate_model(estimator_bagging_random_tree)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567
bagging_tree_100,1.686889,0.042389,0.635567,0.932462
randomforest_100,1.183834,0.047674,0.646099,0.932477
bagging_random_tree,0.863294,0.05146,0.646379,0.932492


In [144]:
# Boosting: AdaBoost with 100 estimators.
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier

estimator_adaboost = AdaBoostClassifier(n_estimators=100)
RESULTS.update(adaboost_100=evaluate_model(estimator_adaboost))
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567
bagging_tree_100,1.686889,0.042389,0.635567,0.932462
randomforest_100,1.183834,0.047674,0.646099,0.932477
bagging_random_tree,0.863294,0.05146,0.646379,0.932492
adaboost_100,0.789098,0.043679,0.682696,0.68766


In [145]:
# Boosting: XGBoost classifier with n_estimators=500.
from xgboost import XGBRegressor, XGBClassifier

estimator_xgboost = XGBClassifier(n_estimators=500)
RESULTS.update(xgboost_500=evaluate_model(estimator_xgboost))
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.029121,0.003691,0.610617,0.932492
log_reg,0.064625,0.001894,0.679091,0.680298
bagging_tree_10,0.178325,0.00638,0.627525,0.916567
bagging_tree_100,1.686889,0.042389,0.635567,0.932462
randomforest_100,1.183834,0.047674,0.646099,0.932477
bagging_random_tree,0.863294,0.05146,0.646379,0.932492
adaboost_100,0.789098,0.043679,0.682696,0.68766
xgboost_500,3.132824,0.015459,0.683251,0.722422


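### From the results table, we can see that XGBoost is the best model, as it has the highest cross-validated test accuracy (0.683), narrowly ahead of AdaBoost (0.683) and logistic regression (0.679). The decision tree, bagging and random forest models reach much higher train scores (0.92–0.93) but lower test scores (0.61–0.65), which indicates overfitting rather than better performance.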

#### Splitting the data, fitting and predicting the model

In [146]:
# Use a fresh XGBoost classifier, configured like the cross-validated one,
# as the model for the prediction and bias analysis that follows.
N_BOOSTING_ESTIMATORS = 500
model = XGBClassifier(n_estimators=N_BOOSTING_ESTIMATORS)

In [147]:
# Mean ROC AUC across 5 cross-validation folds. AUC measures how well the
# model ranks positives above negatives; it is not an accuracy figure.
# NOTE(review): X still contains the race dummy columns at this point.
fold_auc = cross_val_score(estimator=model, X=X, y=y, scoring="roc_auc", cv=5)
fold_auc.mean()

0.7347921811326299

In [148]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
from sklearn.model_selection import train_test_split
from sklearn import metrics

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [149]:
# Set the race indicator columns of the test rows aside, then remove them
# from both feature sets so the model is fit without them.
drops = ['race_African-American','race_Caucasian','race_Other']
X_test_addback = X_test[drops]

X_train = X_train.drop(columns=drops)
X_test = X_test.drop(columns=drops)

In [150]:
# Fit on the training rows, predict the held-out rows, then attach the true
# label and the prediction to the test frame as extra columns.
model.fit(X_train, y_train)
predictions = model.predict(X_test)

X_test = X_test.assign(target=y_test.tolist(), prediction=predictions)

In [151]:
# Put the race indicator columns back next to the labelled predictions.
X_rejoin = pd.concat(objs=[X_test, X_test_addback], axis="columns")

#### Looking for false positive and false negative rates and amounts as a sign of bias

In [152]:
# Confusion matrix of true labels vs predictions, used to identify the
# false positives and false negatives examined for racial bias below.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=predictions)

array([[612, 211],
       [259, 361]], dtype=int64)

In [153]:
# Number of test rows flagged African-American.
X_rejoin.loc[:, 'race_African-American'].sum()

731

In [154]:
# Number of test rows flagged Caucasian.
X_rejoin.loc[:, 'race_Caucasian'].sum()

505

In [155]:
# Keep only the false positives: true label 0, predicted label 1.
is_false_positive = X_rejoin.target.eq(0.0) & X_rejoin.prediction.eq(1.0)
false_positives = X_rejoin.loc[is_false_positive]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
5952,26.0,5.0,2.0,4.0,0,1,1,0,0,0.0,1.0,0,1,0
509,35.0,2.0,12.0,1.0,1,0,1,0,0,0.0,1.0,1,0,0
676,22.0,9.0,3.0,8.0,0,1,0,0,1,0.0,1.0,1,0,0
5321,21.0,4.0,1.0,6.0,0,1,0,0,1,0.0,1.0,0,1,0
4264,29.0,2.0,5.0,3.0,0,1,1,0,0,0.0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,53.0,1.0,6.0,1.0,0,1,0,1,0,0.0,1.0,0,1,0
5292,56.0,4.0,13.0,1.0,0,1,0,1,0,0.0,1.0,0,1,0
2818,21.0,8.0,1.0,8.0,0,1,0,0,1,0.0,1.0,1,0,0
371,57.0,8.0,18.0,7.0,0,1,0,1,0,0.0,1.0,1,0,0


In [159]:
# Report how the false positives split across the two racial groups.
# The total is derived from the filtered frame instead of being typed in by
# hand, so it stays correct when the data, split or model changes.
print('Total amount of False Positives are ' + str(len(false_positives)))
print('\n')
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
# NOTE(review): the denominator is every row of the group in X_rejoin, not
# only the group's true negatives, so this is the share of the group that was
# a false positive rather than the conventional FP / (FP + TN) rate.
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

Total amount of False Positives are 211


African American false positive count: 121
African American false positive rate is : 0.16552667578659372


Caucasian false positive count: 63
Caucasian false positive rate is : 0.12475247524752475


In [157]:
# Keep only the false negatives: true label 1, predicted label 0.
is_false_negative = X_rejoin.target.eq(1.0) & X_rejoin.prediction.eq(0.0)
false_negatives = X_rejoin.loc[is_false_negative]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,target,prediction,race_African-American,race_Caucasian,race_Other
3238,57.0,4.0,8.0,2.0,0,1,0,1,0,1.0,0.0,1,0,0
3858,52.0,1.0,2.0,1.0,1,0,0,1,0,1.0,0.0,1,0,0
79,27.0,7.0,1.0,8.0,0,1,1,0,0,1.0,0.0,0,1,0
132,28.0,2.0,0.0,3.0,0,1,1,0,0,1.0,0.0,0,1,0
5157,22.0,7.0,0.0,6.0,1,0,0,0,1,1.0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2609,32.0,3.0,0.0,2.0,1,0,1,0,0,1.0,0.0,1,0,0
730,45.0,4.0,0.0,2.0,0,1,0,1,0,1.0,0.0,1,0,0
290,54.0,2.0,6.0,1.0,0,1,0,1,0,1.0,0.0,0,1,0
7181,51.0,6.0,7.0,3.0,0,1,0,1,0,1.0,0.0,1,0,0


In [160]:
# Report how the false negatives split across the two racial groups.
# The total is derived from the filtered frame instead of being typed in by
# hand, so it stays correct when the data, split or model changes.
print('Total amount of False Negatives are ' + str(len(false_negatives)))
print('\n')
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
# NOTE(review): the denominator is every row of the group in X_rejoin, not
# only the group's true positives, so this is the share of the group that was
# a false negative rather than the conventional FN / (FN + TP) rate.
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

Total amount of False Negatives are 259


African American false negative count: 114
African American false negative rate is : 0.15595075239398085


Caucasian false negative count: 105
Caucasian false negative rate is : 0.2079207920792079


# Key Takeaway from Data

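- African Americans have a higher false positive rate (__16.6% vs 12.5%__) and Caucasians have a higher false negative rate (__20.8% vs 15.6%__). Each rate here is the group's error count divided by the total number of people from that group in the test set.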
***
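- This means that the model trained above (an XGBoost classifier that uses the COMPAS decile scores as input features) is more likely to wrongly flag an African American as higher risk for recidivism and to wrongly flag a Caucasian as lower risk for recidivism. Note that these figures describe this notebook's model, not the COMPAS algorithm itself.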