In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.ensemble import BaggingRegressor # default classifier is a DT
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from mlxtend.regressor import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)}) 
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

# Using Two Year Data

In [138]:
df = pd.read_csv("data/compas-scores-two-years.csv")

In [111]:
pd.set_option('max_columns', 53)

In [112]:
#drop unecessary columns
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas','first','last','name','dob','c_jail_in','c_jail_out','c_offense_date','c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest','r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid','score_text','screening_date','v_score_text','v_screening_date','in_custody','out_custody','id']
df = df.drop(drop_columns,axis=1)

In [113]:
df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,is_recid,is_violent_recid,type_of_assessment,decile_score.1,v_type_of_assessment,v_decile_score,priors_count.1,end,event,two_year_recid
0,Male,69,Greater than 45,Other,1,0,0,0,Risk of Recidivism,1,Risk of Violence,1,0,327,0,0
1,Male,34,25 - 45,African-American,3,0,1,1,Risk of Recidivism,3,Risk of Violence,1,0,159,1,1
2,Male,24,Less than 25,African-American,4,4,1,0,Risk of Recidivism,4,Risk of Violence,3,4,63,0,1
3,Male,23,Less than 25,African-American,8,1,0,0,Risk of Recidivism,8,Risk of Violence,6,1,1174,0,0
4,Male,43,25 - 45,Other,1,2,0,0,Risk of Recidivism,1,Risk of Violence,1,2,1102,0,0


In [114]:
#only looking at african american vs caucasian, make every other race = other
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

In [115]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [116]:
#taking care of nan in numerical columns
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)
numerical.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,end,event,two_year_recid
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,327.0,0.0,0.0
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,159.0,1.0,1.0
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,63.0,0.0,1.0
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,1174.0,0.0,0.0
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,1102.0,0.0,0.0


In [117]:
#dealing with categorical columns to prepare for sklearn
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other,type_of_assessment_Risk of Recidivism,v_type_of_assessment_Risk of Violence
0,0,1,0,1,0,0,0,1,1,1
1,0,1,1,0,0,1,0,0,1,1
2,0,1,0,0,1,1,0,0,1,1
3,0,1,0,0,1,1,0,0,1,1
4,0,1,1,0,0,0,0,1,1,1


In [118]:
#combining categorical and numerical columns to make final dataframe
df = pd.merge(numerical, categorical, left_index=True, right_index=True)
df.head()

Unnamed: 0,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,end,event,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other,type_of_assessment_Risk of Recidivism,v_type_of_assessment_Risk of Violence
0,69.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,327.0,0.0,0.0,0,1,0,1,0,0,0,1,1,1
1,34.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,159.0,1.0,1.0,0,1,1,0,0,1,0,0,1,1
2,24.0,4.0,4.0,1.0,0.0,4.0,3.0,4.0,63.0,0.0,1.0,0,1,0,0,1,1,0,0,1,1
3,23.0,8.0,1.0,0.0,0.0,8.0,6.0,1.0,1174.0,0.0,0.0,0,1,0,0,1,1,0,0,1,1
4,43.0,1.0,2.0,0.0,0.0,1.0,1.0,2.0,1102.0,0.0,0.0,0,1,1,0,0,0,0,1,1,1


# Now that the dataframe is clean, use ensemble methods

In [119]:
target_variable = 'two_year_recid'
independent_variables = df.drop(columns=target_variable).columns
X = df[independent_variables]
y = df[target_variable]

In [120]:
def evaluate_model(estimator):
    cv_results = cross_validate(estimator, X, y, scoring='accuracy', n_jobs=-1, cv=10, return_train_score=True)
    return pd.DataFrame(cv_results).abs().mean().to_dict()

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = results_df[col].apply(np.mean)
    return results_df

In [121]:
RESULTS = {}

In [122]:
#testing decision tree and logistic regression, and adding them to results df
RESULTS["tree"] = evaluate_model(DecisionTreeClassifier())
RESULTS["log_reg"] = evaluate_model(LogisticRegression())

pd.DataFrame.from_dict(RESULTS).T

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746


In [123]:
#testing bagging classifier with 10 estimators
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
estimator_bagging_10 = BaggingClassifier(n_estimators=10)
RESULTS["bagging_tree_10"] = evaluate_model(estimator_bagging_10)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983


In [124]:
#testing bagging classifier with 100 estimators
estimator_bagging_100 = BaggingClassifier(n_estimators=100)
RESULTS["bagging_tree_100"] = evaluate_model(estimator_bagging_100)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983
bagging_tree_100,1.216006,0.038341,0.987385,0.999985


In [125]:
#testing random forest classifier with 100 estimators
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
estimator_randomforest = RandomForestClassifier(n_estimators=100)

RESULTS["randomforest_100"] = evaluate_model(estimator_randomforest)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983
bagging_tree_100,1.216006,0.038341,0.987385,0.999985
randomforest_100,0.618716,0.025942,0.988216,0.999985


In [126]:
#testing extra tree classifier as base estimator for the bagging classifier
from sklearn.tree import ExtraTreeClassifier

estimator_bagging_random_tree = BaggingClassifier(n_estimators=100,
                                    base_estimator=ExtraTreeClassifier())
RESULTS["bagging_random_tree"] = evaluate_model(estimator_bagging_random_tree)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983
bagging_tree_100,1.216006,0.038341,0.987385,0.999985
randomforest_100,0.618716,0.025942,0.988216,0.999985
bagging_random_tree,0.643726,0.046181,0.979621,1.0


In [127]:
#boosting
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
estimator_adaboost = AdaBoostClassifier(n_estimators=100)

RESULTS["adaboost_100"]  = evaluate_model(estimator_adaboost)
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983
bagging_tree_100,1.216006,0.038341,0.987385,0.999985
randomforest_100,0.618716,0.025942,0.988216,0.999985
bagging_random_tree,0.643726,0.046181,0.979621,1.0
adaboost_100,1.003249,0.043447,0.987107,0.990143


In [128]:
#xgboost classifier
from xgboost import XGBRegressor, XGBClassifier
estimator_xgboost = XGBClassifier(n_estimators=500)

RESULTS["xgboost_500"] = evaluate_model(estimator_xgboost)

display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score
tree,0.019874,0.002803,0.980591,1.0
log_reg,0.159523,0.002693,0.979068,0.979746
bagging_tree_10,0.13454,0.008081,0.986692,0.998983
bagging_tree_100,1.216006,0.038341,0.987385,0.999985
randomforest_100,0.618716,0.025942,0.988216,0.999985
bagging_random_tree,0.643726,0.046181,0.979621,1.0
adaboost_100,1.003249,0.043447,0.987107,0.990143
xgboost_500,3.401243,0.016875,0.987661,0.998475


### From the results table, we can see that the random forest classifier is the best model as it has the highest test score which tested accuracy as well as the highest train score, which tested the train data's performance

In [129]:
#select random forest as the ideal model for predictions
model = RandomForestClassifier(n_estimators=100)

In [130]:
#high auc, thus the model is accurate
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.998136162981188

In [131]:
#splitting the data to prepare for predictions
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [132]:
#added target and prediction to existing df
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)
X = X_test.reset_index().copy() # make a copy of indices and data
X["target"] = y_test.tolist()
X["prediction"] = predictions
X.head()

Unnamed: 0,index,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,end,event,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other,type_of_assessment_Risk of Recidivism,v_type_of_assessment_Risk of Violence,target,prediction
0,308,62.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,444.0,0.0,0,1,0,1,0,0,1,0,1,1,0.0,0.0
1,381,28.0,3.0,1.0,0.0,0.0,3.0,3.0,1.0,1094.0,0.0,0,1,1,0,0,1,0,0,1,1,0.0,0.0
2,3238,57.0,4.0,8.0,1.0,0.0,4.0,2.0,8.0,265.0,1.0,0,1,0,1,0,1,0,0,1,1,1.0,1.0
3,2312,19.0,10.0,1.0,1.0,0.0,10.0,10.0,1.0,711.0,1.0,0,1,0,0,1,1,0,0,1,1,1.0,1.0
4,251,37.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,844.0,0.0,1,0,1,0,0,0,0,1,1,1,0.0,0.0


In [133]:
#used a confusion matrix to identiy cases of false positives, searching for bias against african americans
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[810,  13],
       [  2, 618]], dtype=int64)

In [134]:
#manipulate the existing df to only show false positive rows
false_positives = X[(X.target == 0.0) & (X.prediction == 1.0)]
false_positives

Unnamed: 0,index,age,decile_score,priors_count,is_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,end,event,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other,type_of_assessment_Risk of Recidivism,v_type_of_assessment_Risk of Violence,target,prediction
20,6240,21.0,8.0,1.0,1.0,0.0,8.0,9.0,1.0,132.0,0.0,1,0,0,0,1,0,1,0,1,1,0.0,1.0
166,5418,34.0,7.0,8.0,1.0,0.0,7.0,3.0,8.0,690.0,0.0,0,1,1,0,0,1,0,0,1,1,0.0,1.0
206,3563,25.0,9.0,2.0,1.0,0.0,9.0,7.0,2.0,498.0,0.0,0,1,1,0,0,1,0,0,1,1,0.0,1.0
320,1922,26.0,4.0,3.0,1.0,0.0,4.0,4.0,3.0,105.0,0.0,0,1,1,0,0,0,1,0,1,1,0.0,1.0
409,3010,29.0,10.0,5.0,1.0,0.0,10.0,8.0,5.0,853.0,1.0,0,1,1,0,0,1,0,0,1,1,0.0,1.0
427,3194,59.0,3.0,3.0,1.0,0.0,3.0,2.0,3.0,732.0,1.0,0,1,0,1,0,1,0,0,1,1,0.0,1.0
624,5937,29.0,5.0,2.0,1.0,0.0,5.0,4.0,2.0,101.0,0.0,0,1,1,0,0,0,1,0,1,1,0.0,1.0
875,2574,52.0,5.0,7.0,1.0,0.0,5.0,1.0,7.0,406.0,0.0,0,1,0,1,0,1,0,0,1,1,0.0,1.0
1091,5787,40.0,3.0,2.0,1.0,1.0,3.0,3.0,2.0,161.0,0.0,0,1,1,0,0,1,0,0,1,1,0.0,1.0
1267,4180,28.0,2.0,2.0,1.0,0.0,2.0,3.0,2.0,170.0,0.0,0,1,1,0,0,1,0,0,1,1,0.0,1.0


In [135]:
false_positives['race_African-American'].value_counts()

1    8
0    5
Name: race_African-American, dtype: int64

### As shown in the table above, a majority of the false positives were African American cases, showing a bias in the COMPAS data
