In [1]:
import warnings
from copy import deepcopy
import pickle as pkl
import pandas as pd
import numpy as np

In [2]:
import imblearn.over_sampling

In [3]:
from ipywidgets import interactive, FloatSlider

In [49]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

In [7]:
# function cm_val creates an interactive confusion matrix on un-scaled data.
# function cm_val_scaled creates an interactive confusion matrix on scaled data.
from my_functions import cm_val
from my_functions import cm_val_scaled
# function y_pred_inverse extracts the predictive probability from predict_proba.
from my_functions import y_pred_inverse
# function plot_validation_curve_log plots a validation curve on a log scale.
# function plot_validation_curve_reg plots a validation curve on a default scale.
from my_functions import plot_validation_curve_log
from my_functions import plot_validation_curve_reg
# function plot_learning_curve_reg plots a learning curve on a default scale.
from my_functions import plot_learning_curve
# function plot_decision_tree uses graphviz to visualize the splits of a devision tree.
from my_functions import plot_decision_tree
# function train_and_calibrate_cv performs stratified shuffle split on a specified model,
# returning validation scores and roc/auc.
from my_functions import train_and_calibrate_cv

In [8]:
warnings.filterwarnings('ignore')

In [9]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [10]:
# pd.set_option('display.max_columns', None)

In [11]:
np.random.seed(42)

In [12]:
df = pd.read_pickle('./Pickles/df_modeling.pkl')

In [13]:
df.describe()

Unnamed: 0,age_at_incident,num_charges,sentence_years,years_until_charged,case_years,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,...,biracial,black,hispanic,unknown_race,white,current_sentence,found_guilty,m_class,other_class,x_class
count,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,...,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0,10185.0
mean,30.16,1.353,8.955,0.438,1.443,0.842,0.017,0.97,0.006,0.001,...,0.174,0.733,0.008,0.001,0.079,0.989,0.155,0.0,0.0,1.0
std,11.023,1.345,9.907,1.542,9.013,0.365,0.13,0.17,0.077,0.034,...,0.379,0.443,0.09,0.038,0.27,0.102,0.362,0.0,0.0,0.0
min,17.0,1.0,0.002,0.014,0.003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,21.0,1.0,3.5,0.101,0.542,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,27.0,1.0,6.0,0.121,1.052,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,37.0,1.0,11.0,0.186,1.83,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,80.0,29.0,405.0,28.219,902.408,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [15]:
df.columns

Index(['offense_title', 'age_at_incident', 'num_charges', 'sentence_years',
       'years_until_charged', 'case_years', 'highest_charge',
       'amended/corrected_sentence_phase', 'original_sentence_phase',
       'probation_violation_sentence_phase', 'remanded_sentence_phase',
       'resentenced_sentence_phase', 'conditional_discharge_sentence',
       'conversion_sentence', 'boot_camp_sentence', 'death_sentence',
       'jail_sentence', 'prison_sentence', 'probation_sentence',
       'supervision_sentence', 'male', 'asian', 'biracial', 'black',
       'hispanic', 'unknown_race', 'white', 'current_sentence', 'found_guilty',
       'm_class', 'other_class', 'x_class'],
      dtype='object')

In [None]:
df.info()

In [16]:
len(df[df['found_guilty'] == 0])/len(df)

0.845360824742268

In [17]:
df.groupby('found_guilty').mean()

Unnamed: 0_level_0,age_at_incident,num_charges,sentence_years,years_until_charged,case_years,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,...,asian,biracial,black,hispanic,unknown_race,white,current_sentence,m_class,other_class,x_class
found_guilty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.173,1.118,7.83,0.433,1.24,0.852,0.019,0.969,0.007,0.001,...,0.005,0.18,0.72,0.008,0.002,0.085,0.989,0.0,0.0,1.0
1,30.092,2.636,15.106,0.468,2.556,0.782,0.008,0.974,0.003,0.003,...,0.002,0.142,0.8,0.008,0.001,0.048,0.989,0.0,0.0,1.0


In [18]:
len(df)

10185

In [20]:
X = df.drop(['offense_title', 'm_class', 'x_class', 'found_guilty', 'unknown_race', 'case_years', 'other_class'], axis = 1)
y = df['found_guilty']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [23]:
df['found_guilty'].value_counts().values[0]

8610

In [24]:
df[(df['m_class'] == 1) | (df['x_class'] == 1)].groupby('found_guilty').mean()

Unnamed: 0_level_0,age_at_incident,num_charges,sentence_years,years_until_charged,case_years,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,...,asian,biracial,black,hispanic,unknown_race,white,current_sentence,m_class,other_class,x_class
found_guilty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.173,1.118,7.83,0.433,1.24,0.852,0.019,0.969,0.007,0.001,...,0.005,0.18,0.72,0.008,0.002,0.085,0.989,0.0,0.0,1.0
1,30.092,2.636,15.106,0.468,2.556,0.782,0.008,0.974,0.003,0.003,...,0.002,0.142,0.8,0.008,0.001,0.048,0.989,0.0,0.0,1.0


In [25]:
df.groupby('found_guilty').mean()

Unnamed: 0_level_0,age_at_incident,num_charges,sentence_years,years_until_charged,case_years,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,...,asian,biracial,black,hispanic,unknown_race,white,current_sentence,m_class,other_class,x_class
found_guilty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.173,1.118,7.83,0.433,1.24,0.852,0.019,0.969,0.007,0.001,...,0.005,0.18,0.72,0.008,0.002,0.085,0.989,0.0,0.0,1.0
1,30.092,2.636,15.106,0.468,2.556,0.782,0.008,0.974,0.003,0.003,...,0.002,0.142,0.8,0.008,0.001,0.048,0.989,0.0,0.0,1.0


In [26]:
df.groupby('found_guilty').sum()

Unnamed: 0_level_0,age_at_incident,num_charges,sentence_years,years_until_charged,case_years,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,...,asian,biracial,black,hispanic,unknown_race,white,current_sentence,m_class,other_class,x_class
found_guilty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,259786,9624,67416.438,3724.748,10672.562,7339.0,161.0,8346.0,57.0,7.0,...,43.0,1551.0,6202.0,72.0,14.0,728.0,8519.0,0.0,0.0,8610.0
1,47395,4152,23792.175,736.441,4026.34,1232.0,13.0,1534.0,4.0,5.0,...,3.0,223.0,1260.0,12.0,1.0,76.0,1558.0,0.0,0.0,1575.0


In [27]:
df.columns

Index(['offense_title', 'age_at_incident', 'num_charges', 'sentence_years',
       'years_until_charged', 'case_years', 'highest_charge',
       'amended/corrected_sentence_phase', 'original_sentence_phase',
       'probation_violation_sentence_phase', 'remanded_sentence_phase',
       'resentenced_sentence_phase', 'conditional_discharge_sentence',
       'conversion_sentence', 'boot_camp_sentence', 'death_sentence',
       'jail_sentence', 'prison_sentence', 'probation_sentence',
       'supervision_sentence', 'male', 'asian', 'biracial', 'black',
       'hispanic', 'unknown_race', 'white', 'current_sentence', 'found_guilty',
       'm_class', 'other_class', 'x_class'],
      dtype='object')

In [28]:
df['found_guilty'].value_counts()

0    8610
1    1575
Name: found_guilty, dtype: int64

In [29]:
continuous_columns = ['age_at_incident', 'num_charges', 'sentence_years', 'years_until_charged']

In [30]:
X_cont = X[continuous_columns]
X_cont = X_cont.reset_index()
X_cont.drop(['index'], axis = 1, inplace = True)

X_dummy = X.drop(continuous_columns, axis = 1)
X_dummy = X_dummy.reset_index()
X_dummy.drop(['index'], axis = 1, inplace = True)

X_train_cont = X_train[continuous_columns]
X_train_cont = X_train_cont.reset_index()
X_train_cont.drop(['index'], axis = 1, inplace = True)

X_train_dummy = X_train.drop(continuous_columns, axis = 1)
X_train_dummy = X_train_dummy.reset_index()
X_train_dummy.drop(['index'], axis = 1, inplace = True)

X_test_cont = X_test[continuous_columns]
X_test_cont = X_test_cont.reset_index()
X_test_cont.drop(['index'], axis = 1, inplace = True)

X_test_dummy = X_test.drop(continuous_columns, axis = 1)
X_test_dummy = X_test_dummy.reset_index()
X_test_dummy.drop(['index'], axis = 1, inplace = True)

In [31]:
scaler = MinMaxScaler()

X_train_cont_scaled = pd.DataFrame(scaler.fit_transform(X_train_cont))
X_train_cont_scaled = X_train_cont_scaled.reset_index()
X_train_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_train_cont_scaled.columns = continuous_columns

X_cont_scaled = pd.DataFrame(scaler.transform(X_cont))
X_cont_scaled = X_cont_scaled.reset_index()
X_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_cont_scaled.columns = continuous_columns

X_test_cont_scaled = pd.DataFrame(scaler.transform(X_test_cont))
X_test_cont_scaled = X_test_cont_scaled.reset_index()
X_test_cont_scaled.drop(['index'], axis = 1, inplace = True)
X_test_cont_scaled.columns = continuous_columns

X_scaled = pd.concat([X_cont_scaled, X_dummy], axis = 1)
X_train_scaled = pd.concat([X_train_cont_scaled, X_train_dummy], axis = 1)
X_test_scaled = pd.concat([X_test_cont_scaled, X_test_dummy], axis = 1)

In [32]:
pd.DataFrame(y_train)['found_guilty'].value_counts()

0    6876
1    1272
Name: found_guilty, dtype: int64

In [33]:
smote = imblearn.over_sampling.SMOTE(ratio = {0: 6876, 1: 6876}, random_state = 101)
X_train_scaled, y_train = smote.fit_sample(X_train_scaled, y_train)

X_train_scaled = pd.DataFrame(X_train_scaled).reset_index()
X_train_scaled.drop(['index'], axis = 1, inplace = True)
X_train_scaled.columns = list(continuous_columns) + list(X_train_dummy.columns)

In [34]:
print(X_train_scaled.shape)
print(X_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)

(13752, 25)
(2037, 25)
(13752,)
(2037,)


### **Standard Scaler**

First use a standard scaler to level all features in the model.

Create a dataframe to see scaled features.

In [35]:
df_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
df_scaled.head()

Unnamed: 0,age_at_incident,num_charges,sentence_years,years_until_charged,highest_charge,amended/corrected_sentence_phase,original_sentence_phase,probation_violation_sentence_phase,remanded_sentence_phase,resentenced_sentence_phase,...,prison_sentence,probation_sentence,supervision_sentence,male,asian,biracial,black,hispanic,white,current_sentence
0,0.286,0.0,0.018,0.003,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.079,0.0,0.091,0.004,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.286,0.055,0.004,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.175,0.143,0.055,0.002,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.54,0.0,0.036,0.009,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [38]:
pd.set_option('display.max_rows', None)
chi2_, pval = chi2(X_train_scaled, y_train)
p_val_df = pd.DataFrame(pval.tolist())
p_val_df.index = list(continuous_columns) + list(X_train_dummy.columns)
p_val_df.rename({0: 'p_val'}, axis = 1, inplace = True)
p_val_df = p_val_df.sort_values(by = 'p_val', ascending = False)
p_val_df

Unnamed: 0,p_val
current_sentence,0.988
years_until_charged,0.973
original_sentence_phase,0.654
jail_sentence,0.55
hispanic,0.446
age_at_incident,0.313
male,0.147
conversion_sentence,0.083
supervision_sentence,0.046
remanded_sentence_phase,0.014


In [39]:
pd.reset_option("display.max_rows")

In [40]:
high_p_val_drop = list(p_val_df[p_val_df['p_val'] > 0.05].index)
print(high_p_val_drop)

['current_sentence', 'years_until_charged', 'original_sentence_phase', 'jail_sentence', 'hispanic', 'age_at_incident', 'male', 'conversion_sentence']


In [41]:
X_scaled.drop(high_p_val_drop, axis = 1, inplace = True)
X_train_scaled.drop(high_p_val_drop, axis = 1, inplace = True)
X_test_scaled.drop(high_p_val_drop, axis = 1, inplace = True)

In [42]:
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 101)
param_grid = {'C': [10000, 100000, 1000000]}
#              'penalty': ['l1', 'l2']}
grid = GridSearchCV(LogisticRegression(random_state = 101), param_grid = param_grid, cv = cv, scoring = 'roc_auc', refit = True)
grid.fit(X_train_scaled, y_train)

train_predictions = grid.predict(X_train_scaled)
test_predictions = grid.predict(X_test_scaled)
print(grid.cv_results_['mean_train_score'].mean())
print(grid.cv_results_['mean_test_score'].mean())
print('')
print(grid.best_params_)

0.8043214101429766
0.8034070500422849

{'C': 1000000}


In [43]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.738 

              precision    recall  f1-score   support

           0       0.69      0.86      0.77      6876
           1       0.81      0.62      0.70      6876

   micro avg       0.74      0.74      0.74     13752
   macro avg       0.75      0.74      0.73     13752
weighted avg       0.75      0.74      0.73     13752

[[5889  987]
 [2617 4259]]


0.744 

              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1734
           1       0.45      0.62      0.52       303

   micro avg       0.83      0.83      0.83      2037
   macro avg       0.69      0.74      0.71      2037
weighted avg       0.86      0.83      0.84      2037

[[1510  224]
 [ 116  187]]


In [44]:
logreg = LogisticRegression(C = 1000000)
logreg.fit(X_train_scaled, y_train)

train_predictions = logreg.predict(X_train_scaled)
test_predictions = logreg.predict(X_test_scaled)

In [45]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.738 

              precision    recall  f1-score   support

           0       0.69      0.86      0.77      6876
           1       0.81      0.62      0.70      6876

   micro avg       0.74      0.74      0.74     13752
   macro avg       0.75      0.74      0.73     13752
weighted avg       0.75      0.74      0.73     13752

[[5889  987]
 [2617 4259]]


0.744 

              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1734
           1       0.45      0.62      0.52       303

   micro avg       0.83      0.83      0.83      2037
   macro avg       0.69      0.74      0.71      2037
weighted avg       0.86      0.83      0.84      2037

[[1510  224]
 [ 116  187]]


In [46]:
pd.set_option('display.max_rows', None)

lr_coef_df = pd.DataFrame(np.e**pd.DataFrame(logreg.coef_.tolist()[0]))
lr_coef_df.index = list(X_train_scaled.columns)
lr_coef_df.rename({0: 'coef_'}, axis = 1, inplace = True)
lr_coef_df = lr_coef_df.sort_values(by = 'coef_', ascending = False)
lr_coef_df

Unnamed: 0,coef_
num_charges,9506259781235598.0
sentence_years,482.872
remanded_sentence_phase,3.543
resentenced_sentence_phase,2.199
black,2.163
biracial,1.564
death_sentence,1.0
white,0.932
probation_sentence,0.923
amended/corrected_sentence_phase,0.896


In [47]:
pd.reset_option("display.max_rows")

In [50]:
svc = LinearSVC()
svc.fit(X_train_scaled, y_train)

train_predictions = logreg.predict(X_train_scaled)
test_predictions = logreg.predict(X_test_scaled)

In [51]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.738 

              precision    recall  f1-score   support

           0       0.69      0.86      0.77      6876
           1       0.81      0.62      0.70      6876

   micro avg       0.74      0.74      0.74     13752
   macro avg       0.75      0.74      0.73     13752
weighted avg       0.75      0.74      0.73     13752

[[5889  987]
 [2617 4259]]


0.744 

              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1734
           1       0.45      0.62      0.52       303

   micro avg       0.83      0.83      0.83      2037
   macro avg       0.69      0.74      0.71      2037
weighted avg       0.86      0.83      0.84      2037

[[1510  224]
 [ 116  187]]


In [53]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 101)
param_grid = {'n_estimators': [400, 500, 600, 700, 800, 900], 'max_depth': np.arange(3, 12), 'max_features': np.arange(3, 9), 'min_samples_split': np.arange(3, 7), 'min_samples_leaf': np.arange(2, 7)}
rand = RandomizedSearchCV(RandomForestClassifier(random_state = 101), param_distributions = param_grid, cv = cv, scoring = 'roc_auc', refit = True, random_state = 101)
rand.fit(X_train_scaled, y_train)

train_predictions = rand.predict(X_train_scaled)
test_predictions = rand.predict(X_test_scaled)
print(rand.cv_results_['mean_train_score'].mean())
print(rand.cv_results_['mean_test_score'].mean())
print('')
print(rand.best_params_)

0.8732274681197805
0.8663850526911634

{'n_estimators': 600, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 8, 'max_depth': 10}


In [54]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(metrics.accuracy_score(y_test, test_predictions).round(3), '\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.817 

              precision    recall  f1-score   support

           0       0.76      0.94      0.84      6876
           1       0.92      0.70      0.79      6876

   micro avg       0.82      0.82      0.82     13752
   macro avg       0.84      0.82      0.81     13752
weighted avg       0.84      0.82      0.81     13752

[[6447  429]
 [2084 4792]]


0.871 

0.724 

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1734
           1       0.57      0.51      0.54       303

   micro avg       0.87      0.87      0.87      2037
   macro avg       0.75      0.72      0.73      2037
weighted avg       0.87      0.87      0.87      2037

[[1618  116]
 [ 147  156]]


In [55]:
rfc = RandomForestClassifier(n_estimators = 700, min_samples_split = 4, min_samples_leaf = 3, max_features = 8, max_depth = 10)
rfc.fit(X_train_scaled, y_train)

train_predictions = rfc.predict(X_train_scaled)
test_predictions = rfc.predict(X_test_scaled)

In [56]:
print(roc_auc_score(y_train, train_predictions).round(3), '\n')
print(metrics.classification_report(y_train, train_predictions))
print(metrics.confusion_matrix(y_train, train_predictions))
print('\n')
print(roc_auc_score(y_test, test_predictions).round(3), '\n')
print(metrics.classification_report(y_test, test_predictions))
print(metrics.confusion_matrix(y_test, test_predictions))

0.817 

              precision    recall  f1-score   support

           0       0.76      0.94      0.84      6876
           1       0.92      0.70      0.79      6876

   micro avg       0.82      0.82      0.82     13752
   macro avg       0.84      0.82      0.81     13752
weighted avg       0.84      0.82      0.81     13752

[[6440  436]
 [2082 4794]]


0.725 

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      1734
           1       0.57      0.52      0.54       303

   micro avg       0.87      0.87      0.87      2037
   macro avg       0.74      0.73      0.73      2037
weighted avg       0.87      0.87      0.87      2037

[[1616  118]
 [ 146  157]]


In [57]:
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# for f in range(X_train_scaled.shape[1]):
for i, feature in enumerate(X_train_scaled.columns):    
    print(str(i + 1) + '. ' + feature + ' ' + str(importances[indices[i]]))

Feature ranking:
1. num_charges 0.6079660780333372
2. sentence_years 0.3126110218460467
3. highest_charge 0.021068556125624756
4. amended/corrected_sentence_phase 0.011794970873354642
5. probation_violation_sentence_phase 0.010511973596308538
6. remanded_sentence_phase 0.009492657957764009
7. resentenced_sentence_phase 0.0077945987018352356
8. conditional_discharge_sentence 0.006105542600379988
9. boot_camp_sentence 0.005913454157694601
10. death_sentence 0.0032850780356657055
11. prison_sentence 0.0021848463346112543
12. probation_sentence 0.0008039157147086342
13. supervision_sentence 0.00036420302484310187
14. asian 5.618534698548865e-05
15. biracial 3.448607594445614e-05
16. black 1.2431574896320417e-05
17. white 0.0
