In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
import project_env as pe
import sklearn as sklearn
import math
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from itertools import product
import error_analysis as ea
from sklearn.metrics import precision_recall_curve
%matplotlib inline

In [2]:
# Read the three pre-split CSVs (train / validation / test) from data_processing/output/.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_processing/output/train.csv',
        'data_processing/output/val.csv',
        'data_processing/output/test.csv',
    )
)

# Target Variable

In [3]:
# Split every dataset into a feature frame and a target via pe.create_target.
# The target is re-arrest within 5 years (years=5).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    pe.create_target(split, years=5) for split in (train, val, test)
)

In [4]:
# Report the share of re-arrested individuals in the training and validation targets.
# np.unique returns the distinct labels in sorted order together with their counts,
# so counts[1] is the count of the larger label
# (presumably 1 = re-arrested -- confirm against pe.create_target).
# The second split is y_val, so it is labelled "Validation"; the held-out test
# split is not inspected here.
for split_name, labels in (('Training', y_train), ('Validation', y_val)):
    unique, counts = np.unique(labels, return_counts=True)
    percent = counts[1] / counts.sum()
    print('%s: %s percent were rearrested.' % (split_name, percent * 100))


Training: 51.9046644379 percent were rearrested.
Test: 51.8282699654 percent were rearrested.


In [5]:
# Reduce the train and validation frames to model features only.
# NOTE: this cell reassigns X_train / X_val, so running it a second time raises
# KeyError because the listed columns are already gone.

# Identifier / bookkeeping columns that are not features.
non_feature_cols = ['BOFI_NBR', 'SCREENING_DISP_CODE', 'UNIQUE_ID', 'NEXT_ARREST_TIME']
# Raw arrest dates are dropped because year and month are used as the
# predictive variables instead.
raw_date_cols = ['ARREST_DATE', 'ARREST_DATE_y']

X_train = X_train.drop(non_feature_cols + raw_date_cols, axis=1)
X_val = X_val.drop(non_feature_cols + raw_date_cols, axis=1)

In [6]:
# Reduce the test frame to model features only: drop the identifier / bookkeeping
# columns and the raw arrest dates (year and month are used as the predictive
# variables instead).
X_test = X_test.drop(
    [
        'BOFI_NBR', 'SCREENING_DISP_CODE', 'UNIQUE_ID', 'NEXT_ARREST_TIME',
        'ARREST_DATE', 'ARREST_DATE_y',
    ],
    axis=1,
)

# Gradient Boosted Trees

### Baseline Model (Manually Chosen Hyperparameters)

In [7]:
# Fit a gradient-boosted tree classifier with manually chosen hyperparameters.
# random_state is pinned so that Restart & Run All rebuilds the same model;
# without it the fitted trees (and every metric derived from them) can differ
# from run to run. The seed value itself is arbitrary.
gbt = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    random_state=0,
)
gbt.fit(X_train, y_train)

# Hard class-label predictions on both splits, reused by the metric cells.
gbt_pred_train = gbt.predict(X_train)
gbt_pred_val = gbt.predict(X_val)

In [8]:
# Accuracy of the fitted model on the training and validation splits.
# NOTE: gbt_accu_test holds the *validation* accuracy despite its name; the name
# is kept as-is in case other cells refer to it.
gbt_accu_train, gbt_accu_test = (
    sklearn.metrics.accuracy_score(y_true, y_pred)
    for y_true, y_pred in ((y_train, gbt_pred_train), (y_val, gbt_pred_val))
)

print("Accuracy on Training Dataset: {}".format(gbt_accu_train))
print("Accuracy on Val Dataset: {}".format(gbt_accu_test))

Accuracy on Training Dataset: 0.7844943258056446
Accuracy on Val Dataset: 0.758322721484446


In [9]:
# Precision, recall and F1 on the validation split, computed in a single call.
# average='binary' matches the default used by precision_score / recall_score /
# f1_score; the fourth returned element (support) is not needed here.
gbt_precision, gbt_recall, gbt_f_score = sklearn.metrics.precision_recall_fscore_support(
    y_val, gbt_pred_val, average='binary'
)[:3]

print("Precision: {}".format(gbt_precision))
print("Recall: {}".format(gbt_recall))
print("F-Score: {}".format(gbt_f_score))

Precision: 0.7621099810377521
Recall: 0.7758862758862759
F-Score: 0.7689364292547178


In [10]:
# Validation-set confusion matrix (sklearn convention: rows are true labels,
# columns are predicted labels).
gbt_cm = sklearn.metrics.confusion_matrix(y_true=y_val, y_pred=gbt_pred_val)
# Open question from the original author: also export this matrix as LaTeX?
gbt_cm

array([[3916, 1380],
       [1277, 4421]])

#### Best Model's Predictions and Feature Importance

In [12]:
# Score the validation split with the fitted model.
# Hard class labels:
predictions = gbt.predict(X_val)
# Per-class probabilities, one column per class:
predicted_prob = gbt.predict_proba(X_val)

In [13]:
# Pair every feature name with its importance from the fitted model and print
# the five most important features as a LaTeX table.
feature_imp = pd.DataFrame(
    list(zip(X_train.columns, gbt.feature_importances_)),
    columns=['Column Name', 'Importance'],
)

# DataFrame.sort() is deprecated and has been removed from pandas;
# sort_values() is the supported replacement and yields the same ordering.
top_features = feature_imp.sort_values(by='Importance', ascending=False).head(5)
print(top_features.to_latex())

\begin{tabular}{llr}
\toprule
{} &       Column Name &  Importance \\
\midrule
9  &               AGE &    0.227318 \\
0  &    BAR\_ADMIT\_DAYS &    0.174524 \\
12 &  ARREST\_TO\_SCREEN &    0.122600 \\
8  &    SCREENING\_DAYS &    0.078135 \\
1  &     CRIMINAL\_FLAG &    0.054159 \\
\bottomrule
\end{tabular}



  app.launch_new_instance()


### Confusion Matrix and Risk CSV

In [24]:
# Rebuild the validation frame so the identifier columns (dropped from X_val
# before modelling) are available again.
# Assumes pe.create_target returns rows in the same order as when X_val was
# built, so that predictions line up positionally -- TODO confirm.
X_val_id, y_val_id = pe.create_target(val, years=5)
id_cols = X_val_id[['BOFI_NBR', 'UNIQUE_ID']]

# Risk score = second column of predict_proba (presumably the re-arrest class).
val_risk = id_cols.assign(Risk=predicted_prob[:, 1])
val_risk.to_csv('results/val_risk.csv')

# Predicted vs. true label per individual, for error analysis.
val_pred = id_cols.assign(pred_y=predictions, true_y=y_val_id)
val_pred.to_csv('results/val_pred.csv')
