In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import project_env as pe
import sklearn as sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [64]:
# Load the pre-split training and validation sets from the working directory.
train, val = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv'))

# Target Variable

In [65]:
# Build the feature frame and the target (rearrest within 2 years) for each
# split with the project helper; both splits use the same 2-year window.
(X_train, y_train), (X_val, y_val) = (
    pe.create_target(split, years=2) for split in (train, val)
)

In [66]:
def rearrest_rate(labels):
    """Return the fraction of ``labels`` that equal the positive class (1).

    The positive class is matched by value rather than by its position in
    ``np.unique`` output, so a split that happens to contain a single class
    yields 0.0 or 1.0 instead of raising ``IndexError``.

    Parameters
    ----------
    labels : array-like
        Target labels; 1 marks a rearrest (the same positive label the
        precision/recall cells rely on through their default ``pos_label``).

    Returns
    -------
    float
        Share of positive labels, between 0 and 1.

    Raises
    ------
    ValueError
        If ``labels`` is empty, because the rate is undefined.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError('cannot compute a rearrest rate from an empty label set')
    return np.mean(labels == 1)


# Class balance of each split. The second line describes the validation split,
# so it is labelled "Val" to match the accuracy printout further down.
print('Training: %s percent were rearrested.' % (rearrest_rate(y_train) * 100))
print('Val: %s percent were rearrested.' % (rearrest_rate(y_val) * 100))


Training: 55.6245941507 percent were rearrested.
Test: 55.273745862 percent were rearrested.


In [67]:
# Columns that are not model inputs.
non_feature_columns = ['BOFI_NBR', 'SCREENING_DISP_CODE', 'UNIQUE_ID', 'NEXT_ARREST_TIME']
# The raw arrest dates are removed as well; year and month are used as the
# predictive variables instead (presumably already present as separate
# columns -- confirm against pe.create_target).
raw_date_columns = ['ARREST_DATE', 'ARREST_DATE_y']

# Apply the identical column removal to both splits so their schemas match.
X_train = X_train.drop(non_feature_columns + raw_date_columns, axis=1)
X_val = X_val.drop(non_feature_columns + raw_date_columns, axis=1)

# Gradient Boosted Trees

In [68]:
# Fit a gradient-boosted tree classifier with default hyperparameters.
# random_state is pinned so that Restart & Run All rebuilds the same model;
# left unset, the estimator draws from NumPy's global random state.
# fit() returns the estimator itself, so construction and fitting are chained.
gbt = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions on both splits, consumed by the metric cells below.
gbt_pred_train = gbt.predict(X_train)
gbt_pred_val = gbt.predict(X_val)

In [69]:
# Accuracy of the gradient-boosted model on the training and validation splits.
gbt_accu_train, gbt_accu_test = (
    sklearn.metrics.accuracy_score(truth, predicted)
    for truth, predicted in [(y_train, gbt_pred_train), (y_val, gbt_pred_val)]
)

print("Accuracy on Training Dataset: {}".format(gbt_accu_train))
print("Accuracy on Val Dataset: {}".format(gbt_accu_test))

Accuracy on Training Dataset: 0.7559174422898178
Accuracy on Val Dataset: 0.7538579067990833


In [70]:
# Precision, recall and F1 of the gradient-boosted model on the validation
# split, for the positive class. One call returns all three; the trailing
# support value is None when average='binary' and is discarded.
gbt_precision, gbt_recall, gbt_f_score, _ = sklearn.metrics.precision_recall_fscore_support(
    y_val, gbt_pred_val, average='binary'
)

print("Precision: {}".format(gbt_precision))
print("Recall: {}".format(gbt_recall))
print("F-Score: {}".format(gbt_f_score))

Precision: 0.7495854063018242
Recall: 0.8329494149083203
F-Score: 0.7890717060184175


In [9]:
# Confusion matrix of the gradient-boosted model on the validation split
# (rows follow the true labels, columns the predicted labels).
gbt_cm = sklearn.metrics.confusion_matrix(y_true=y_val, y_pred=gbt_pred_val)
# TODO: export to LaTeX? gbt_cm is a NumPy array, so it would need wrapping in
# a DataFrame before .to_latex() is available.
gbt_cm

array([[5761, 3021],
       [1843, 9010]])

# Random Forest

In [10]:
# Fit a random forest with default hyperparameters.
# random_state is pinned because the forest bootstraps rows and samples
# features at random; without a seed every run yields a different model and
# the recorded metrics cannot be reproduced.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions on both splits, consumed by the metric cells below.
rf_pred_train = rf_clf.predict(X_train)
rf_pred_val = rf_clf.predict(X_val)

In [11]:
# Accuracy of the random forest on the training and validation splits.
rf_accu_train, rf_accu_test = (
    sklearn.metrics.accuracy_score(truth, predicted)
    for truth, predicted in [(y_train, rf_pred_train), (y_val, rf_pred_val)]
)

print("Accuracy on Training Dataset: {}".format(rf_accu_train))
print("Accuracy on Val Dataset: {}".format(rf_accu_test))

Accuracy on Training Dataset: 0.9840970727918613
Accuracy on Val Dataset: 0.7160682454800101


In [12]:
# Precision, recall and F1 of the random forest on the validation split, for
# the positive class. One call returns all three; the trailing support value
# is None when average='binary' and is discarded.
rf_precision, rf_recall, rf_f_score, _ = sklearn.metrics.precision_recall_fscore_support(
    y_val, rf_pred_val, average='binary'
)

print("Precision: {}".format(rf_precision))
print("Recall: {}".format(rf_recall))
print("F-Score: {}".format(rf_f_score))

Precision: 0.7482596425211665
Recall: 0.7328849166129181
F-Score: 0.7404924824279663


In [13]:
# Confusion matrix of the random forest on the validation split
# (rows follow the true labels, columns the predicted labels).
rf_cm = sklearn.metrics.confusion_matrix(y_true=y_val, y_pred=rf_pred_val)
# TODO: export to LaTeX? rf_cm is a NumPy array, so it would need wrapping in
# a DataFrame before .to_latex() is available.
rf_cm

array([[6106, 2676],
       [2899, 7954]])