In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
#newer versions of scikit learn need to import from model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
import sklearn as sklearn

In [28]:
data_simple = pd.read_csv('data_simple.csv', encoding = "ISO-8859-1", low_memory=False, index_col=0)

In [29]:
#did not use charge_cat because only 50,555 non missing values.
#Drop if arrest date is before 1988
data_simple = data_simple[data_simple['ARREST_DATE']>=19880101]

#Drop if BOFI_NBR =/ or 00000
data_simple = data_simple[data_simple['BOFI_NBR']!='/']
data_simple = data_simple[data_simple['BOFI_NBR']!='00000']

data = data_simple[['SCREENING_DISP_CODE','CHARGE_CLASS','CHARGE_TYPE','LEAD_CHARGE_CODE']].copy()

In [30]:
data.count()

SCREENING_DISP_CODE    145437
CHARGE_CLASS           145501
CHARGE_TYPE            147373
LEAD_CHARGE_CODE       147377
dtype: int64

In [31]:
data.groupby(['SCREENING_DISP_CODE'])['SCREENING_DISP_CODE'].count()

SCREENING_DISP_CODE
140.0        2
160.0        1
230.0    20711
240.0    90495
250.0      117
260.0    34108
280.0        2
310.0        1
Name: SCREENING_DISP_CODE, dtype: int64

In [32]:
data.groupby(['CHARGE_CLASS']).count()

Unnamed: 0_level_0,SCREENING_DISP_CODE,CHARGE_TYPE,LEAD_CHARGE_CODE
CHARGE_CLASS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,855,862,862
2.0,21647,21948,21950
3.0,58600,59826,59827
4.0,57307,58125,58125
5.0,14,18,18
6.0,4699,4718,4718
8.0,1,1,1


In [33]:
data.groupby(['CHARGE_TYPE']).count()

Unnamed: 0_level_0,SCREENING_DISP_CODE,CHARGE_CLASS,LEAD_CHARGE_CODE
CHARGE_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR,143853,144274,146112
IF,1084,1172,1209
IN,43,52,52


In [34]:
data['LEAD_CHARGE_CODE'].unique().size

749

In [35]:
#Drop rows with missing values
data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print('%s rows' %(data.shape[0]))



143120 rows


In [36]:
data.groupby(['SCREENING_DISP_CODE']).count()

Unnamed: 0_level_0,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE
SCREENING_DISP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
140.0,2,2,2
160.0,1,1,1
230.0,20563,20563,20563
240.0,88809,88809,88809
250.0,117,117,117
260.0,33625,33625,33625
280.0,2,2,2
310.0,1,1,1


In [37]:
'''
create labels
SCREENING_DISP_CODE = 230 > accepted, all other > not accepted
accepted coded as 1
'''

data['y']=np.where(data['SCREENING_DISP_CODE']==230,1,0)
data.groupby(['y']).count()

Unnamed: 0_level_0,SCREENING_DISP_CODE,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,122557,122557,122557,122557
1,20563,20563,20563,20563


In [38]:
'''
create features using one hot encoding
'''

# first convert into integer values. one hot enconding only takes int input
l_enc = LabelEncoder()

col1 = l_enc.fit_transform(data['CHARGE_CLASS'])
col2 = l_enc.fit_transform(data['CHARGE_TYPE'])
col3 = l_enc.fit_transform(data['LEAD_CHARGE_CODE'])

X = np.column_stack((col1,col2,col3))

#one hot encoding
enc = OneHotEncoder()
X_enc = enc.fit_transform(X)

In [39]:
'''
split into training and val
DO WE NEED TO ADD TEST SET AS WELL? 
right now no test set because we don't care about generalization of baseline model

Split randomly right now, does not take year into account
'''

y = data['y']

X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2)

In [40]:
#convert to dense array, decision tree only takes dense array input
X_train = X_train.toarray()
X_test = X_test.toarray()

In [41]:
'''
Decision tree with entropy criterion
'''

dt = DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train,y_train)

pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)


In [42]:
#accuracy

accu_train = sklearn.metrics.accuracy_score(y_train,pred_train)
accu_test = sklearn.metrics.accuracy_score(y_test,pred_test)

print("Accuracy on Training Dataset: {}".format(accu_train))
print("Accuracy on Test Dataset: {}".format(accu_test))

Accuracy on Training Dataset: 0.8658730435997765
Accuracy on Test Dataset: 0.862178591391839


In [43]:
precision = sklearn.metrics.precision_score(y_test,pred_test)
recall = sklearn.metrics.recall_score(y_test,pred_test)

print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

Precision: 0.7184684684684685
Recall: 0.07707175646291375


In [44]:
#confusion matrix for test set
cm = sklearn.metrics.confusion_matrix(y_test,pred_test)
cm

array([[24360,   125],
       [ 3820,   319]])