In [209]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
#newer versions of scikit learn need to import from model_selection
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older scikit-learn releases that lack model_selection.
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics
import sklearn as sklearn

In [210]:
# Load the simplified extract; the first CSV column becomes the DataFrame index.
data_simple = pd.read_csv(
    'data_simple.csv',
    index_col=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

In [211]:
# Keep the disposition code plus the three charge descriptors.
# charge_cat is deliberately left out: it has only 50,555 non-missing values.
# .copy() gives an independent frame, so later edits do not touch data_simple.
data = data_simple[[
    'SCREENING_DISP_CODE',
    'CHARGE_CLASS',
    'CHARGE_TYPE',
    'LEAD_CHARGE_CODE',
]].copy()

In [212]:
# Number of non-missing values in each column.
data.notnull().sum()

SCREENING_DISP_CODE    244779
CHARGE_CLASS           148545
CHARGE_TYPE            150448
LEAD_CHARGE_CODE       150453
dtype: int64

In [213]:
# Row count per screening disposition code (rows with a missing code are not grouped).
data.groupby('SCREENING_DISP_CODE')['SCREENING_DISP_CODE'].count()

SCREENING_DISP_CODE
17                          1
30                          2
50                         14
60                          3
120                         1
130                         2
140                         3
150                         1
160                         9
230                    109650
240                    100008
250                       161
260                     34912
280                         8
310                         1
320                         1
560                         2
Name: SCREENING_DISP_CODE, dtype: int64

In [214]:
# Non-missing counts of the other columns within each CHARGE_CLASS value.
data.groupby('CHARGE_CLASS').count()

Unnamed: 0_level_0,SCREENING_DISP_CODE,CHARGE_TYPE,LEAD_CHARGE_CODE
CHARGE_CLASS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,996,1016,1016
2,22710,23051,23053
3,59445,60736,60737
4,57690,58637,58637
5,27,31,31
6,4981,5054,5055
7,15,15,15
8,1,1,1


In [215]:
# Non-missing counts of the other columns within each CHARGE_TYPE value.
data.groupby('CHARGE_TYPE').count()

Unnamed: 0_level_0,SCREENING_DISP_CODE,CHARGE_CLASS,LEAD_CHARGE_CODE
CHARGE_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AR,146299,146747,148605
IF,1396,1730,1779
IN,47,64,64


In [216]:
# Number of distinct LEAD_CHARGE_CODE values.
# unique() reports a missing value as one of the distinct entries.
len(data['LEAD_CHARGE_CODE'].unique())

796

In [217]:
# Drop every row that has a missing value in any of the selected columns.
# Only the defaults are needed (axis=0, how='any', all columns). Passing both
# how= and thresh= explicitly is rejected with a TypeError by recent pandas
# releases, so the redundant keyword arguments are left out.
# The drop stays in place so `data` keeps referring to the same frame.
data.dropna(inplace=True)
print('%s rows' %(data.shape[0]))

145862 rows


In [218]:
# Create the binary label column `y`:
#   SCREENING_DISP_CODE == 230 -> accepted     -> 1
#   any other code             -> not accepted -> 0
data['y'] = (data['SCREENING_DISP_CODE'] == 230).astype(int)
# Optional sanity check: data.groupby(['SCREENING_DISP_CODE'])['y'].sum()

In [219]:
# Create features using one hot encoding.
#
# Step 1: map each categorical column to integer codes (the one hot encoder
#         is fed integer input here).
# Step 2: one hot encode the three integer columns together.

# One fitted LabelEncoder per column. Re-fitting a single shared encoder would
# overwrite its classes_ each time, leaving only the last column's mapping
# available for inverse_transform or for encoding new data.
label_encoders = {
    column: LabelEncoder().fit(data[column])
    for column in ('CHARGE_CLASS', 'CHARGE_TYPE', 'LEAD_CHARGE_CODE')
}

col1 = label_encoders['CHARGE_CLASS'].transform(data['CHARGE_CLASS'])
col2 = label_encoders['CHARGE_TYPE'].transform(data['CHARGE_TYPE'])
col3 = label_encoders['LEAD_CHARGE_CODE'].transform(data['LEAD_CHARGE_CODE'])

# Backward compatibility: the shared encoder used previously ended up fitted on
# LEAD_CHARGE_CODE, so `l_enc` keeps pointing at that column's encoder.
l_enc = label_encoders['LEAD_CHARGE_CODE']

# Integer code matrix, one column per feature, in the same row order as `data`.
X = np.column_stack((col1,col2,col3))

#one hot encoding
enc = OneHotEncoder()
X_enc = enc.fit_transform(X)

In [220]:
# Split into training and validation data (20% held out).
#
# TODO: do we need to add a test set as well? Right now there is none because
#       we don't care about generalization of the baseline model.
# The split is random and does not take year into account.

y = data['y']

X_train, X_test, y_train, y_test = train_test_split(
    X_enc,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed: the same rows are held out on every run
    stratify=y,       # keep the accepted / not-accepted ratio equal in both parts
)

In [221]:
def _as_dense(matrix):
    """Return `matrix` as a dense numpy array.

    Objects exposing ``toarray()`` (e.g. scipy sparse matrices) are expanded
    with it; anything else goes through ``np.asarray``. This makes the cell
    safe to re-run: a second execution sees an ndarray and leaves it as is
    instead of failing with AttributeError on ``.toarray()``.
    """
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


# Convert the one-hot matrices to dense arrays for the decision tree cell.
# NOTE(review): scikit-learn documents sparse-matrix support for tree
# estimators, so this conversion may be unnecessary (and memory hungry) --
# confirm against the installed version before removing it.
X_train = _as_dense(X_train)
X_test = _as_dense(X_test)

In [222]:
# Decision tree with entropy criterion (baseline model).
# random_state is fixed because the tree permutes features while searching for
# splits; without a seed, ties between equally good splits can be resolved
# differently from run to run and the reported scores drift.
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt.fit(X_train, y_train)

# Predictions on the rows the tree was fitted on and on the held-out rows.
pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)


In [223]:
# Accuracy on the training rows and on the held-out rows.
# Uses the explicitly imported `metrics` module: reaching it as an attribute of
# the bare `sklearn` package only works when some other import happens to have
# loaded the submodule already.
accu_train = metrics.accuracy_score(y_train, pred_train)
accu_test = metrics.accuracy_score(y_test, pred_test)

print("Accuracy on Training Dataset: {}".format(accu_train))
print("Accuracy on Test Dataset: {}".format(accu_test))

Accuracy on Training Dataset: 0.862592018099
Accuracy on Test Dataset: 0.860178932575


In [224]:
# Precision and recall on the held-out rows for the positive class
# (label 1 = accepted).
# Uses the explicitly imported `metrics` module instead of relying on
# `sklearn.metrics` having been loaded as a side effect of another import.
precision = metrics.precision_score(y_test, pred_test)
recall = metrics.recall_score(y_test, pred_test)

print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

Precision: 0.78293135436
Recall: 0.0962591240876


In [225]:
# Confusion matrix for the test set: rows are the true labels, columns the
# predicted labels, both in sorted label order (0 = not accepted, 1 = accepted).
# Uses the explicitly imported `metrics` module instead of relying on
# `sklearn.metrics` having been loaded as a side effect of another import.
cm = metrics.confusion_matrix(y_test, pred_test)
cm

array([[24672,   117],
       [ 3962,   422]])