In [272]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
import sklearn as sklearn

In [273]:
data_simple = pd.read_csv('data_simple.csv', encoding = "ISO-8859-1", low_memory=False, index_col=0)

# CLEAN DATA

In [274]:
#SHOULD WE BE USING ADD_DATE instead of ARREST_DATE? need to check # of missing values for ADD_DATE
rearrest = data_simple[['BOFI_NBR','ARREST_DATE']].copy()

In [275]:
rearrest.count()

BOFI_NBR       280293
ARREST_DATE    150979
dtype: int64

In [276]:
#Drop rows with missing values
rearrest.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print('%s rows' %(rearrest.shape[0]))

150979 rows


In [277]:
#Drop if arrest date is before 1988
rearrest = rearrest[rearrest['ARREST_DATE']>=19880101]

In [278]:
#Drop if BOFI_NBR =/ or 00000
rearrest = rearrest[rearrest['BOFI_NBR']!='/']
rearrest = rearrest[rearrest['BOFI_NBR']!='00000']

In [279]:
rearrest.count()

BOFI_NBR       147834
ARREST_DATE    147834
dtype: int64

In [280]:
#drop duplicates 
rearrest=rearrest.drop_duplicates()

In [281]:
rearrest.count()

BOFI_NBR       129845
ARREST_DATE    129845
dtype: int64

In [282]:
#add column with # of arrests for each BOFI_NBR
rearrest['ARREST_CNT'] = rearrest.groupby('BOFI_NBR')['ARREST_DATE'].transform('count')

In [283]:
#FIRST_ARREST = 0 as flag for first arrest
rearrest.sort_values(by = ['BOFI_NBR','ARREST_DATE'], inplace=True)
rearrest['FIRST_ARREST'] = rearrest.groupby('BOFI_NBR').cumcount()

In [284]:
#add flag for whether BOFI_NBR has rearrest
#using & for elementwise operation
rearrest['REARREST_FLAG'] = np.where((rearrest['ARREST_CNT']>1) & \
                                 (rearrest['ARREST_CNT']-1 != rearrest['FIRST_ARREST']),1,0)

In [285]:
rearrest[:20]

Unnamed: 0,BOFI_NBR,ARREST_DATE,ARREST_CNT,FIRST_ARREST,REARREST_FLAG
79479,13348,19900309.0,1.0,0,0
126493,20492,19920204.0,1.0,0,0
89690,100043,19900919.0,1.0,0,0
255206,100126,19980602.0,1.0,0,0
99339,100323,19910123.0,1.0,0,0
110723,100332,19910618.0,2.0,0,1
110724,100332,19940611.0,2.0,1,0
198075,100387,19950515.0,1.0,0,0
236667,100408,19970603.0,1.0,0,0
67470,100414,19920213.0,1.0,0,0


In [286]:
data_merged = pd.merge(data_simple, \
                 rearrest, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='inner')

In [287]:
data_merged.shape

(147834, 36)

In [288]:
#identify arrests where at least one charge was accepted
accepted = data[data['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

In [289]:
#drop rows where at least one charge was accepted during that arrest
data_not_charged = pd.merge(data, \
                 accepted, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='outer',\
                 indicator = True)
data_not_charged = data_not_charged[data_not_charged['_merge']=='left_only'].drop('_merge', axis=1)

In [290]:
data_not_charged.shape

(119815, 9)

In [291]:
data_not_charged.count()

BOFI_NBR               119815
ARREST_DATE            119815
ARREST_CNT             119815
FIRST_ARREST           119815
REARREST_FLAG          119815
SCREENING_DISP_CODE    119815
CHARGE_CLASS           119815
CHARGE_TYPE            119815
LEAD_CHARGE_CODE       119815
dtype: int64

In [292]:
#Drop rows with missing values
data_not_charged.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print('%s rows' %(data_not_charged.shape[0]))

119815 rows


In [293]:
'''
group by BOFI_NBR, ARREST_DATE
for arrests with multiple values for CHARGE_CLASS, CHARGE_TYPE, and LEAD_CHARGE, collapse into string with multiple values
'''
#combine into one column. this is fine because we will eventually do one hot encoding
data_not_charged['CHARGES'] = data_not_charged['CHARGE_CLASS'].map(str).str.strip() + ',' + data_not_charged['CHARGE_TYPE'].str.strip() + \
                            ',' +data_not_charged['LEAD_CHARGE_CODE'].str.strip()
#aggregate multiple charges for each arrest
data_not_charged['CHARGES'] = data_not_charged[['BOFI_NBR','ARREST_DATE','CHARGES']].groupby(['BOFI_NBR','ARREST_DATE'])['CHARGES'].transform(lambda x: ','.join(x))

#drop dupes
data_not_charged=data_not_charged.drop_duplicates(['BOFI_NBR','ARREST_DATE','CHARGES'])

In [294]:
data_not_charged[:10]

Unnamed: 0,BOFI_NBR,ARREST_DATE,ARREST_CNT,FIRST_ARREST,REARREST_FLAG,SCREENING_DISP_CODE,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE,CHARGES
0,232589,19910126.0,2.0,0,1,240.0,3.0,AR,14:(27) 62,"3.0,AR,14:(27) 62"
1,232589,19930330.0,2.0,1,0,240.0,3.0,AR,14:62,"3.0,AR,14:62"
2,228992,19890512.0,5.0,0,1,240.0,2.0,AR,40:967 (C) (2),"2.0,AR,40:967 (C) (2)"
3,228992,19901018.0,5.0,1,1,260.0,3.0,AR,14:67 (A),"3.0,AR,14:67 (A)"
4,228992,19910620.0,5.0,2,1,240.0,4.0,AR,14:67 (C),"4.0,AR,14:67 (C)"
6,228992,19990913.0,5.0,4,0,240.0,4.0,AR,14:63,"4.0,AR,14:63"
9,172592,19900612.0,2.0,1,0,240.0,3.0,AR,14:69 (A),"3.0,AR,14:69 (A)"
10,239336,19970517.0,1.0,0,0,240.0,4.0,AR,14:220.1,"4.0,AR,14:220.1"
11,312136,19901230.0,1.0,0,0,240.0,2.0,AR,14:(27) 30,"2.0,AR,14:(27) 30"
17,206138,19970130.0,1.0,0,0,240.0,3.0,AR,14:72,"3.0,AR,14:72"


In [295]:
data_not_charged.shape

(108789, 10)

In [296]:
#keep observations within 1988-1996 timeframe
data_not_charged = data_not_charged[data_not_charged['ARREST_DATE']<=19961231]
data_not_charged.shape

(71454, 10)

# ONE HOT ENCODING

In [297]:
#class is not imbalanced
data_not_charged.groupby('REARREST_FLAG')['REARREST_FLAG'].count()

REARREST_FLAG
0    41883
1    29571
Name: REARREST_FLAG, dtype: int64

In [298]:
X = data_not_charged['CHARGES']
y = data_not_charged['REARREST_FLAG']


In [299]:
from sklearn.feature_extraction.text import CountVectorizer
c_vect = CountVectorizer(binary=True,tokenizer=lambda x: x.split(','))

X_vect = c_vect.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2)

X_train=X_train.toarray()
X_test=X_test.toarray()

In [300]:
X_train.shape

(57163, 526)

In [301]:
y_train.shape

(57163,)

# DECISION TREE

In [302]:
'''
Decision tree with entropy criterion
'''

dt = DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train,y_train)

pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)


In [303]:
#accuracy

accu_train = sklearn.metrics.accuracy_score(y_train,pred_train)
accu_test = sklearn.metrics.accuracy_score(y_test,pred_test)

print("Accuracy on Training Dataset: {}".format(accu_train))
print("Accuracy on Test Dataset: {}".format(accu_test))

Accuracy on Training Dataset: 0.6060213774644437
Accuracy on Test Dataset: 0.5854034007417256


In [304]:
precision = sklearn.metrics.precision_score(y_test,pred_test)
recall = sklearn.metrics.recall_score(y_test,pred_test)

print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

Precision: 0.5015060240963856
Recall: 0.16843702579666162


In [305]:
#confusion matrix for test set
cm = sklearn.metrics.confusion_matrix(y_test,pred_test)
cm

array([[7367,  993],
       [4932,  999]])