In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
import sklearn as sklearn

In [4]:
data_simple = pd.read_csv('data_simple.csv', encoding = "ISO-8859-1", low_memory=False)

# CLEAN DATA

In [5]:
data_simple.columns

Index(['UNIQUE_ID', 'ADA_CODE', 'ADD_DATE', 'ARREST_CREDIT_CODE',
       'ARREST_DATE', 'BAR_ADMISSION', 'BOFI_NBR', 'CHARGE_CLASS',
       'CHARGE_TYPE', 'CRIMINAL_FLAG', 'DFDN_SEQ_NBR', 'DOB', 'FBI_NBR',
       'FINAL_DETENTION_FLAG', 'HABITUAL_OFFENDER_FLAG',
       'INITIAL_DETENTION_FLAG', 'JUVENILE_FLAG', 'LEAD_CHARGE_CODE', 'PARTY',
       'POLICE_RPT_DATE', 'POLICE_RPT_DAYS', 'RACE', 'SADA_DOB', 'SADA_RACE',
       'SADA_SEX', 'SCREENING_DAYS', 'SCREENING_DISP_CODE',
       'SCREENING_DISP_DATE', 'SEX', 'SYS_NBR'],
      dtype='object')

In [6]:
#SHOULD WE BE USING ADD_DATE instead of ARREST_DATE? need to check # of missing values for ADD_DATE
rearrest = data_simple[['BOFI_NBR','ARREST_DATE']].copy()

In [7]:
rearrest.count()

BOFI_NBR       280293
ARREST_DATE    280290
dtype: int64

In [8]:
#Drop rows with missing values
rearrest.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print('%s rows' %(rearrest.shape[0]))

280289 rows


In [9]:
#Drop if arrest date is before 1988
rearrest = rearrest[rearrest['ARREST_DATE']>=19880101]

In [10]:
#Drop if BOFI_NBR =/ or 00000
rearrest = rearrest[rearrest['BOFI_NBR']!='/']
rearrest = rearrest[rearrest['BOFI_NBR']!='00000']

In [11]:
rearrest.count()

BOFI_NBR       269512
ARREST_DATE    269512
dtype: int64

In [12]:
#drop duplicates 
rearrest=rearrest.drop_duplicates()

In [13]:
rearrest.count()

BOFI_NBR       222464
ARREST_DATE    222464
dtype: int64

In [14]:
#add column with # of arrests for each BOFI_NBR
rearrest['ARREST_CNT'] = rearrest.groupby('BOFI_NBR')['ARREST_DATE'].transform('count')

In [15]:
#FIRST_ARREST = 0 as flag for first arrest
rearrest.sort_values(by = ['BOFI_NBR','ARREST_DATE'], inplace=True)
rearrest['FIRST_ARREST'] = rearrest.groupby('BOFI_NBR').cumcount()

In [16]:
#add flag for whether BOFI_NBR has rearrest
#using & for elementwise operation
rearrest['REARREST_FLAG'] = np.where((rearrest['ARREST_CNT']>1) & \
                                 (rearrest['ARREST_CNT']-1 != rearrest['FIRST_ARREST']),1,0)

In [17]:
rearrest[:20]

Unnamed: 0,BOFI_NBR,ARREST_DATE,ARREST_CNT,FIRST_ARREST,REARREST_FLAG
79479,13348,19900309.0,1.0,0,0
163257,1731920,19931015.0,1.0,0,0
126493,20492,19920204.0,1.0,0,0
238073,221742,19970627.0,1.0,0,0
36698,301725,19890427.0,1.0,0,0
89690,100043,19900919.0,1.0,0,0
255206,100126,19980602.0,1.0,0,0
138229,100144,19920821.0,1.0,0,0
149821,100284,19930217.0,1.0,0,0
104796,100315,19910402.0,1.0,0,0


In [18]:
data_merged = pd.merge(data_simple, \
                 rearrest, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='inner')

In [19]:
data_merged = data_merged[['BOFI_NBR','ARREST_DATE','ARREST_CNT','FIRST_ARREST','REARREST_FLAG','SCREENING_DISP_CODE',\
                           'CHARGE_CLASS','CHARGE_TYPE','LEAD_CHARGE_CODE']].copy()

In [20]:
data_merged.shape

(269512, 9)

In [21]:
#identify arrests where at least one charge was accepted
accepted = data_merged[data_merged['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

In [22]:
#drop rows where at least one charge was accepted during that arrest
data_not_charged = pd.merge(data_merged, \
                 accepted, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='outer',\
                 indicator = True)
data_not_charged = data_not_charged[data_not_charged['_merge']=='left_only'].drop('_merge', axis=1)

In [23]:
data_not_charged.shape

(136371, 9)

In [24]:
data_merged.columns

Index(['BOFI_NBR', 'ARREST_DATE', 'ARREST_CNT', 'FIRST_ARREST',
       'REARREST_FLAG', 'SCREENING_DISP_CODE', 'CHARGE_CLASS', 'CHARGE_TYPE',
       'LEAD_CHARGE_CODE'],
      dtype='object')

In [25]:
data_not_charged.count()

BOFI_NBR               136371
ARREST_DATE            136371
ARREST_CNT             136371
FIRST_ARREST           136371
REARREST_FLAG          136371
SCREENING_DISP_CODE    121787
CHARGE_CLASS           133212
CHARGE_TYPE            135945
LEAD_CHARGE_CODE       135947
dtype: int64

In [26]:
#Drop rows with missing values
data_not_charged.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print('%s rows' %(data_not_charged.shape[0]))

119550 rows


In [27]:
'''
group by BOFI_NBR, ARREST_DATE
for arrests with multiple values for CHARGE_CLASS, CHARGE_TYPE, and LEAD_CHARGE, collapse into string with multiple values
'''
#combine into one column. this is fine because we will eventually do one hot encoding
data_not_charged['CHARGES'] = data_not_charged['CHARGE_CLASS'].map(str).str.strip() + ',' + data_not_charged['CHARGE_TYPE'].str.strip() + \
                            ',' +data_not_charged['LEAD_CHARGE_CODE'].str.strip()
#aggregate multiple charges for each arrest
data_not_charged['CHARGES'] = data_not_charged[['BOFI_NBR','ARREST_DATE','CHARGES']].groupby(['BOFI_NBR','ARREST_DATE'])['CHARGES'].transform(lambda x: ','.join(x))

#drop dupes
data_not_charged=data_not_charged.drop_duplicates(['BOFI_NBR','ARREST_DATE','CHARGES'])

In [28]:
data_not_charged[:10]

Unnamed: 0,BOFI_NBR,ARREST_DATE,ARREST_CNT,FIRST_ARREST,REARREST_FLAG,SCREENING_DISP_CODE,CHARGE_CLASS,CHARGE_TYPE,LEAD_CHARGE_CODE,CHARGES
0,232589,19910126.0,3.0,0,1,240.0,3.0,AR,14:(27) 62,"3.0,AR,14:(27) 62"
2,232589,19930330.0,3.0,2,0,240.0,3.0,AR,14:62,"3.0,AR,14:62"
5,228992,19890512.0,6.0,0,1,240.0,2.0,AR,40:967 (C) (2),"2.0,AR,40:967 (C) (2)"
7,228992,19901018.0,6.0,1,1,260.0,3.0,AR,14:67 (A),"3.0,AR,14:67 (A)"
8,228992,19910620.0,6.0,2,1,240.0,4.0,AR,14:67 (C),"4.0,AR,14:67 (C)"
12,228992,19990913.0,6.0,5,0,240.0,4.0,AR,14:63,"4.0,AR,14:63"
14,172592,19890507.0,4.0,1,1,240.0,3.0,IF,14:69 (A),"3.0,IF,14:69 (A)"
19,172592,19900612.0,4.0,3,0,240.0,3.0,AR,14:69 (A),"3.0,AR,14:69 (A)"
21,239336,19970517.0,1.0,0,0,240.0,4.0,AR,14:220.1,"4.0,AR,14:220.1"
22,312136,19901230.0,2.0,0,1,240.0,2.0,AR,14:(27) 30,"2.0,AR,14:(27) 30"


In [29]:
data_not_charged.shape

(110852, 10)

In [30]:
#keep observations within 1988-1996 timeframe
data_not_charged = data_not_charged[data_not_charged['ARREST_DATE']<=19961231]
data_not_charged.shape

(75005, 10)

In [31]:
data_not_charged.groupby('REARREST_FLAG')['REARREST_FLAG'].count()

REARREST_FLAG
0    33281
1    41724
Name: REARREST_FLAG, dtype: int64

In [32]:
#rearrest rate for people released by screener
41724/(33281+41724)

0.5562829144723686

# ONE HOT ENCODING

In [33]:
#class is not imbalanced
data_not_charged.groupby('REARREST_FLAG')['REARREST_FLAG'].count()

REARREST_FLAG
0    33281
1    41724
Name: REARREST_FLAG, dtype: int64

In [34]:
X = data_not_charged['CHARGES']
y = data_not_charged['REARREST_FLAG']


In [35]:
from sklearn.feature_extraction.text import CountVectorizer
c_vect = CountVectorizer(binary=True,tokenizer=lambda x: x.split(','))

X_vect = c_vect.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2)

X_train=X_train.toarray()
X_test=X_test.toarray()

In [36]:
X_train.shape

(60004, 528)

In [37]:
y_train.shape

(60004,)

# DECISION TREE

In [43]:
'''
Decision tree with entropy criterion
'''

dt = DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train,y_train)

pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)


In [44]:
#accuracy

accu_train = sklearn.metrics.accuracy_score(y_train,pred_train)
accu_test = sklearn.metrics.accuracy_score(y_test,pred_test)

print("Accuracy on Training Dataset: {}".format(accu_train))
print("Accuracy on Test Dataset: {}".format(accu_test))

Accuracy on Training Dataset: 0.6151089927338177
Accuracy on Test Dataset: 0.5860942603826411


In [45]:
precision = sklearn.metrics.precision_score(y_test,pred_test)
recall = sklearn.metrics.recall_score(y_test,pred_test)
f_score = sklearn.metrics.f1_score(y_test,pred_test)

print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F-Score: {}".format(f_score))

Precision: 0.609017847823818
Recall: 0.7031814895155459
F-Score: 0.6527210694110408


In [46]:
#confusion matrix for test set
cm = sklearn.metrics.confusion_matrix(y_test,pred_test)
cm

array([[2957, 3746],
       [2463, 5835]])

# Gradient Boosted Tree

In [39]:
from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier()
gbt = gbt.fit(X_train,y_train)

pred_train = gbt.predict(X_train)
pred_test = gbt.predict(X_test)


In [40]:
#accuracy

accu_train = sklearn.metrics.accuracy_score(y_train,pred_train)
accu_test = sklearn.metrics.accuracy_score(y_test,pred_test)

print("Accuracy on Training Dataset: {}".format(accu_train))
print("Accuracy on Test Dataset: {}".format(accu_test))

Accuracy on Training Dataset: 0.5839610692620492
Accuracy on Test Dataset: 0.5746283581094593


In [42]:
precision = sklearn.metrics.precision_score(y_test,pred_test)
recall = sklearn.metrics.recall_score(y_test,pred_test)
f_score = sklearn.metrics.f1_score(y_test,pred_test)

print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F-Score: {}".format(f_score))

Precision: 0.5730063218828547
Recall: 0.9066040009640878
F-Score: 0.7021981611984879
