In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import project_env as pe
%matplotlib inline

In [47]:
#bring in BOFI_NBR, SCREENING_DISP_CODE
data_simple = pd.read_csv('data_simple.csv', encoding = "ISO-8859-1", low_memory=False)
data_simple = data_simple[['SCREENING_DISP_CODE','UNIQUE_ID','BOFI_NBR']]

In [53]:
bin_features = pd.read_csv('df_bin_features.csv', encoding = "ISO-8859-1", low_memory=False)
num_features = pd.read_csv('df_num_features.csv', encoding = "ISO-8859-1", low_memory=False)
date_features = pd.read_csv('df_date_features.csv', encoding = "ISO-8859-1", low_memory=False)
cat_features = pd.read_csv('df_cat_features.csv', encoding = "ISO-8859-1", low_memory=False, na_filter=False) #important change
rearrest = pd.read_csv('df_rearrest_times.csv', encoding = "ISO-8859-1", low_memory=False)

print('bin_features: %s' %(str(rearrest.shape)))
print('num_features: %s' %(str(num_features.shape)))
print('date_features: %s' %(str(date_features.shape)))
print('cat_features: %s' %(str(cat_features.shape)))
print('rearrest: %s' %(str(rearrest.shape)))

bin_features: (269543, 2)
num_features: (280294, 5)
date_features: (272088, 16)
cat_features: (280294, 8)
rearrest: (269543, 2)


In [54]:
merged = pd.merge(rearrest, \
                 bin_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 num_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 date_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 cat_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 data_simple,\
                 on='UNIQUE_ID', \
                 how='left')

In [35]:
#identify arrests where at least one charge was accepted
accepted = merged[merged['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

#drop rows where at least one charge was accepted during that arrest
data_not_charged = pd.merge(merged, \
                 accepted, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='outer',\
                 indicator = True)

data_not_charged = data_not_charged[data_not_charged['_merge']=='left_only'].drop('_merge', axis=1)

#remove duplicate arrests on same day
data_not_charged = data_not_charged[data_not_charged['NEXT_ARREST_TIME']!='Delete']

#convert NEXT_ARREST_TIME to numeric
data_not_charged['NEXT_ARREST_TIME'] = data_not_charged['NEXT_ARREST_TIME'].apply(pd.to_numeric)

In [7]:
#not encoded
data_test, data_train, data_val = pe.split_data(data_not_charged, test_split=.2,  \
                                                train_split=.64, by_var='ARREST_DATE_y', random_state=1)

Test Data 25068
(100272, 37)
Val Data 20055
Train Data 80217


In [44]:
data_train.to_csv('data_train.csv')

In [9]:
#encoded
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

cat_var =['BAR_ADMISSION_m','ARREST_CREDIT_CODE', 'CHARGE_CLASS', 'CHARGE_TYPE', 'LEAD_CHARGE_CODE',\
       'PARTY', 'RACE', 'SADA_RACE','SEX','SADA_SEX','POLICE_RPT_DATE_y','POLICE_RPT_DATE_m',\
         'ARREST_DATE_m','DOB_y','DOB_m','SCREENING_DISP_DATE_y','SCREENING_DISP_DATE_m',\
         'BAR_ADMISSION_y','BAR_ADMISSION_m','ARREST_DATE_y']

cat_var_enc = pe.one_hot_encode(data_not_charged[cat_var])
cat_var_enc = pd.DataFrame(cat_var_enc.toarray(), index=data_not_charged.index)
data_not_charged_enc = data_not_charged.drop(cat_var[:-1], axis=1)
data_not_charged_enc = pd.merge(data_not_charged_enc, \
                 cat_var_enc,\
                 left_index=True, \
                 right_index=True, \
                 how='left')

data_test_enc, data_train_enc, data_val_enc = pe.split_data(data_not_charged_enc, test_split=.2,  \
                                               train_split=.64, by_var='ARREST_DATE_y', random_state=1)

Test Data 25068
(100272, 1001)
Val Data 20055
Train Data 80217


In [10]:
data_train_enc.to_csv('data_train_enc.csv',index=False)
data_val_enc.to_csv('data_val_enc.csv',index=False)