In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import project_env as pe
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [2]:
#bring in BOFI_NBR, SCREENING_DISP_CODE
data_simple = pd.read_csv('data_simple.csv')
data_simple = data_simple[['SCREENING_DISP_CODE','UNIQUE_ID','BOFI_NBR']]

In [3]:
bin_features = pd.read_csv('df_bin_features.csv', encoding = "ISO-8859-1", low_memory=False)
num_features = pd.read_csv('df_num_features.csv', encoding = "ISO-8859-1", low_memory=False)
date_features = pd.read_csv('df_date_features.csv', encoding = "ISO-8859-1", low_memory=False)
cat_features = pd.read_csv('df_cat_features.csv', encoding = "ISO-8859-1", low_memory=False)
rearrest = pd.read_csv('df_rearrest_times.csv', encoding = "ISO-8859-1", low_memory=False)
#rearrest = pd.read_csv('df_rearrests.csv', encoding = "ISO-8859-1", low_memory=False)

print('bin_features: %s' %(str(rearrest.shape)))
print('num_features: %s' %(str(num_features.shape)))
print('date_features: %s' %(str(date_features.shape)))
print('cat_features: %s' %(str(cat_features.shape)))
print('rearrest: %s' %(str(rearrest.shape)))

bin_features: (269543, 2)
num_features: (280294, 5)
date_features: (272088, 19)
cat_features: (280294, 8)
rearrest: (269543, 2)


In [4]:
merged = pd.merge(rearrest, \
                 bin_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 num_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 date_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 cat_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 data_simple,\
                 on='UNIQUE_ID', \
                 how='left')

cols = ['UNIQUE_ID', 'NEXT_ARREST_TIME', 'CRIMINAL_FLAG', \
        'FINAL_DETENTION_FLAG', 'HABITUAL_OFFENDER_FLAG', \
        'INITIAL_DETENTION_FLAG', 'JUVENILE_FLAG', 'SADA_SEX', \
        'SEX', 'TOT_NUM_DEF', 'MULTIPLE_DEF_FLAG', 'SCREENING_DAYS', \
        'ARREST_DATE', 'ARREST_DATE_y', 'SCREENING_DISP_DATE_y', \
        'SCREENING_DISP_DATE_m', 'AGE', 'ARREST_TO_SCREEN', 'ARREST_CREDIT_CODE', \
        'CHARGE_CLASS', 'CHARGE_TYPE', 'PARTY', 'RACE', \
        'SADA_RACE', 'SCREENING_DISP_CODE', 'BOFI_NBR']

merged = merged[cols]

In [6]:
#identify arrests where at least one charge was accepted
accepted = merged[merged['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

#drop rows where at least one charge was accepted during that arrest
refused = pd.merge(merged, \
                 accepted, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='outer',\
                 indicator = True)

refused = refused[refused['_merge']=='left_only'].drop('_merge', axis=1)

#remove duplicate arrests on same day
refused = refused[refused['NEXT_ARREST_TIME']!='Delete']

#convert NEXT_ARREST_TIME to numeric
refused['NEXT_ARREST_TIME'] = refused['NEXT_ARREST_TIME'].apply(pd.to_numeric)

In [14]:
cat_var = ['SADA_SEX', 'SEX', 'PARTY', 'RACE', 'SADA_RACE', \
           'SCREENING_DISP_DATE_m', 'CHARGE_TYPE', \
           'ARREST_CREDIT_CODE', 'CHARGE_CLASS', 'ARREST_DATE_y', 'SCREENING_DISP_DATE_y']

cat_var_enc = pe.one_hot_encode(refused[cat_var])
cat_var_enc = pd.DataFrame(cat_var_enc.toarray(), index=refused.index)

refused_enc = refused.drop(cat_var[:-1], axis=1)
refused_enc = pd.merge(refused_enc, \
                       cat_var_enc,\
                       left_index=True, \
                       right_index=True, \
                       how='left')

In [None]:
#Removed one record with a screening_disp_date in 1983, because this causes an error

In [None]:
test, train, val = pe.split_data(refused_enc[refused_enc['SCREENING_DISP_DATE_y' != 1983]], test_split=.2, \
                                 train_split=.64, by_var='SCREENING_DISP_DATE_y', random_state=1)

In [None]:
train.to_csv('train.csv',index=False)
val.to_csv('val.csv',index=False)