In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [3]:
os.chdir("/Users/carolineroper/Documents/School/Machine Learning/Reset_Project/noda")

In [4]:
import project_env as pe

In [5]:
#bring in BOFI_NBR, SCREENING_DISP_CODE
data_simple = pd.read_csv('data_processing/output/data_simple.csv', encoding = "ISO-8859-1", low_memory=False)
data_simple = data_simple[['SCREENING_DISP_CODE','UNIQUE_ID','BOFI_NBR']]

In [6]:
data_simple.head()

Unnamed: 0,SCREENING_DISP_CODE,UNIQUE_ID,BOFI_NBR
0,,0,255544
1,,1,257683
2,,2,255696
3,,3,251021
4,,4,246712


In [7]:
bin_features = pd.read_csv('data_processing/output/df_bin_features.csv', encoding = "ISO-8859-1", low_memory=False)
num_features = pd.read_csv('data_processing/output/df_num_features.csv', encoding = "ISO-8859-1", low_memory=False)
date_features = pd.read_csv('data_processing/output/df_date_features.csv', encoding = "ISO-8859-1", low_memory=False) \
                            .drop('JUVENILE_FLAG',axis=1)
cat_features = pd.read_csv('data_processing/output/df_cat_features.csv', encoding = "ISO-8859-1", low_memory=False)
rearrest = pd.read_csv('data_processing/output/df_rearrest_times.csv', encoding = "ISO-8859-1", low_memory=False)

print('bin_features: %s' %(str(rearrest.shape)))
print('num_features: %s' %(str(num_features.shape)))
print('date_features: %s' %(str(date_features.shape)))
print('cat_features: %s' %(str(cat_features.shape)))
print('rearrest: %s' %(str(rearrest.shape)))

bin_features: (269543, 2)
num_features: (280294, 5)
date_features: (272088, 21)
cat_features: (280294, 8)
rearrest: (269543, 2)


In [8]:
merged = pd.merge(rearrest, \
                 bin_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 num_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 date_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 cat_features,\
                 on='UNIQUE_ID', \
                 how='left')
merged = pd.merge(merged, \
                 data_simple,\
                 on='UNIQUE_ID', \
                 how='left')

In [9]:
merged.to_csv('merged_pj.csv', index=False)

In [10]:
merged.columns

Index(['UNIQUE_ID', 'NEXT_ARREST_TIME', 'CRIMINAL_FLAG',
       'FINAL_DETENTION_FLAG', 'HABITUAL_OFFENDER_FLAG',
       'INITIAL_DETENTION_FLAG', 'JUVENILE_FLAG', 'SADA_SEX', 'SEX',
       'TOT_NUM_DEF', 'MULTIPLE_DEF_FLAG', 'SCREENING_DAYS', 'POLICE_RPT_DAYS',
       'POLICE_RPT_DATE', 'ARREST_DATE', 'DOB', 'SCREENING_DISP_DATE',
       'BAR_ADMISSION', 'POLICE_RPT_DATE_y', 'ARREST_DATE_y', 'DOB_y',
       'SCREENING_DISP_DATE_y', 'BAR_ADMISSION_y', 'POLICE_RPT_DATE_m',
       'ARREST_DATE_m', 'DOB_m', 'SCREENING_DISP_DATE_m', 'BAR_ADMISSION_m',
       'AGE', 'BAR_ADMIT_DAYS', 'ARREST_TO_SCREEN', 'AGE_NA',
       'AGE_JUV_INVALID', 'ARREST_CREDIT_CODE', 'CHARGE_CLASS', 'CHARGE_TYPE',
       'LEAD_CHARGE_CODE', 'PARTY', 'RACE', 'SADA_RACE', 'SCREENING_DISP_CODE',
       'BOFI_NBR'],
      dtype='object')

In [11]:
cols = ['UNIQUE_ID', 'NEXT_ARREST_TIME', 'ARREST_DATE', 'ARREST_DATE_y','BOFI_NBR','SCREENING_DISP_CODE',\
        'BAR_ADMIT_DAYS','CRIMINAL_FLAG', \
        'FINAL_DETENTION_FLAG', 'HABITUAL_OFFENDER_FLAG', \
        'INITIAL_DETENTION_FLAG', 'JUVENILE_FLAG', 'SADA_SEX', \
        'SEX', 'TOT_NUM_DEF', 'MULTIPLE_DEF_FLAG', 'SCREENING_DAYS', \
        'SCREENING_DISP_DATE_y', \
        'SCREENING_DISP_DATE_m', 'AGE', 'ARREST_TO_SCREEN', \
        'CHARGE_CLASS', 'CHARGE_TYPE', 'PARTY', 'RACE', \
        'SADA_RACE','AGE_JUV_INVALID','AGE_NA']

merged = merged[cols]

In [12]:
merged.to_csv('merged_pj.csv',index=False)

In [13]:
#identify arrests where at least one charge was accepted
accepted = merged[merged['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

In [14]:
#identify arrests where at least one charge was accepted
accepted = merged[merged['SCREENING_DISP_CODE']==230][['BOFI_NBR','ARREST_DATE']]

#drop rows where at least one charge was accepted during that arrest
classified = pd.merge(merged, \
                 accepted, \
                 on=['BOFI_NBR','ARREST_DATE'], \
                 how='outer',\
                 indicator = True)

classified['ACCEPTED'] = np.where(classified['_merge']=='both', 1, 0)

classified = classified.drop('_merge', axis=1)

#remove duplicate arrests on same day
#confirmed that it's possible that an arrest has at least one charge was accepted and is still flagged as "delete"
classified = classified[classified['NEXT_ARREST_TIME']!='Delete']

#convert NEXT_ARREST_TIME to numeric
classified['NEXT_ARREST_TIME'] = classified['NEXT_ARREST_TIME'].apply(pd.to_numeric)

In [65]:
#pd.pivot_table(classified, index = ['NEXT_ARREST_TIME'], columns=['ACCEPTED'], aggfunc=lambda x: len(x.unique()))

In [63]:
cat_var = ['SADA_SEX', 'SEX', 'PARTY', 'RACE', 'SADA_RACE', \
           'SCREENING_DISP_DATE_y', 'SCREENING_DISP_DATE_m', 'CHARGE_TYPE', \
           'CHARGE_CLASS', 'ARREST_DATE_y']

cat_var_df = pd.DataFrame()

for column in cat_var:
    category = np.repeat(classified.loc[:,column].name, len(classified.loc[:,column].value_counts(dropna=False)))
    value = classified.loc[:,column].value_counts(dropna=False).index
    count = classified.loc[:,column].value_counts(dropna=False)
    cat_var_df = cat_var_df.append(pd.DataFrame(np.transpose(np.vstack([category, value, count]))))
cat_var_df.columns = ['Category', 'Value', 'Count']
cat_var_df['Count'] = pd.to_numeric(cat_var_df['Count'])

cat_var_df.fillna(value = "Missing", inplace=True)

cat_var_df['Label'] = cat_var_df['Category'] + '_' + cat_var_df['Value']

In [20]:
#two rounds of splits so predicted judge training set includes all training records from risk prediction model and so on.

test_ne_1, train_ne_1, val_ne_1 = pe.split_data(classified[classified['ACCEPTED']==0], test_split=.2, \
                                 train_split=.64, by_var='ARREST_DATE_y', random_state=1)

test_ne_2, train_ne_2, val_ne_2 = pe.split_data(classified[classified['ACCEPTED']==1], test_split=.2, \
                                 train_split=.64, by_var='ARREST_DATE_y', random_state=1)

test_ne = pd.concat([test_ne_1, test_ne_2],axis=0)
train_ne = pd.concat([train_ne_1, train_ne_2],axis=0)
val_ne = pd.concat([val_ne_1, val_ne_2],axis=0)

Test Data 25068
(100272, 29)
Val Data 20055
Train Data 80217
Test Data 21078
(84308, 29)
Val Data 16862
Train Data 67446


In [21]:
train_ne.to_csv('data_train_pj.csv', index=False)
test_ne.to_csv('data_test_pj.csv', index=False)
val_ne.to_csv('data_val_pj.csv', index=False)

In [36]:
cat_var_enc = pe.one_hot_encode(classified[cat_var])
cat_var_enc = pd.DataFrame(cat_var_enc.toarray(), index=classified.index) #can pass the column names here - 

In [42]:
after_enc_df = pd.DataFrame()

for column in list(cat_var_enc.columns):
    after_enc_df = after_enc_df.append(pd.DataFrame([column,sum(cat_var_enc.loc[:,column])]).transpose())
after_enc_df.columns = ['current_name', 'Count']

In [None]:
#code below creates meaningful column names for encoded variables - need to QA more thoroughly

In [123]:
col_name_df = pd.merge(after_enc_df, cat_var_df, on='Count', how='left')

In [124]:
s = col_name_df.current_name.value_counts() 

col_name_df[col_name_df.current_name.isin(list(s[s > 1].index))]


Unnamed: 0,current_name,Count,Category,Value,Label
0,0.0,18660.0,SADA_SEX,-1.0,SADA_SEX_-1
1,0.0,18660.0,SCREENING_DISP_DATE_y,1996.0,SCREENING_DISP_DATE_y_1996.0
23,22.0,1.0,SCREENING_DISP_DATE_y,1986.0,SCREENING_DISP_DATE_y_1986.0
24,22.0,1.0,SCREENING_DISP_DATE_y,1983.0,SCREENING_DISP_DATE_y_1983.0
25,23.0,1.0,SCREENING_DISP_DATE_y,1986.0,SCREENING_DISP_DATE_y_1986.0
26,23.0,1.0,SCREENING_DISP_DATE_y,1983.0,SCREENING_DISP_DATE_y_1983.0
36,33.0,18660.0,SADA_SEX,-1.0,SADA_SEX_-1
37,33.0,18660.0,SCREENING_DISP_DATE_y,1996.0,SCREENING_DISP_DATE_y_1996.0


In [125]:
#these row indexes found manually - not sure how to do this more automatically

col_name_df = col_name_df.drop([1, 23, 26, 36])

In [126]:
col_name_df['current_name'] = pd.to_numeric(col_name_df['current_name'])

col_name_df.index = col_name_df['current_name']

col_dict = col_name_df['Label'].to_dict()


In [136]:
classified_enc = classified.drop(cat_var[:-1], axis=1)
classified_enc = pd.merge(classified_enc, \
                       cat_var_enc,\
                       left_index=True, \
                       right_index=True, \
                       how='left')

In [137]:
classified_enc = classified_enc.rename(columns = col_dict)

In [138]:
#two rounds of splits so predicted judge training set includes all training records from risk prediction model and so on.

test_1, train_1, val_1 = pe.split_data(classified_enc[classified_enc['ACCEPTED']==0], test_split=.2, \
                                 train_split=.64, by_var='ARREST_DATE_y', random_state=1)

test_2, train_2, val_2 = pe.split_data(classified_enc[classified_enc['ACCEPTED']==1], test_split=.2, \
                                 train_split=.64, by_var='ARREST_DATE_y', random_state=1)

test = pd.concat([test_1, test_2],axis=0)
train = pd.concat([train_1, train_2],axis=0)
val = pd.concat([val_1, val_2],axis=0)

Test Data 25068
(100272, 93)
Val Data 20055
Train Data 80217
Test Data 21078
(84308, 93)
Val Data 16862
Train Data 67446


In [140]:
train.to_csv('train_pj.csv',index=False)
val.to_csv('val_pj.csv',index=False)
test.to_csv('test_pj.csv', index=False)