# Predictor label

## NOTE: This process is very time-consuming. It has already been run and the resulting data has been saved. This code is included for our records.

In [1]:
import pandas as pd
import numpy as np
from project_env import to_date

In [2]:
# Load the source table from data_simple.csv into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data_simple.csv')

In [3]:
# Remove the 4 rows with a null ARREST_DATE
data = data.loc[data.ARREST_DATE.notnull()]

# Keep only rows where both ARREST_DATE and ADD_DATE are greater than
# 19880000 (the dates are still numeric YYYYMMDD values at this point).
# A missing ADD_DATE fails the comparison, so those rows are dropped too.
# NOTE(review): this comment previously said "earlier than 1980", but the
# cutoff in the code is 1988 -- confirm which one is intended.
data = data.loc[(data.ARREST_DATE > 19880000) & (data.ADD_DATE > 19880000)]

In [4]:
# Convert the numeric YYYYMMDD date columns to datetime64.
# Both columns are handled identically:
#   * cast to int64 before stringifying, so a float-typed column does not
#     produce text like '19900101.0', which does not match '%Y%m%d';
#   * errors='coerce' turns unparseable values into NaT.  ADD_DATE used
#     errors='ignore' before, which hands the column back unconverted as
#     soon as one value fails to parse and is deprecated in recent pandas.
# NOTE(review): the int64 cast assumes neither column contains NaN any
# more (rows with missing dates are filtered out earlier) -- confirm.
for date_col in ('ARREST_DATE', 'ADD_DATE'):
    data[date_col] = pd.to_datetime(
        data[date_col].astype('int64').astype(str),
        errors='coerce',
        format='%Y%m%d',
    )

In [5]:
# Order the rows chronologically by arrest date (oldest first)
data = data.sort_values('ARREST_DATE', ascending=True)

In [6]:
# Count how many rows each BOFI_NBR has in data
values = data['BOFI_NBR'].value_counts()

# BOFI_NBRs that occur exactly once
unique = values[values == 1]

# BOFI_NBRs that occur more than once, and the list of those identifiers
notunique = values[values > 1]
rearrests = list(notunique.index)

In [7]:
# Flag (1/0) whether the row's BOFI_NBR appears more than once in data
has_multiple_rows = data.BOFI_NBR.isin(rearrests)
data['EVER_REARREST'] = np.where(has_multiple_rows, 1, 0)

# Placeholder column (all zeros) that will hold the time to the next arrest
data['NEXT_ARREST_TIME'] = np.zeros(len(data))

In [28]:
# For everyone who shows up more than once, collect the row labels of
# their arrests and the number of days from each arrest to the next one.
# A follow-up arrest on the same day is labeled 'Delete'; each person's
# last arrest is labeled 'Final Arrest'.
#
# The rows are grouped by BOFI_NBR in a single pass instead of filtering
# the whole frame twice for every person.  Row order inside each group is
# the order of `data`, so this relies on `data` already being sorted by
# ARREST_DATE.

repeat_rows = data.loc[data.BOFI_NBR.isin(rearrests)]
arrests_by_person = {
    person: arrest_dates
    for person, arrest_dates
    in repeat_rows.groupby('BOFI_NBR', sort=False)['ARREST_DATE']
}

all_indices = []
all_times = []
for i in rearrests:
    arrest_dates = arrests_by_person[i]
    indices = list(arrest_dates.index)
    dates = list(arrest_dates)

    # Pair every arrest with the one that follows it.
    timetorearrest = []
    for current, following in zip(dates, dates[1:]):
        gap_days = (following - current).days
        timetorearrest.append('Delete' if gap_days == 0 else gap_days)
    # The last arrest in the list has no successor.
    timetorearrest.append('Final Arrest')

    all_indices.append(indices)
    all_times.append(timetorearrest)

In [29]:
# Add times to the NEXT_ARREST_TIME column in main data.
# The column is created as float zeros but receives text labels
# ('Delete', 'Final Arrest') alongside day counts, so make it object-typed
# explicitly instead of relying on implicit upcasting during assignment
# (setting an incompatible dtype is deprecated in recent pandas).
data['NEXT_ARREST_TIME'] = data['NEXT_ARREST_TIME'].astype(object)
for indices, times in zip(all_indices, all_times):
    for row_label, gap in zip(indices, times):
        # .at is the scalar setter; exactly one cell is written per step.
        data.at[row_label, 'NEXT_ARREST_TIME'] = gap

In [34]:
# Rows labeled 'Final Arrest' have no next arrest: store 0 for them.
# Assign through data.loc[mask, column]; the chained form
# data[column].loc[mask] = ... can write to a temporary object and leave
# `data` unchanged (it never updates `data` under pandas Copy-on-Write).
data.loc[data.NEXT_ARREST_TIME == 'Final Arrest', 'NEXT_ARREST_TIME'] = 0
# NOTE(review): this compares against the *string* '0.0'.  The placeholder
# values written into this column earlier are float zeros, so this
# probably matches nothing -- confirm the intent before changing it.
data.loc[data.NEXT_ARREST_TIME == '0.0', 'NEXT_ARREST_TIME'] = 0

In [31]:
# Export the UNIQUE_ID and NEXT_ARREST_TIME columns to CSV, without the
# row index
rearrest = data.loc[:, ['UNIQUE_ID', 'NEXT_ARREST_TIME']]
rearrest.to_csv(path_or_buf='df_rearrests.csv', index=False)