# Data Preparation for the Interactive Analysis Layer

## TODO: DOCUMENTATION/EXPLANATIONS FOR THIS NOTEBOOK

In [495]:
import requests
import pandas as pd
import numpy as np
import re
import copy

In [496]:
# Create dataframes for Northwestern, Suffolk, and Middlesex.
# 'Incident_Guilty_or_missing' is compared against the string 'True' when the
# incident codes are generated, so it is read as text in every region (not only
# Middlesex). If pandas inferred a boolean column instead, that comparison
# would never match and the "guilty" digit would silently stay 0.
guilty_flag_dtype = {'Incident_Guilty_or_missing': str}
nw = pd.read_csv('../data/cleaned/clean_northwestern.csv', encoding='utf8',
                    dtype=guilty_flag_dtype)
sf = pd.read_csv('../data/cleaned/clean_suffolk.csv', encoding='utf8',
                    dtype=guilty_flag_dtype)
ms = pd.read_csv('../data/cleaned/clean_middlesex.csv', encoding='utf8',
                    dtype=guilty_flag_dtype, low_memory=False)
# Do not truncate the column list when a dataframe is displayed
pd.set_option("display.max.columns", None)

## Step 1: Additional Columns

In [497]:
# Add column: 'Inc_Juvenile', so the juvenile flag lives under one shared column name in all regions.
# Northwestern: derived from the age at offense (strictly under 21).
nw['Inc_Juvenile'] = nw['Age at Offense'].lt(21)
# Middlesex: reuse the existing 'JuvenileC' column as-is.
ms['Inc_Juvenile'] = ms['JuvenileC']
# Suffolk has no juvenile data; every incident is treated as juvenile.
sf['Inc_Juvenile'] = True

In [498]:
# Add column: 'Inc_Felony' (boolean)
# Placeholder (dummy) data; eventually the real flag will be produced earlier in the data pipeline.
np.random.seed(42)
for frame in (nw, ms, sf):
    # Draw an integer in [0, 20) per row and flag the row only when the draw is 19
    frame['Inc_Felony'] = np.random.randint(0, 20, len(frame)) == 19

# Give every row of one incident the same flag: the group minimum, so an
# incident stays flagged only if all of its rows were flagged.
incident_keys = (
    (nw, ['Person ID', 'Offense Date']),
    (sf, ['Person ID', 'Offense Date']),
    (ms, ['Case Number']),
)
for frame, keys in incident_keys:
    frame['Inc_Felony'] = frame.groupby(keys)['Inc_Felony'].transform('min')

In [499]:
# Add column: 'Inc_Years_Remaining' - the number of years that must still pass before the incident may be eligible.
# The waiting period is 7 years for a felony and 3 years for a misdemeanor (non-felony).
# A value <= 0 means the waiting period has already passed.
waiting_periods = ((True, 7), (False, 3))
for frame in (nw, ms, sf):
    for felony_flag, waiting_years in waiting_periods:
        # Only rows whose felony flag equals the current case receive a value in this pass
        selected = frame['Inc_Felony'] == felony_flag
        frame.loc[selected, ['Inc_Years_Remaining']] = waiting_years - frame['years_since_offense']

In [500]:
def generateIncidentCode(row):
    """Return the 6-character incident code for one row.

    Each character is '1' or '0'. By position, a '1' indicates:
      1. the incident occurred at a juvenile age ('Inc_Juvenile')
      2. all incident offenses are eligible for expungement under 100J
         ('Inc_Expungeable_Attempts_Are')
      3. at least one incident offense is a sex or murder offense
         ('Inc_Sex_or_Murder')
      4. at least one incident offense has a guilty disposition
         ('Incident_Guilty_or_missing')
      5. at least one incident offense lacks all disposition data
         ('Inc_Missing_Any_Dispo')
      6. not enough years have passed for the incident to be potentially
         expungeable ('Inc_Years_Remaining' > 0)

    `row` only needs to support lookup by column name (a DataFrame row or
    a plain mapping).
    """
    # The guilty flag may arrive as the text 'True' (column read as str) or as
    # a real boolean (column inferred by pandas); str() normalises both, and
    # anything else ('False', NaN, ...) leaves the digit at 0.
    guilty = str(row['Incident_Guilty_or_missing']) == 'True'

    flags = (
        row['Inc_Juvenile'],
        row['Inc_Expungeable_Attempts_Are'],
        row['Inc_Sex_or_Murder'],
        guilty,
        row['Inc_Missing_Any_Dispo'],
        row['Inc_Years_Remaining'] > 0,
    )
    return ''.join('1' if flag else '0' for flag in flags)

In [501]:
# Add column: 'Incident Code' - the six-flag string computed row by row in each region
for frame in (nw, ms, sf):
    frame['Incident Code'] = frame.apply(generateIncidentCode, axis=1)

## Step 2: Reduce Dataframes Into Single-Column Summaries

In [502]:
# One row per incident (person + offense date), keeping the first code found for that incident
nw_incidents = nw.groupby(['Person ID', 'Offense Date'])['Incident Code'].first().reset_index()
# Concatenate each person's incident codes into one string, repeated on every row of that person
nw_incidents['All Incident Codes'] = nw_incidents.groupby(['Person ID'])['Incident Code'].transform(''.join)
# Keep the first row per person and only the combined-code column
nw_summary = nw_incidents.drop_duplicates(subset='Person ID')[['All Incident Codes']]
print(nw_summary)

                   All Incident Codes
0      001000010000000000000100000001
5                              010100
6                              010000
7                        010000000100
9      010000010000000000000000010101
...                               ...
33063                          010000
33064                          000100
33065                          010000
33066                    010000010000
33068                          000100

[19517 rows x 1 columns]


In [503]:
# One row per incident (person + offense date), keeping the first code found for that incident
sf_incidents = sf.groupby(['Person ID', 'Offense Date'])['Incident Code'].first().reset_index()
# Concatenate each person's incident codes into one string, repeated on every row of that person
sf_incidents['All Incident Codes'] = sf_incidents.groupby(['Person ID'])['Incident Code'].transform(''.join)
# Keep the first row per person and only the combined-code column
sf_summary = sf_incidents.drop_duplicates(subset='Person ID')[['All Incident Codes']]
print(sf_summary)

                                       All Incident Codes
0                                                  110100
1       1100001101001001001001001000001000001000001001...
10                   110000110100110100110100100100110011
16                                           100100100000
18                                           100000100100
...                                                   ...
147341                                             110000
147342                                             100100
147343                                             110000
147344                                             110010
147345                                             110000

[90440 rows x 1 columns]


In [504]:
# Middlesex is summarised per case rather than per person.
# One row per (case, offense date), keeping the first code found for that pair
ms_incidents = ms.groupby(['Case Number', 'Offense Date'])['Incident Code'].first().reset_index()
# Concatenate each case's incident codes into one string, repeated on every row of that case
ms_incidents['All Incident Codes'] = ms_incidents.groupby(['Case Number'])['Incident Code'].transform(''.join)
# Keep the first row per case and only the combined-code column
ms_summary = ms_incidents.drop_duplicates(subset='Case Number')[['All Incident Codes']]
print(ms_summary)

       All Incident Codes
0                  010100
1                  001000
2                  010000
3                  001000
4                  101000
...                   ...
163706             010001
163707             010001
163708             010001
163709             010001
163710             001000

[163711 rows x 1 columns]


## Step 3: Answers to questions provided by CFJJ

In [505]:
def determineEligibility(row, categories):
    """Classify one summary row against a set of incident categories.

    Parameters:
        row: supports row['All Incident Codes'], a string made of
            concatenated 6-character incident codes.
        categories: sequence of (allotment, pattern) pairs (lists or tuples).
            `pattern` is a regular expression matched at the start of each
            incident code; `allotment` is the maximum number of incidents
            allowed to match it. The argument is never modified.

    Returns:
        0 - never eligible (an incident matches no category, or a category
            exceeds its allotment)
        1 - eligible today, complete disposition data
        2 - eligible today, at least one incident lacks disposition data
        3 - eligible after the waiting period, complete disposition data
        4 - eligible after the waiting period, at least one incident lacks
            disposition data
    """
    # Only the counters change while scanning, so copy just those rather than
    # deep-copying the whole category list for every row.
    remaining = [category[0] for category in categories]
    patterns = [category[1] for category in categories]
    eligibleToday = True
    missingDispo = False

    # Split the string into consecutive 6-character incident codes
    for incident in re.findall(r'.{6}', row['All Incident Codes']):
        if incident[-1] == '1':
            # Sixth digit: the waiting period has not yet passed
            eligibleToday = False
        if incident[-2] == '1':
            # Fifth digit: disposition data is missing
            missingDispo = True

        categoryFound = False
        for index, pattern in enumerate(patterns):
            if re.match(pattern, incident):
                categoryFound = True
                # An incident counts against every category it matches
                remaining[index] -= 1
                if remaining[index] < 0:
                    # If any category exceeds its allotment, this individual is never eligible
                    return 0
        # If the incident does not belong to any of the given categories, this individual is never eligible
        if not categoryFound:
            return 0

    # Eligible; may still need to wait for the 3 or 7 years to pass
    if eligibleToday:
        return 2 if missingDispo else 1
    return 4 if missingDispo else 3

In [506]:
def printAnswers(categories, region):
    """Print the eligibility breakdown for one region.

    `region` is 'nw', 'sf' or 'ms'; any other value prints a message and
    returns without doing anything else. `categories` is passed unchanged to
    determineEligibility for every row of the region's summary dataframe.
    """
    # Display name, counting unit, and a lazy accessor so that only the
    # requested summary dataframe is looked up
    regionTable = {
        'nw': ('Northwest', 'individuals', lambda: nw_summary),
        'sf': ('Suffolk', 'individuals', lambda: sf_summary),
        'ms': ('Middlesex', 'cases', lambda: ms_summary),
    }
    if region not in regionTable:
        print('Invalid region provided')
        return

    regionName, unit, getSummary = regionTable[region]
    # Work on a copy so the 'Result' column never leaks into the summary dataframe
    df = getSummary().copy()
    df['Result'] = df.apply(determineEligibility, axis=1, args=(categories,))

    # Count the rows for each result code:
    # 0 = never, 1/2 = eligible today, 3/4 = eligible later; 2 and 4 have incomplete disposition data
    results = df['Result'].values
    tally = {code: (results == code).sum() for code in range(5)}

    print(regionName)
    print(tally[1] + tally[2], unit, 'are eligible today.', tally[2], 'of them have incomplete disposition data.')
    print('An additional', tally[3] + tally[4], unit, 'will become eligible after their waiting period has ended.', tally[4], 'of them have incomplete disposition data.')
    print(tally[0], unit, 'will never be eligible.\n')

In [507]:
def answerQuestion(categories):
    """Print the eligibility breakdown for Northwest, Suffolk and Middlesex, in that order."""
    for region in ('nw', 'sf', 'ms'):
        printAnswers(categories, region)

**Question 1**

In [508]:
# Each category is [allotment, pattern]; the pattern is matched against the start of every 6-digit incident code.
# '11.1': digit 1 (juvenile) set, digit 2 (expungeable) set, digit 3 (sex/murder) ignored, digit 4 (guilty) set - at most 2 incidents
# '11.0': same, but digit 4 (guilty) not set - at most 2 incidents
# Any incident matching neither pattern makes the individual/case never eligible.
answerQuestion([[2, '11.1'], [2, '11.0']])

Northwest
980 individuals are eligible today. 34 of them have incomplete disposition data.
An additional 386 individuals will become eligible after their waiting period has ended. 19 of them have incomplete disposition data.
18151 individuals will never be eligible.

Suffolk
40728 individuals are eligible today. 5859 of them have incomplete disposition data.
An additional 9168 individuals will become eligible after their waiting period has ended. 3263 of them have incomplete disposition data.
40544 individuals will never be eligible.

Middlesex
2443 cases are eligible today. 0 of them have incomplete disposition data.
An additional 767 cases will become eligible after their waiting period has ended. 0 of them have incomplete disposition data.
160501 cases will never be eligible.



**Question 2**

In [509]:
# Each category is [allotment, pattern]; the pattern is matched against the start of every 6-digit incident code.
# '1.01': digit 1 (juvenile) set, digit 2 (expungeable) ignored, digit 3 (sex/murder) not set, digit 4 (guilty) set - at most 2 incidents
# '1.00': same, but digit 4 (guilty) not set - at most 2 incidents
# Any incident matching neither pattern makes the individual/case never eligible.
answerQuestion([[2, '1.01'], [2, '1.00']])

Northwest
1510 individuals are eligible today. 47 of them have incomplete disposition data.
An additional 610 individuals will become eligible after their waiting period has ended. 30 of them have incomplete disposition data.
17397 individuals will never be eligible.

Suffolk
62112 individuals are eligible today. 7878 of them have incomplete disposition data.
An additional 15376 individuals will become eligible after their waiting period has ended. 6406 of them have incomplete disposition data.
12952 individuals will never be eligible.

Middlesex
4216 cases are eligible today. 0 of them have incomplete disposition data.
An additional 1466 cases will become eligible after their waiting period has ended. 0 of them have incomplete disposition data.
158029 cases will never be eligible.



**Question 3**

In [510]:
# Each category is [allotment, pattern]; the pattern is matched against the start of every 6-digit incident code.
# '10.1' with allotment 0: a single incident with digit 1 (juvenile) set, digit 2 (expungeable) not set and
#   digit 4 (guilty) set already exceeds the allotment, so it makes the individual/case never eligible
# '1...' with allotment 4: at most 4 incidents with digit 1 (juvenile) set, whatever the next three digits are
# Any incident whose first digit is not set matches neither pattern and also means never eligible.
answerQuestion([[0, '10.1'], [4, '1...']])

Northwest
1357 individuals are eligible today. 46 of them have incomplete disposition data.
An additional 567 individuals will become eligible after their waiting period has ended. 35 of them have incomplete disposition data.
17593 individuals will never be eligible.

Suffolk
62486 individuals are eligible today. 8898 of them have incomplete disposition data.
An additional 17184 individuals will become eligible after their waiting period has ended. 7843 of them have incomplete disposition data.
10770 individuals will never be eligible.

Middlesex
4111 cases are eligible today. 0 of them have incomplete disposition data.
An additional 1443 cases will become eligible after their waiting period has ended. 0 of them have incomplete disposition data.
158157 cases will never be eligible.



## Step 4: Output Summary Files

In [511]:
# Save the summary dataframes as csv files in the cleaned data folder, overwriting any existing files.
# index=False leaves the dataframe index out of the files.
# NOTE(review): DataFrame.to_csv is documented to return None when it is given a path, so
# nw_file / ms_file / sf_file presumably all hold None - confirm nothing later relies on them.
nw_file = nw_summary.to_csv('../data/cleaned/interactive_northwestern.csv', index=False)
ms_file = ms_summary.to_csv('../data/cleaned/interactive_middlesex.csv', index=False)
sf_file = sf_summary.to_csv('../data/cleaned/interactive_suffolk.csv', index=False)