In [1]:
# This notebook does the following:
    # A. Queries our database to construct sentence level data from court commitment and sentence computation for every
    # infraction resulting in incarceration. (dataset A)
    # B. Queries sentence component to get Most Serious Offense from all sentence components since this variable
    # is missing in much of dataset A and is needed as our outcome variable (dataset B)
    # C. Puts together dataset A and B
    # D. Carries out several steps of cleaning the data and getting recidivism flag
    # E. Queries database for any additional features (e.g. disciplinary infractions)
    # F. Hold outs active sentences, drops those missing recidivism flag
    # Dropped observations missing the following (if we can't proxy for them)
        # Sentence Start Date (~1.3%)
        # Sentence End Date (~800 obs)
        # Most Serious Offense (2.6% obs)
        # Our decided category (~1% obs)
    # F. 

In [2]:
import sqlite3
from sqlite3 import Error
import pandas as pd
import config
import os.path
from os import path

from create_db import create_connection, create_table, clean_column_names
from populate_db import extract_data, insert_records
import query_db as qd

import importlib

import datetime
import re
import numpy as np

In [3]:
importlib.reload(qd)

<module 'query_db' from '/Users/daminisharma/Dropbox/Harris MSCAPP/2019-20_Q3_Spring/Machine Learning/covid_decarceration/files/query_db.py'>

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#coded_offenses = pd.read_excel('https://github.com/christi-liongson/covid_decarceration/blob/construct_public_safety_data/data/Coding%20Offenses%20-%20For%20GitHub.xlsx',sheet_name="Coding - FINAL")
coded_offenses = pd.read_excel('../data/Coding Offenses - For GitHub.xlsx',sheet_name="Coding - FINAL")

In [6]:
coded_offenses.head()

Unnamed: 0,Primary offense code,Description (if needed),Decided Category,Needed a check?,More lenient,More harsh
0,DRIV LICENSE REVOKED,0,1,NO,1,1
1,LARCENY,0,2,YES,1,3
2,DWI DRIVING WHILE IMPAIRED,0,2,NO,2,2
3,FELONY B&E,"Felony Breaking and Entering, as opposed to Mi...",3,YES,2,4
4,WORTHLESS CHECK,0,1,NO,1,1


In [7]:
# Part A: Queries our database to construct sentence level data from court commitment and sentence computation for every
    # infraction resulting in incarceration. (dataset A)
start = datetime.datetime.now()
query_court_commitment = '''
                        SELECT A.OFFENDER_NC_DOC_ID_NUMBER as ID, 
                            A.COMMITMENT_PREFIX, 
                            A.EARLIEST_SENTENCE_EFFECTIVE_DT, 
                            A.MOST_SERIOUS_OFFENSE_CODE                              
                        FROM OFNT3BB1 A
                        WHERE NEW_PERIOD_OF_INCARCERATION_FL = "Y";
                        '''

conn = create_connection(config.database_name)
court_small = qd.query_db_notebook(conn,query_court_commitment)


query_sentence_comp = '''
                            SELECT INMATE_DOC_NUMBER as ID, 
                                INMATE_COMMITMENT_PREFIX as COMMITMENT_PREFIX, 
                                INMATE_COMPUTATION_STATUS_FLAG, 
                                max(ACTUAL_SENTENCE_END_DATE) as END_DATE,
                                max(PROJECTED_RELEASE_DATE_PRD) as PROJ_END_DATE
                            FROM INMT4BB1
                            GROUP BY INMATE_DOC_NUMBER, INMATE_COMMITMENT_PREFIX;
                        '''

sentence_compute_small = qd.query_db_notebook(conn,query_sentence_comp)


query_inmt_profile = '''
                    SELECT 
                        INMATE_DOC_NUMBER as ID,
                        INMATE_RECORD_STATUS_CODE,
                        INMATE_ADMIN_STATUS_CODE,
                        DATE_OF_LAST_INMATE_MOVEMENT,
                        TYPE_OF_LAST_INMATE_MOVEMENT,
                        CURRENT_COMMITMENT_PREFIX,
                        INMATE_GENDER_CODE as GENDER,
                        INMATE_RACE_CODE as RACE,
                        INMATE_BIRTH_DATE as BIRTH_DATE,
                        INMATE_ETHNIC_AFFILIATION as ETHNICITY,
                        INMATE_CONTROL_STATUS_CODE as CONTROL_STATUS,
                        INMATE_SPECIAL_CHARACTERISTICS as SPECIAL_CHARS,
                        TOTAL_DISCIPLINE_INFRACTIONS,
                        LATEST_DISCIPLINE_INFRACTION,
                        LAST_DISCIPLINE_INFRACTION_DT
                    FROM INMT4AA1;
                    '''

query_inmt_profile = '''
                    SELECT 
                        INMATE_DOC_NUMBER as ID,
                        INMATE_RECORD_STATUS_CODE,
                        INMATE_ADMIN_STATUS_CODE,
                        DATE_OF_LAST_INMATE_MOVEMENT,
                        TYPE_OF_LAST_INMATE_MOVEMENT,
                        CURRENT_COMMITMENT_PREFIX,
                        INMATE_CONTROL_STATUS_CODE as CONTROL_STATUS
                    FROM INMT4AA1;
                    '''

inmt_profile = qd.query_db_notebook(conn,query_inmt_profile)

query_offender_profile = '''
                        SELECT 
                        OFFENDER_NC_DOC_ID_NUMBER as ID,
                        OFFENDER_GENDER_CODE as GENDER,
                        OFFENDER_RACE_CODE as RACE,
                        OFFENDER_BIRTH_DATE as BIRTH_DATE,
                        STATE_WHERE_OFFENDER_BORN as STATE_BORN,
                        OFFENDER_ETHNIC_CODE as ETHNICITY,
                        OFFENDER_CITIZENSHIP_CODE as CITIZENSHIP                        
                    FROM OFNT3AA1;
                            
                        '''

offender_profile = qd.query_db_notebook(conn,query_offender_profile)

conn.close

data = court_small.merge(sentence_compute_small, on=['ID','COMMITMENT_PREFIX'], how='outer')
data = data.merge(inmt_profile, on=['ID'], how = 'left')
data = data.merge(offender_profile, on=['ID'], how = 'left')
#data = data.merge(disc_infraction, on=['ID'], how='left')


stop = datetime.datetime.now()
print("Time Elapsed:", stop - start) 


Time Elapsed: 0:02:25.101231


In [8]:
#data.to_csv('datasetA_court_sentcomp.csv', index=False)

In [9]:
data.shape

(903181, 19)

In [10]:
# Part B: Queries sentence component to get Most Serious Offense from all sentence components since this variable
    # is missing in much of dataset A and is needed as our outcome variable (dataset B)

start = datetime.datetime.now()

query_sentence_component = '''
                            SELECT OFFENDER_NC_DOC_ID_NUMBER as ID, 
                                        COMMITMENT_PREFIX, 
                                        SENTENCE_COMPONENT_NUMBER,
                                        PRIMARY_OFFENSE_CODE,
                                        PRIMARY_FELONYMISDEMEANOR_CD,
                                        SENTENCING_PENALTY_CLASS_CODE,
                                        PRIOR_RECORD_LEVEL_CODE,
                                        MINIMUM_SENTENCE_LENGTH,
                                        MAXIMUM_SENTENCE_LENGTH,
                                        SENTENCE_TYPE_CODE,
                                        COUNTY_OF_CONVICTION_CODE
                            FROM OFNT3CE1
                            WHERE SENTENCE_TYPE_CODE LIKE '%PRISONS%';
                            '''

conn = create_connection(config.database_name)
sent_comp_small = qd.query_db_notebook(conn,query_sentence_component)

stop = datetime.datetime.now()
print("Time Elapsed:", stop - start) 


Time Elapsed: 0:01:38.665407


In [11]:
#sent_comp_small.to_csv('datasetB_sentcomponent_only_incarcerated.csv', index=False)

In [12]:
# Part B: Queries sentence component to get Most Serious Offense from all sentence components since this variable
    # is missing in much of dataset A and is needed as our outcome variable (dataset B)
# Check how many unique ID and COMMITMENT_PREFIX combinations there are
dataset_B = sent_comp_small.copy()
grouped = dataset_B.groupby(['ID', 'COMMITMENT_PREFIX'])
total_combinations = grouped.ngroups
print(total_combinations)

# Find the ID / COMMITMENT_PREFIX combinations that have the maximum MINIMUM_SENTENCE_LENGTH
# We will use these combinations to filter dataset_B for PRIMARY_OFFENSE_CODE
# Note: These might not be unique

min_sentence = pd.DataFrame(dataset_B.groupby(['ID', 'COMMITMENT_PREFIX'])['MINIMUM_SENTENCE_LENGTH'].max().reset_index(name='max_min'))
min_sentence.head(10)

# Check to make sure we're not accidentally dropping any rows
min_sentence.groupby(['ID', 'COMMITMENT_PREFIX']).ngroups

# Filter dataset_B to only these rows
filter_tuples = [tuple(x) for x in min_sentence.to_numpy()]

filtered_B = dataset_B[dataset_B[['ID', 'COMMITMENT_PREFIX', 'MINIMUM_SENTENCE_LENGTH']].apply(tuple, axis=1).isin(filter_tuples)]
filtered_B.head(10)

count_nunique_offenses = pd.DataFrame(filtered_B.groupby(['ID', 'COMMITMENT_PREFIX'])['PRIMARY_OFFENSE_CODE'].nunique().reset_index(name='count'))
count_nunique_offenses['count'].describe()


# Pull out the ID / COMMITMENT_PREFIX combinations that are unique on max(MINIMUM_SENTENCE_LENGTH)
unique_min_filter = [tuple(x) for x in count_nunique_offenses[count_nunique_offenses['count'] == 1][['ID', 'COMMITMENT_PREFIX']].to_numpy()]
nonunique_min_filter = [tuple(x) for x in count_nunique_offenses[count_nunique_offenses['count'] != 1][['ID', 'COMMITMENT_PREFIX']].to_numpy()]

cols_to_keep = ['ID', 'COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE','MINIMUM_SENTENCE_LENGTH', 'MAXIMUM_SENTENCE_LENGTH']

filtered_B_min_unique = filtered_B[filtered_B[['ID','COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(unique_min_filter)][cols_to_keep]
filtered_B_min_unique.head()

# Drop duplicate rows from filtered_B_min_unique (we know that they all have the same PRIMARY_OFFENSE_CODE)
# Note: This method keeps the first observation, but again, this shouldn't matter
filtered_B_min_unique.drop_duplicates(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'],inplace=True)
filtered_B_min_unique.head()

filtered_B_min_nonunique = filtered_B[filtered_B[['ID','COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(nonunique_min_filter)][cols_to_keep]
filtered_B_min_nonunique.head()

find_max_max = pd.DataFrame(filtered_B_min_nonunique.groupby(['ID', 'COMMITMENT_PREFIX'])['MAXIMUM_SENTENCE_LENGTH'].max().reset_index(name='max_max'))
find_max_max.head()

by_max_tuples = [tuple(x) for x in find_max_max.to_numpy()]
filtered_B_max = filtered_B_min_nonunique[filtered_B_min_nonunique[['ID', 'COMMITMENT_PREFIX', 'MAXIMUM_SENTENCE_LENGTH']].apply(tuple, axis=1).isin(by_max_tuples)]
filtered_B_max.head()

count_offenses_by_max = pd.DataFrame(filtered_B_max.groupby(['ID', 'COMMITMENT_PREFIX'])['PRIMARY_OFFENSE_CODE'].nunique().reset_index(name='count'))
count_offenses_by_max.head()

# Pull out the ID and COMMITMENT_PREFIX tuples in FILTERED_B_MT1 where there is a unique PRIMARY_OFFENSE_CODE
# after looking at the maximum of MAXIMUM_SENTENCE_LENGTH
unique_max = count_offenses_by_max[count_offenses_by_max['count'] == 1][['ID', 'COMMITMENT_PREFIX']]
unique_max_filter = [tuple(x) for x in unique_max.to_numpy()]

filtered_B_max_unique = filtered_B_max[filtered_B_max[['ID', 'COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(unique_max_filter)]
filtered_B_max_unique.head()

# Drop duplicate rows from filtered_B_max_unique (we know that they all have the same PRIMARY_OFFENSE_CODE)
# Note: This method keeps the first observation, but again, this shouldn't matter
filtered_B_max_unique.drop_duplicates(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'],inplace=True)
filtered_B_max_unique.head()

concat_1_2 = filtered_B_min_unique.append(filtered_B_max_unique)
concat_1_2.shape

# Final merged version of datasets A and B
dataset_with_most_serious = concat_1_2
dataset_with_most_serious.shape

891122


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(823722, 5)

In [13]:
# Part C: Puts together dataset A and B
datasetB_primary_offense = dataset_with_most_serious.loc[:,['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE']]

print("Dataset B # observations:",datasetB_primary_offense.shape[0])

# merging on datasetA (court commitment + sentence computation) with datasetB ("self constructed" primary offenses from
# sentence component)
data_A_B = data.merge(datasetB_primary_offense, on = ['ID','COMMITMENT_PREFIX'], how='left') 


Dataset B # observations: 823722


In [14]:
# Part D: Carries out several steps of cleaning the data and getting recidivism flag

# Replace Most Serious Offense with our constructed Primary Offense Code if missing
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].mask(data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull(), data_A_B['PRIMARY_OFFENSE_CODE'], inplace=True)

print("% missing most serious offense:",data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0])
print("Total number of observations in dataset A + B: ", data_A_B.shape[0])

# Step 1
# https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/
print("Cleaning dates and dropping missing")
data_A_B['END_DATE'].mask(data_A_B['END_DATE'] == '0001-01-01', data_A_B['PROJ_END_DATE'], inplace=True)
data_A_B = data_A_B[data_A_B['END_DATE']!='0001-01-01']
data_A_B = data_A_B[data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT']!='0001-01-01']
data_A_B = data_A_B[data_A_B['END_DATE'].notna()]
data_A_B = data_A_B[data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT'].notna()]

print("Total number of observations in dataset A + B: ", data_A_B.shape[0])
print("% still missing most serious offense:",data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0])

# Step 1.5 drop observations missing most serious offense code
print("Drop observations missing most serious offense code")
data_A_B = data_A_B[data_A_B['MOST_SERIOUS_OFFENSE_CODE'].notna()]
print("Total number of observations in dataset A + B: ", data_A_B.shape[0])

# Step 2
# write data to sqlite in memory so can query it to get next record
print("Querying database to get nextPrefix, nextOffense")
conn = sqlite3.connect(':memory:')
data_A_B.to_sql('data', conn, index=False)

start = datetime.datetime.now()
# https://stackoverflow.com/questions/37360901/sql-self-join-compare-current-record-with-the-record-of-the-previous-date
query_datasetAB = '''
                        SELECT *, 
                        LEAD(COMMITMENT_PREFIX,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextPrefix,
                        LEAD(EARLIEST_SENTENCE_EFFECTIVE_DT,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextStart,
                        LEAD(MOST_SERIOUS_OFFENSE_CODE,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextOffense                                                    
                                                    
                        FROM data ;

                        '''


dataset_flag = qd.query_db_notebook(conn,query_datasetAB)
conn.close
stop = datetime.datetime.now()
print("Time Elapsed:", stop - start) 


% missing most serious offense: 0.03883602640442347
Total number of observations in dataset A + B:  903182
Cleaning dates and dropping missing
Total number of observations in dataset A + B:  888121
% still missing most serious offense: 0.02623966779301469
Drop observations missing most serious offense code
Total number of observations in dataset A + B:  864817
Querying database to get nextPrefix, nextOffense
Time Elapsed: 0:01:03.127480


In [15]:
# Define functions that fix dates
# specifically, some dates are top coded as 9999- usually for a life sentence
# this exceeds pandas' max date, so they first need to be re-top-coded, then turned into the date format
# date == 0 happens when an individual does NOT have a "next date" - these should be turned to Na
def fix_dates(data,date_var):
    data['new_col'] = data[date_var].astype(str).str[0:4].astype(int)
    data.loc[data['new_col']>2261, date_var] = '2261-01-02'
    data[date_var] = data[date_var].replace(0,np.nan)
    data.loc[data[date_var]=="0", date_var] = None
    data[date_var] = pd.to_datetime(data[date_var],format='%Y-%m-%d',errors='coerce')
    #df[date_var] = pd.to_datetime(df[date_var].str.split(n=1).str[0],format='%Y-%m-%d')
    return data


def get_recidivism_label(data,num_years=1):
    data['Time_Diff'] = pd.DatetimeIndex(data['NextStart']).year - pd.DatetimeIndex(data['END_DATE']).year
    data['Recidivate'] = np.nan
    # if NextPrefix != 0:
    data.loc[(data['NextPrefix']!=0) & (data['Time_Diff']<= num_years), 'Recidivate'] = 1
    data.loc[(data['NextPrefix']!=0) & (data['Time_Diff']> num_years), 'Recidivate'] = 0
    
    # if nextprefix = 0, inmate is inactive, and they did not die in prison 
    # (e.g. serving life sentence or  other wise) then 
    # recidivism = 0
    data.loc[(data['NextPrefix']==0) & (data['INMATE_ADMIN_STATUS_CODE']=='INACTIVE') & (data['TYPE_OF_LAST_INMATE_MOVEMENT']!='DEATH'), 'Recidivate'] = 0
    
    # if nextprefix = 0, inmate status code is not active or inactive(could be missing) and 
    # end date is not 2261-01-02 (life sentence), they were likely released from prison
    # recidivism = 0
    data.loc[(data['NextPrefix']==0) & (data['INMATE_ADMIN_STATUS_CODE']!='ACTIVE') & (data['INMATE_ADMIN_STATUS_CODE']!='INACTIVE') & (data['END_DATE']!='2261-01-02'), 'Recidivate'] = 0
    
    return data

In [16]:
# Part D continued
# Step 3. 
# call fix dates function to fix relevant dates
print("Fix Dates")
dataset_flag = fix_dates(dataset_flag,'EARLIEST_SENTENCE_EFFECTIVE_DT')
dataset_flag = fix_dates(dataset_flag,'END_DATE')
dataset_flag = fix_dates(dataset_flag,'NextStart')

# Step 4
# get recidivism flag - see decision rules and function above 
print("Get recidivism flag")
dataset_flag = get_recidivism_label(dataset_flag)


Fix Dates
Get recidivism flag


In [19]:
conn = create_connection(config.database_name)
query_disc_infraction = '''
                    SELECT INMATE_DOC_NUMBER as ID,
                            COUNT(*) as DISCIPLINARY_INFRACTION_COUNT                            
                    FROM INMT9CF1
                    GROUP BY INMATE_DOC_NUMBER;
                    '''
query_disc_infraction = '''
                    SELECT *                           
                    FROM INMT9CF1
                    LIMIT 1000;
                    '''


disc_infraction = qd.query_db_notebook(conn,query_disc_infraction)

conn.close

<function Connection.close>

In [18]:
disc_infraction.head()

Unnamed: 0,INMATE_DOC_NUMBER,DISCIPLINARY_INFRACTION_DATE,DISCIPLINARY_INFRACTION_TIME,DISCIP_INFRACTION_SEQUENCE_NO,DISCIPLINARY_INFRACTION_CODE,DISCIPLINARY_CHARGE_LEVEL,INMATE_PLEA_RE_INFRACTION,DISCIINFRACTION_VERDICT_CODE,TYPE_OF_HEARING_FOR_PUNISHMENT,DISCIPLINARY_APPEAL_DECISION,DISCI_SEGREGATION_TIME_DAYS,DSEG_DAYS_SUSPENDEDIN_MONTHS,GOOD_TIME_LOST_DUE_TO_INFRAC,GOOD_TIME_LOST_SUSPENDED_MOS,DISCI_VIOLATION_STATUS_CODE,EXTRA_DUTY_HOURS,EXTRA_DUTY_HOURS_SUSPEND_MNTHS,PRIVILEGE_LOST__1ST_TYPE_CODE,PRIVILEGE_LOST__2ND_TYPE_CODE,PRIVILEGE_LOST__3RD_TYPE_CODE,PRIVILEGES_SUSPENDED_IN_DAYS,SUSPPRIVILEGES_SUSPENDEDMOS,CUSTODY_DEMOTED_FROM_CODE,CUSTODY_DEMOTED_TO_CODE,CUSTODY_DEMOTION_SUSPENDEDMO,ACTIVATE_PRIOR_SUSPENSION,DATE_OF_LAST_UPDATE,TIME_OF_LAST_UPDATE,ORIGINAL_DR_CODE_FROM_INVEST,SUSPENSION_STATUS,DATE_OF_PRIOR_SUSP_PUNISH,TIME_OF_PRIOR_SUSP_PUNISH
0,10,1975-07-26,01:00:00,1,MISUSE SUPPLIES,UNIT,GUILTY,GUILTY,CONV,,0,0,0,2,APPLIED,0,0,UNRECORDED - CONV.,,,0,0,,,0,,0001-01-01,01:00:00,,SUSPENSIONS EXIST,0001-01-01,01:00:00
1,10,1975-07-28,01:00:00,1,DISOBEY ORDER,UNIT,GUILTY,GUILTY,CONV,,0,0,0,0,APPLIED,0,0,UNRECORDED - CONV.,,,0,0,,,0,,0001-01-01,01:00:00,,,0001-01-01,01:00:00
2,10,1975-11-23,01:00:00,1,DISOBEY ORDER,UNIT,GUILTY,GUILTY,CONV,,0,0,0,0,APPLIED,0,0,UNRECORDED - CONV.,,,0,0,,,0,,0001-01-01,01:00:00,,,0001-01-01,01:00:00
3,10,1976-01-03,01:00:00,1,DISOBEY ORDER,UNIT,GUILTY,GUILTY,CONV,,0,0,0,0,APPLIED,0,0,UNRECORDED - CONV.,,,0,0,,,0,,0001-01-01,01:00:00,,,0001-01-01,01:00:00
4,10,1976-01-22,01:00:00,1,DISOBEY ORDER,UNIT,GUILTY,GUILTY,CONV,,0,0,0,0,APPLIED,0,0,UNRECORDED - CONV.,,,0,0,,,0,,0001-01-01,01:00:00,,,0001-01-01,01:00:00


In [42]:
weird = dataset_flag[dataset_flag['Time_Diff']<0]

In [43]:
weird.head(100)

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
6,10,AA,1975-06-11,SAFECRACKING/SAFE ROBBERY,EXPIRED,1977-03-17,0001-01-01,INACTIVE,INACTIVE,1983-06-27,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,BLACK,1953-05-18,,UNKNOWN,BORN IN U.S.,SAFECRACKING/SAFE ROBBERY,AB,1975-06-11,SAFECRACKING/SAFE ROBBERY,1975,-2.0,1.0
134,192,AA,1974-07-30,MURDER SECOND DEGREE,EXPIRED,1975-01-28,0001-01-01,INACTIVE,INACTIVE,2001-10-20,TERMINATED PAROLE,BA,REGULAR POPULATION RPOP,MALE,WHITE,1952-11-18,NORTH CAROLINA,EUROPEAN/N.AM./AUSTR,BORN IN U.S.,MURDER SECOND DEGREE,AB,1974-07-12,MURDER SECOND DEGREE,1974,-1.0,1.0
135,192,AB,1974-07-12,MURDER SECOND DEGREE,EXPIRED,1978-08-02,1995-07-16,INACTIVE,INACTIVE,2001-10-20,TERMINATED PAROLE,BA,REGULAR POPULATION RPOP,MALE,WHITE,1952-11-18,NORTH CAROLINA,EUROPEAN/N.AM./AUSTR,BORN IN U.S.,MURDER SECOND DEGREE,AC,1974-07-12,MURDER SECOND DEGREE,1974,-4.0,1.0
151,206,AA,1977-11-15,ARMED ROBBERY,EXPIRED,1978-07-03,0001-01-01,INACTIVE,INACTIVE,1985-03-10,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1953-10-12,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,ARMED ROBBERY,AB,1977-11-15,ARMED ROBBERY,1977,-1.0,1.0
307,382,AD,1975-05-22,ASSAULT,EXPIRED,1976-01-30,0001-01-01,INACTIVE,INACTIVE,1991-10-25,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,BLACK,1941-06-20,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,ASSAULT,AE,1975-04-18,RECEIVING STOLEN VEHICLE,1975,-1.0,1.0
670,678,AB,1976-09-22,AWDW,EXPIRED,1978-09-25,1978-09-24,INACTIVE,INACTIVE,1994-08-03,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1953-02-22,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,AWDW,AC,1976-09-22,AWDW,1976,-2.0,1.0
1517,1508,AB,1976-08-26,BURGLARY 1ST DEGREE,EXPIRED,1977-08-31,0001-01-01,INACTIVE,INACTIVE,2004-10-12,EXPIRATION,BA,REGULAR POPULATION RPOP,MALE,BLACK,1956-02-27,,UNKNOWN,BORN IN U.S.,BURGLARY 1ST DEGREE,BA,1976-08-25,RAPE SECOND DEGREE,1976,-1.0,1.0
1619,1587,AA,1976-04-22,LARCENY (OVER $200),EXPIRED,1977-02-08,0001-01-01,INACTIVE,INACTIVE,1980-05-06,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,BLACK,1959-03-15,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,LARCENY (OVER $200),AB,1976-02-27,LARCENY (OVER $200),1976,-1.0,1.0
2667,2748,AA,1974-03-28,ROBBERY W/DANGEROUS WEAPON,EXPIRED,1975-04-25,0001-01-01,INACTIVE,INACTIVE,1985-02-04,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,BLACK,1953-11-11,,UNKNOWN,BORN IN U.S.,ROBBERY W/DANGEROUS WEAPON,AB,1974-03-28,ROBBERY W/DANGEROUS WEAPON,1974,-1.0,1.0
3150,3186,AC,1973-11-21,SAFECRACKING/SAFE ROBBERY,EXPIRED,1974-12-17,0001-01-01,ACTIVE,ACTIVE,2019-04-15,RECEIVED FROM,BA,REGULAR POPULATION RPOP,MALE,WHITE,1946-10-30,NEW YORK,EUROPEAN/N.AM./AUSTR,BORN IN U.S.,SAFECRACKING/SAFE ROBBERY,AD,1973-06-22,SAFECRACKING/SAFE ROBBERY,1973,-1.0,1.0


In [27]:
dataset_flag_small = dataset_flag.head(1000)
dataset_flag_small.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
0,4,AA,1983-07-12,SELL SCHEDULE II,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,SELL SCHEDULE II,0,NaT,0,0,,0.0
1,6,AA,1973-01-30,WORTHLESS CHECK,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,1973,0.0,1.0
2,6,AB,1973-04-11,WORTHLESS CHECK,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,0,NaT,0,0,,0.0
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1993,3.0,0.0
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1995,1.0,1.0


In [46]:
conn = sqlite3.connect(':memory:')
disc_infraction.to_sql('disc_infraction', conn, index=False)
dataset_flag_small.to_sql('dataset_flag_small', conn, index=False)

query = '''
        SELECT INMATE_DOC_NUMBER as ID,
                DISCIPLINARY_INFRACTION_DATE,
                COMMITMENT_PREFIX,
                EARLIEST_SENTENCE_EFFECTIVE_DT,
                END_DATE,
                COUNT(DISCIPLINARY_INFRACTION_DATE) as INFRACTION_PER_SENT
        FROM disc_infraction A
        INNER JOIN dataset_flag_small B
        WHERE A.INMATE_DOC_NUMBER = B.ID
        AND A.DISCIPLINARY_INFRACTION_DATE >= B.EARLIEST_SENTENCE_EFFECTIVE_DT
        AND A.DISCIPLINARY_INFRACTION_DATE <= B.END_DATE
        GROUP BY INMATE_DOC_NUMBER, COMMITMENT_PREFIX
        ;
        
        '''
test = qd.query_db_notebook(conn,query)

conn.close


<function Connection.close>

In [49]:
#dataset_flag_small.head(20)

In [50]:
#disc_infraction.head(20)

In [51]:
test.head(40)

Unnamed: 0,ID,DISCIPLINARY_INFRACTION_DATE,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,END_DATE,INF_PER_COM
0,10,1975-07-26,AA,1975-06-11 00:00:00,1977-03-17 00:00:00,11
1,10,1975-07-26,AB,1975-06-11 00:00:00,1983-06-27 00:00:00,15
2,26,1983-09-26,AA,1983-07-18 00:00:00,1984-09-09 00:00:00,3
3,31,1974-05-11,AA,1971-08-13 00:00:00,1981-11-02 00:00:00,2
4,33,1985-05-27,AD,1984-02-02 00:00:00,1990-05-26 00:00:00,2
5,33,1997-09-24,BA,1995-06-22 00:00:00,2009-09-02 00:00:00,9
6,35,1992-06-03,BA,1991-09-16 00:00:00,1997-05-20 00:00:00,13
7,35,1999-06-17,BB,1999-03-03 00:00:00,1999-12-28 00:00:00,10
8,35,2001-07-18,BC,2001-01-16 00:00:00,2011-11-29 00:00:00,164
9,37,2010-01-12,BA,2009-10-05 00:00:00,2010-02-19 00:00:00,5


In [54]:
# Divide infractions by # of sentences if there are dups on ID / DISCIPLINARY_INFRACTION_DATE
    # might indicate concurrent sentences
count_dups = test.groupby(['ID','DISCIPLINARY_INFRACTION_DATE'])["ID"].count().reset_index(name="count")
test = test.merge(count_dups, how = 'left')
#test.groupby(['ID','DISCIPLINARY_INFRACTION_DATE']).transform('count')

In [55]:
test.head()

Unnamed: 0,ID,DISCIPLINARY_INFRACTION_DATE,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,END_DATE,INF_PER_COM,count
0,10,1975-07-26,AA,1975-06-11 00:00:00,1977-03-17 00:00:00,11,2
1,10,1975-07-26,AB,1975-06-11 00:00:00,1983-06-27 00:00:00,15,2
2,26,1983-09-26,AA,1983-07-18 00:00:00,1984-09-09 00:00:00,3,1
3,31,1974-05-11,AA,1971-08-13 00:00:00,1981-11-02 00:00:00,2,1
4,33,1985-05-27,AD,1984-02-02 00:00:00,1990-05-26 00:00:00,2,1


In [56]:
test['INF_PER_COM'] = round(test['INF_PER_COM']/test['count'])

In [57]:
test.head()

Unnamed: 0,ID,DISCIPLINARY_INFRACTION_DATE,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,END_DATE,INF_PER_COM,count
0,10,1975-07-26,AA,1975-06-11 00:00:00,1977-03-17 00:00:00,6.0,2
1,10,1975-07-26,AB,1975-06-11 00:00:00,1983-06-27 00:00:00,8.0,2
2,26,1983-09-26,AA,1983-07-18 00:00:00,1984-09-09 00:00:00,3.0,1
3,31,1974-05-11,AA,1971-08-13 00:00:00,1981-11-02 00:00:00,2.0,1
4,33,1985-05-27,AD,1984-02-02 00:00:00,1990-05-26 00:00:00,2.0,1


In [23]:
# Part E - querying additional features
conn = create_connection(config.database_name)
dataset_flag.to_sql('dataset_AB', conn, index=False)
query_disc_infraction = '''
                    SELECT INMATE_DOC_NUMBER as ID,
                            COUNT(*) as DISCIPLINARY_INFRACTION_COUNT                            
                    FROM INMT9CF1
                    GROUP BY INMATE_DOC_NUMBER;
                    '''
query_disc_infraction = '''
                    SELECT *                           
                    FROM INMT9CF1
                    LIMIT 100;
                    '''


disc_infraction = qd.query_db_notebook(conn,query_disc_infraction)

conn.close


<function Connection.close>

In [None]:
# Part F
# Step 5
# Hold out active senteces
print("Hold out active sentences")
active_sentences = dataset_flag[(dataset_flag['INMATE_ADMIN_STATUS_CODE']=='ACTIVE') & (dataset_flag['NextPrefix']==0)]
print("Size of active sentences dataset: ",active_sentences.shape[0])

# Step 6
# drop observations with no recidivism flag (this will also drop active sentences, but we've already separated those)
print("Drop observations with no recidivism flag (this will also drop active sentences, but we've already separated those)")
print("Additional observations dropped are mostly of those who died in prison and therefore wont have a recidivate flag")
dataset_flag = dataset_flag[(dataset_flag['Recidivate'].notnull())]
print("Size of remaining dataset: ",dataset_flag.shape[0])

# Step 7
# Bring in coded offenses - sanity check

# this merges our coded offenses onto "most serious offense" to check how much coverage
# our variable is giving us. however, this not what we ultimately want - in the end, we want
# our codes to be merged onto "nextOffense" - i.e., the offense code for the next offense 
# someone committed that resulted in re-incarceration
# NextOffense can be missing for 2 reasons: because most serious offense is missing, or because
# the individual did not recidivate. after merging our codes onto "NextOffense", we can replace
# "Decided Category" with 0 if recidivism = 0, and leave it as NA otherwise
#dataset_with_offenses_test = dataset_flag.merge(coded_offenses, how='left', left_on='MOST_SERIOUS_OFFENSE_CODE', right_on='Primary offense code')

# Step 8 and 9
# Now, merge on coded offenses onto NextOffense, turn Decided Category, More Lenient, and more harsh = 0 if recidivism = 0
print("Merging on our coded categories")
dataset_with_offenses = dataset_flag.merge(coded_offenses, how='left', left_on='NextOffense', right_on='Primary offense code')
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'Decided Category'] = 0
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'More lenient'] = 0
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'More harsh'] = 0

print("% missing decided category",dataset_with_offenses['Decided Category'].isnull().sum()/dataset_with_offenses.shape[0])

# Drop those missing decided category
dataset_with_offenses = dataset_with_offenses[(dataset_with_offenses['Decided Category'].notnull())]
print("Final dataset size: " , dataset_with_offenses.shape[0])


## Charmaine's WIP code

To-do:
1. Check the number of unique primary offense codes for every combination of ID and COMMITMENT_PREFIX
2. Check what is going on with the 9999999 values in MINIMUM_SENTENCE_LENGTH
3. Spot check IDs in court commitment data to compare most serious offense

In [336]:
# Now that the most serious offenses have been selected, i'm rerunning the code using the output of the 
# sql query so eventually we wont need intermediate datasets
dataset_A = data.copy()
dataset_B = sent_comp_small.copy()

In [337]:
# Import CSVs
import pandas as pd

#dataset_A = pd.read_csv('../data/datasetA_court_sentcomp.csv')
#dataset_B = pd.read_csv('../data/datasetB_sentcomponent_only_incarcerated.csv')

In [338]:
dataset_A.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT
0,4,AA,1983-07-12,,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,
1,6,AA,1973-01-30,,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
2,6,AB,1973-04-11,,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
3,8,AA,1990-04-09,,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
4,8,AB,1993-08-30,,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,


In [339]:
dataset_A.columns

Index(['ID', 'COMMITMENT_PREFIX', 'EARLIEST_SENTENCE_EFFECTIVE_DT',
       'MOST_SERIOUS_OFFENSE_CODE', 'INMATE_COMPUTATION_STATUS_FLAG',
       'END_DATE', 'PROJ_END_DATE', 'INMATE_RECORD_STATUS_CODE',
       'INMATE_ADMIN_STATUS_CODE', 'DATE_OF_LAST_INMATE_MOVEMENT',
       'TYPE_OF_LAST_INMATE_MOVEMENT', 'CURRENT_COMMITMENT_PREFIX',
       'CONTROL_STATUS', 'GENDER', 'RACE', 'BIRTH_DATE', 'STATE_BORN',
       'ETHNICITY', 'CITIZENSHIP', 'DISCIPLINARY_INFRACTION_COUNT'],
      dtype='object')

Note that dataset A is unique on `ID` and `COMMITMENT_PREFIX`.

In [340]:
dataset_B.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,SENTENCE_COMPONENT_NUMBER,PRIMARY_OFFENSE_CODE,PRIMARY_FELONYMISDEMEANOR_CD,SENTENCING_PENALTY_CLASS_CODE,PRIOR_RECORD_LEVEL_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,SENTENCE_TYPE_CODE,COUNTY_OF_CONVICTION_CODE
0,4,AA,1,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
1,4,AA,2,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
2,6,AA,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,60,90,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
3,6,AB,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
4,6,AB,2,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND


In [341]:
dataset_B.columns

Index(['ID', 'COMMITMENT_PREFIX', 'SENTENCE_COMPONENT_NUMBER',
       'PRIMARY_OFFENSE_CODE', 'PRIMARY_FELONYMISDEMEANOR_CD',
       'SENTENCING_PENALTY_CLASS_CODE', 'PRIOR_RECORD_LEVEL_CODE',
       'MINIMUM_SENTENCE_LENGTH', 'MAXIMUM_SENTENCE_LENGTH',
       'SENTENCE_TYPE_CODE', 'COUNTY_OF_CONVICTION_CODE'],
      dtype='object')

In [342]:
dataset_A.shape

(903181, 20)

In [343]:
dataset_B.shape

(1728836, 11)

In [344]:
# Check how many unique ID and COMMITMENT_PREFIX combinations there are
grouped = dataset_B.groupby(['ID', 'COMMITMENT_PREFIX'])
total_combinations = grouped.ngroups
print(total_combinations)

891122


There are 891,122 unique ID and COMMITMENT_PREFIX combinations. This is the total we're working with. Note that this is smaller than the number of rows in dataset A

In [345]:
# Find the ID / COMMITMENT_PREFIX combinations that have the maximum MINIMUM_SENTENCE_LENGTH
# We will use these combinations to filter dataset_B for PRIMARY_OFFENSE_CODE
# Note: These might not be unique

min_sentence = pd.DataFrame(dataset_B.groupby(['ID', 'COMMITMENT_PREFIX'])['MINIMUM_SENTENCE_LENGTH'].max().reset_index(name='max_min'))
min_sentence.head(10)

Unnamed: 0,ID,COMMITMENT_PREFIX,max_min
0,4,AA,0
1,6,AA,60
2,6,AB,30
3,8,AA,10000
4,8,AB,0
5,8,BA,0
6,10,AA,0
7,10,AB,200000
8,14,AA,1800
9,14,AB,600


In [346]:
# Check to make sure we're not accidentally dropping any rows
min_sentence.groupby(['ID', 'COMMITMENT_PREFIX']).ngroups

891122

In [347]:
# Filter dataset_B to only these rows
filter_tuples = [tuple(x) for x in min_sentence.to_numpy()]

filtered_B = dataset_B[dataset_B[['ID', 'COMMITMENT_PREFIX', 'MINIMUM_SENTENCE_LENGTH']].apply(tuple, axis=1).isin(filter_tuples)]
filtered_B.head(10)

Unnamed: 0,ID,COMMITMENT_PREFIX,SENTENCE_COMPONENT_NUMBER,PRIMARY_OFFENSE_CODE,PRIMARY_FELONYMISDEMEANOR_CD,SENTENCING_PENALTY_CLASS_CODE,PRIOR_RECORD_LEVEL_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,SENTENCE_TYPE_CODE,COUNTY_OF_CONVICTION_CODE
0,4,AA,1,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
1,4,AA,2,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
2,6,AA,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,60,90,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
3,6,AB,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
4,6,AB,2,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
5,6,AB,3,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
6,6,AB,4,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
7,6,AB,5,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
8,6,AB,6,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
9,6,AB,7,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND


In [348]:
# Check to make sure we're not accidentally dropping any rows
filtered_B.groupby(['ID', 'COMMITMENT_PREFIX']).ngroups

891122

Based on the sample of observations above, there are quite a few ties, but it might be that they are all of the same PRIMARY_OFFENSE_CODE (e.g., rows 3-9). 

I will check how many unique primary offense codes there are for every ID and COMMITMENT_PREFIX combination.

In [349]:
count_nunique_offenses = pd.DataFrame(filtered_B.groupby(['ID', 'COMMITMENT_PREFIX'])['PRIMARY_OFFENSE_CODE'].nunique().reset_index(name='count'))
count_nunique_offenses['count'].describe()

count    891122.000000
mean          1.199647
std           0.547120
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          12.000000
Name: count, dtype: float64

In [350]:
count_nunique_offenses.head(5)

Unnamed: 0,ID,COMMITMENT_PREFIX,count
0,4,AA,1
1,6,AA,1
2,6,AB,1
3,8,AA,1
4,8,AB,1


In [351]:
# Pull out the ID / COMMITMENT_PREFIX combinations that are unique on max(MINIMUM_SENTENCE_LENGTH)
unique_min_filter = [tuple(x) for x in count_nunique_offenses[count_nunique_offenses['count'] == 1][['ID', 'COMMITMENT_PREFIX']].to_numpy()]
nonunique_min_filter = [tuple(x) for x in count_nunique_offenses[count_nunique_offenses['count'] != 1][['ID', 'COMMITMENT_PREFIX']].to_numpy()]

cols_to_keep = ['ID', 'COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE','MINIMUM_SENTENCE_LENGTH', 'MAXIMUM_SENTENCE_LENGTH']

filtered_B_min_unique = filtered_B[filtered_B[['ID','COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(unique_min_filter)][cols_to_keep]
filtered_B_min_unique.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
0,4,AA,SELL SCHEDULE II,0,30000
1,4,AA,SELL SCHEDULE II,0,30000
2,6,AA,WORTHLESS CHECK,60,90
3,6,AB,WORTHLESS CHECK,30,0
4,6,AB,WORTHLESS CHECK,30,0


In [352]:
# Drop duplicate rows from filtered_B_min_unique (we know that they all have the same PRIMARY_OFFENSE_CODE)
# Note: This method keeps the first observation, but again, this shouldn't matter
filtered_B_min_unique.drop_duplicates(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'],inplace=True)
filtered_B_min_unique.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
0,4,AA,SELL SCHEDULE II,0,30000
2,6,AA,WORTHLESS CHECK,60,90
3,6,AB,WORTHLESS CHECK,30,0
30,8,AA,DWI DRIVING WHILE IMPAIRED,10000,10000
31,8,AB,HABITUAL IMPAIRED DRIVING,0,10000


In [353]:
filtered_B_min_unique.shape

(756670, 5)

In [354]:
# Check if there are duplicates
filtered_B_min_unique[filtered_B_min_unique.duplicated(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'])]

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH


In [355]:
grouped_by_min = filtered_B_min_unique.groupby(['ID', 'COMMITMENT_PREFIX'])
print("Number of ID / COMMITMENT_PREFIX combinations that are unique on max(MINIMUM_SENTENCE_LENGTH):", 
      grouped_by_min.ngroups)

Number of ID / COMMITMENT_PREFIX combinations that are unique on max(MINIMUM_SENTENCE_LENGTH): 756669


In [356]:
756669 / total_combinations

0.849119424725234

In [357]:
total_combinations - 756669

134453

#### Main takeaway:
84 percent (756,669) of `ID` and `COMMITMENT_PREFIX` combinations have a single PRIMARY_OFFENSE_CODE for the `max(MINIMUM_SENTENCE_LENGTH)`. The remaining 16 percent (134,453) have more than one `PRIMARY_OFFENSE_CODE` after filtering to `max(MINIMUM_SENTENCE_LENGTH)`. It is these 16 percent that we need to further filter by `MAXIMUM_SENTENCE_LENGTH` and then, if needed, by random selection.

In [358]:
filtered_B_min_nonunique = filtered_B[filtered_B[['ID','COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(nonunique_min_filter)][cols_to_keep]
filtered_B_min_nonunique.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
41,19,AA,MISD B&E,2400,2400
42,19,AA,LARCENY (OVER $200),2400,2400
45,20,BA,FELONY B&E,0,100000
46,20,BA,B & E & L,0,100000
47,20,BA,FELONY B&E,0,60000


We have some odd observations where the primary offense is a felony, but MINIMUM_SENTENCE_LENGTH is 0 and MAXIMUM_SENTENCE_LENGTH > 0 (e.g., index 45-47)

In [359]:
grouped_by_max = filtered_B_min_nonunique.groupby(['ID', 'COMMITMENT_PREFIX'])
print("Number of ID / COMMITMENT_PREFIX combinations that are NOT unique on max(MINIMUM_SENTENCE_LENGTH):", 
      grouped_by_max.ngroups)

Number of ID / COMMITMENT_PREFIX combinations that are NOT unique on max(MINIMUM_SENTENCE_LENGTH): 134453


In [360]:
# Check the numbers are adding up
print("Do the numbers add up?", 134453 + 756669 == total_combinations)

Do the numbers add up? True


There are 134,453 `ID` and `COMMITMENT_PREFIX` combinations that have more than one unique PRIMARY_OFFENSE_CODE after filtering by `MINIMUM_SENTENCE_LENGTH`. Filter by `MAXIMUM_SENTENCE_LENGTH`...

In [361]:
find_max_max = pd.DataFrame(filtered_B_min_nonunique.groupby(['ID', 'COMMITMENT_PREFIX'])['MAXIMUM_SENTENCE_LENGTH'].max().reset_index(name='max_max'))
find_max_max.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,max_max
0,19,AA,2400
1,20,BA,100000
2,33,AD,180000
3,35,BA,80000
4,49,AC,60000


In [362]:
by_max_tuples = [tuple(x) for x in find_max_max.to_numpy()]
filtered_B_max = filtered_B_min_nonunique[filtered_B_min_nonunique[['ID', 'COMMITMENT_PREFIX', 'MAXIMUM_SENTENCE_LENGTH']].apply(tuple, axis=1).isin(by_max_tuples)]
filtered_B_max.head()


Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
41,19,AA,MISD B&E,2400,2400
42,19,AA,LARCENY (OVER $200),2400,2400
45,20,BA,FELONY B&E,0,100000
46,20,BA,B & E & L,0,100000
74,33,AD,ROBBERY W/DANGEROUS WEAPON,0,180000


In [364]:
count_offenses_by_max = pd.DataFrame(filtered_B_max.groupby(['ID', 'COMMITMENT_PREFIX'])['PRIMARY_OFFENSE_CODE'].nunique().reset_index(name='count'))
count_offenses_by_max.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,count
0,19,AA,2
1,20,BA,2
2,33,AD,1
3,35,BA,1
4,49,AC,1


In [365]:
# Pull out the ID and COMMITMENT_PREFIX tuples in FILTERED_B_MT1 where there is a unique PRIMARY_OFFENSE_CODE
# after looking at the maximum of MAXIMUM_SENTENCE_LENGTH
unique_max = count_offenses_by_max[count_offenses_by_max['count'] == 1][['ID', 'COMMITMENT_PREFIX']]
unique_max_filter = [tuple(x) for x in unique_max.to_numpy()]

filtered_B_max_unique = filtered_B_max[filtered_B_max[['ID', 'COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(unique_max_filter)]
filtered_B_max_unique.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
74,33,AD,ROBBERY W/DANGEROUS WEAPON,0,180000
83,35,BA,FELONY B&E,0,80000
103,49,AC,INDECENT LIBERTY W/CHILD,0,60000
121,64,AA,MISD B&E,0,20000
134,71,BA,POSSESS WITS SCHEDULE II,0,100000


In [366]:
# Drop duplicate rows from filtered_B_max_unique (we know that they all have the same PRIMARY_OFFENSE_CODE)
# Note: This method keeps the first observation, but again, this shouldn't matter
filtered_B_max_unique.drop_duplicates(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'],inplace=True)
filtered_B_max_unique.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
74,33,AD,ROBBERY W/DANGEROUS WEAPON,0,180000
83,35,BA,FELONY B&E,0,80000
103,49,AC,INDECENT LIBERTY W/CHILD,0,60000
121,64,AA,MISD B&E,0,20000
134,71,BA,POSSESS WITS SCHEDULE II,0,100000


In [367]:
filtered_B_max_unique.shape

(67052, 5)

In [368]:
# Check if there are duplicates
filtered_B_max_unique[filtered_B_max_unique.duplicated(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'])]

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH


In [369]:
grouped_by_max = filtered_B_max_unique.groupby(['ID', 'COMMITMENT_PREFIX'])
print("Number of ID / COMMITMENT_PREFIX combinations that are unique on max(MAXIMUM_SENTENCE_LENGTH):", 
      grouped_by_max.ngroups)

Number of ID / COMMITMENT_PREFIX combinations that are unique on max(MAXIMUM_SENTENCE_LENGTH): 67052


In [370]:
# Check how much ground the first two filters cover:
n_min_unique = 756669
n_max_unique = 67052
print("Combinations so far:", n_min_unique + n_max_unique)
(n_min_unique + n_max_unique) / total_combinations

Combinations so far: 823721


0.9243638918127933

#### Now we have the first two of three dataframes we need, which gets us 92 percent of the combinations:
1. `ID` / `COMMITMENT_PREFIX` combinations that are unique on max`(MINIMUM_SENTENCE_LENGTH)`:
filtered_B_min_unique, with 756,669 combinations
2. `ID` / `COMMITMENT_PREFIX` combinations that are NOT unique on max`(MINIMUM_SENTENCE_LENGTH)` but are unique on max`(MAXIMUM_SENTENCE_LENGTH)`: filtered_B_max_unique, with 67,052 combinations

The last one is:
3. `ID` / `COMMITMENT_PREFIX` combinations that are NOT unique on max`(MINIMUM_SENTENCE_LENGTH)` or on max`(MAXIMUM_SENTENCE_LENGTH)`, so we choose randomly from the `PRIMARY_OFFENSE_CODES` remaining after the second filter.

But before we run the final filter, concatenate the first two dataframes and merge with dataset A.

In [371]:
concat_1_2 = filtered_B_min_unique.append(filtered_B_max_unique)
concat_1_2.shape

(823722, 5)

In [372]:
check_primary_offense_match = pd.merge(concat_1_2, dataset_A, on=['ID','COMMITMENT_PREFIX'])
check_primary_offense_match.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT
0,4,AA,SELL SCHEDULE II,0,30000,1983-07-12,,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,
1,6,AA,WORTHLESS CHECK,60,90,1973-01-30,,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
2,6,AB,WORTHLESS CHECK,30,0,1973-04-11,,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
3,8,AA,DWI DRIVING WHILE IMPAIRED,10000,10000,1990-04-09,,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
4,8,AB,HABITUAL IMPAIRED DRIVING,0,10000,1993-08-30,,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,


In [373]:
# For observations where MOST_SERIOUS_OFFENSE is not NaN, compare with PRIMARY_OFFENSE_CODE
compare_df = check_primary_offense_match.loc[check_primary_offense_match['MOST_SERIOUS_OFFENSE_CODE'].notna()][['PRIMARY_OFFENSE_CODE', 'MOST_SERIOUS_OFFENSE_CODE']]
compare_df['SAME_CODE'] = (compare_df['PRIMARY_OFFENSE_CODE'] == compare_df['MOST_SERIOUS_OFFENSE_CODE'])
compare_df.sample(10)

Unnamed: 0,PRIMARY_OFFENSE_CODE,MOST_SERIOUS_OFFENSE_CODE,SAME_CODE
447680,POSSESS WITS SCHEDULE II,POSSESS WITS SCHEDULE II,True
401319,FELONY B&E,MISD B&E,False
665564,TRAFFICKING SCHEDULE II,TRAFFICKING SCHEDULE II,True
797995,FELONY B&E,MISD B&E,False
471796,LARCENY,LARCENY,True
606257,POSSESS SCHEDULE I,POSSESS SCHEDULE I,True
482044,DRUG PARA - USE/POSSESS,DRUG PARA - USE/POSSESS,True
617199,SELL SCHEDULE I,SELL SCHEDULE I,True
622603,POSSESS WITS SCHEDULE II,POSSESS WITS SCHEDULE II,True
454226,COMMON LAW ROBBERY,COMMON LAW ROBBERY,True


In [374]:
(compare_df['SAME_CODE'].sum()) / compare_df.shape[0]

0.9323824972823901

In [375]:
compare_df.shape

(534477, 3)

#### This is good news.

By just filtering on `MINIMUM_SENTENCE_LENGTH` and `MAXIMUM_SENTENCE_LENGTH`, we got a 93 percent match on `PRIMARY_OFFENSE_CODE` and `MOST_SERIOUS_OFFENSE_CODE`.

In [376]:
# Count number of observations with filtered primary offenses (without having to choose randomly)
concat_1_2.shape

(823722, 5)

In [377]:
total_obs = dataset_B.shape[0]
concat_1_2.shape[0] / total_combinations

0.9243650139935946

Filtering on MINIMUM_SENTENCE_LENGTH and MAXIMUM_SENTENCE_LENGTH gets us 92 percent of the way there: now to create the last dataset...

In [378]:
# Apply last filter to get the ID & COMMITMENT_PREFIX combinations that are not unique on either SENTENCE_LENGTH column
nonunique_max = count_offenses_by_max[count_offenses_by_max['count'] != 1][['ID', 'COMMITMENT_PREFIX']]
nonunique_max_filter = [tuple(x) for x in nonunique_max.to_numpy()]

filtered_B_max_nonunique = filtered_B_max[filtered_B_max[['ID', 'COMMITMENT_PREFIX']].apply(tuple, axis=1).isin(nonunique_max_filter)]
filtered_B_max_nonunique.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH
41,19,AA,MISD B&E,2400,2400
42,19,AA,LARCENY (OVER $200),2400,2400
45,20,BA,FELONY B&E,0,100000
46,20,BA,B & E & L,0,100000
140,74,AA,SPEEDING FROM POLICE,0,20000


In [379]:
# How many ID and COMMITMENT PREFIXES to sample?
filtered_B_max_nonunique.groupby(['ID', 'COMMITMENT_PREFIX']).ngroups

67401

In [380]:
seed = 1000

to_sample = filtered_B_max_nonunique.groupby(['ID', 'COMMITMENT_PREFIX'])
sampled = to_sample.apply(lambda x: x.sample(n=1, random_state=seed))

In [381]:
sampled = sampled.reset_index(drop=True)
n_min_sampled = sampled.shape[0]

(n_min_unique + n_max_unique + n_min_sampled) / total_combinations
# Why isn't this 100 percent? Did we drop one along the way?

1.0

In [382]:
# Check for duplicates
sampled[sampled.duplicated(subset=['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE'])]

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH


#### One last check: What are the offenses associated with 9999999 minimum sentence lengths?

Based on the grouped dataframe below, most of the offenses are high-risk felonies, so likely equivalent to a life sentence.

In [383]:
pd.DataFrame(dataset_B[dataset_B['MINIMUM_SENTENCE_LENGTH'] == 9999999].groupby('PRIMARY_OFFENSE_CODE').size().reset_index(name='count'))

Unnamed: 0,PRIMARY_OFFENSE_CODE,count


I also checked the MAXIMUM_SENTENCE_LENGTH column to see what that looked like: either 9999999 or 0, so that squares with the hypothesis that these are life sentences. 

In [384]:
dataset_B[dataset_B['MINIMUM_SENTENCE_LENGTH'] == 9999999]['MAXIMUM_SENTENCE_LENGTH'].unique()

array([], dtype=object)

#### Steps to get here:
* Choosing observations (ID and COMMITMENT_PREFIX combination) based on sentence length:
    * Choose the observation that has the maximum MINIMUM_SENTENCE_LENGTH
    * If there is a tie on MINIMUM_SENTENCE_LENGTH, select observation with the maximum MAXIMUM_SENTENCE_LENGTH 
    * If there is a tie on MAXIMUM_SENTENCE_LENGTH, then choose randomly
* Do not change 9999999 values to NA, since these likely correspond to life-sentences. Check against court commitment data.

In [417]:
# Pulling it all together - get most serious offense code for each id and commitment prefix
#concat_1_2_3 = concat_1_2.append(sampled)
# note - adding in sampled lowered our match rate to 89% so we have decided against
# doing that - will only use the 820k obs we get from using just minimum/maximum length of sentence
# which gives us a 93% match with existing most serious offense code
check_primary_offense_match = pd.merge(concat_1_2, dataset_A, on=['ID','COMMITMENT_PREFIX'])
check_primary_offense_match.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT
0,4,AA,SELL SCHEDULE II,0,30000,1983-07-12,,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,
1,6,AA,WORTHLESS CHECK,60,90,1973-01-30,,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
2,6,AB,WORTHLESS CHECK,30,0,1973-04-11,,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
3,8,AA,DWI DRIVING WHILE IMPAIRED,10000,10000,1990-04-09,,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
4,8,AB,HABITUAL IMPAIRED DRIVING,0,10000,1993-08-30,,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,


In [418]:
# For observations where MOST_SERIOUS_OFFENSE is not NaN, compare with PRIMARY_OFFENSE_CODE
compare_df = check_primary_offense_match.loc[check_primary_offense_match['MOST_SERIOUS_OFFENSE_CODE'].notna()][['PRIMARY_OFFENSE_CODE', 'MOST_SERIOUS_OFFENSE_CODE']]
compare_df['SAME_CODE'] = (compare_df['PRIMARY_OFFENSE_CODE'] == compare_df['MOST_SERIOUS_OFFENSE_CODE'])
compare_df.sample(10)

Unnamed: 0,PRIMARY_OFFENSE_CODE,MOST_SERIOUS_OFFENSE_CODE,SAME_CODE
307565,FAIL TO REGISTER (SEX OFFENDER,FAIL TO REGISTER (SEX OFFENDER,True
617938,POSSESSION OF FIREARM BY FELON,POSSESSION OF FIREARM BY FELON,True
119570,POSSESSING STOLEN GOODS,POSSESSING STOLEN GOODS,True
590105,HABITUAL IMPAIRED DRIVING,HABITUAL IMPAIRED DRIVING,True
258954,ARSON 1ST DEGREE,ARSON 1ST DEGREE,True
445027,POSSESSION OF FIREARM BY FELON,POSSESSION OF FIREARM BY FELON,True
738828,CHILD ABUSE ISI,CHILD ABUSE ISI,True
697480,COMMON LAW ROBBERY,COMMON LAW ROBBERY,True
743292,COMMON LAW ROBBERY,COMMON LAW ROBBERY,True
459960,SELL SCHEDULE II,SELL SCHEDULE II,True


In [419]:
(compare_df['SAME_CODE'].sum()) / compare_df.shape[0]

0.9323824972823901

In [420]:
# Final merged version of datasets A and B
dataset_with_most_serious = check_primary_offense_match
dataset_with_most_serious.shape

(823711, 23)

In [421]:
dataset_with_most_serious.sample(10)

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT
227745,247473,AA,FORGERY,0,20000,1985-10-14,,EXPIRED,1987-05-12,1987-05-12,INACTIVE,INACTIVE,2013-09-20,EXPIRATION,BC,REGULAR POPULATION RPOP,FEMALE,WHITE,1961-06-29,NORTH CAROLINA,EUROPEAN/N.AM./AUSTR,BORN IN U.S.,4.0
254398,276042,AA,POSSESSION OF FIREARM BY FELON,0,20000,1986-02-04,,EXPIRED,1987-06-19,1987-06-19,INACTIVE,INACTIVE,1999-10-29,EXPIRATION,BB,REGULAR POPULATION RPOP,MALE,BLACK,1956-04-04,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
403676,440652,AB,RECEIVING STOLEN GOODS,0,601,1983-05-06,RECEIVING STOLEN GOODS,EXPIRED,1983-07-22,1983-07-21,INACTIVE,INACTIVE,2011-05-19,TERMINATED PAROLE,BD,REGULAR POPULATION RPOP,MALE,BLACK,1959-03-03,NORTH CAROLINA,AFRICAN,BORN IN U.S.,31.0
796379,342558,BC,WILL/WANT INJ REAL PROPERTY,0,400,2004-12-14,WILL/WANT INJ REAL PROPERTY,EXPIRED,2005-05-07,2005-05-07,INACTIVE,INACTIVE,2006-05-24,EXPIRATION,BD,REGULAR POPULATION RPOP,MALE,BLACK,1976-06-09,NORTH CAROLINA,AFRICAN,BORN IN U.S.,20.0
546419,692182,BC,POSSESSION OF FIREARM BY FELON,10300,20300,2019-01-23,POSSESSION OF FIREARM BY FELON,POST REL,2020-03-01,2020-03-01,PAROLED,INACTIVE,2020-03-01,PAROLE/RETURN TO PAR,BC,REST.HOUS. ADMINISTRATIVE RHAP,MALE,OTHER,1981-07-21,,HISPANIC/LATINO,NATURALIZED,5.0
290168,315819,AA,DRIVING UNDER INFLUENCE (DUI),600,0,1973-08-07,,EXPIRED,1973-12-13,1973-12-13,INACTIVE,INACTIVE,1973-12-13,EXPIRATION,,REGULAR POPULATION RPOP,MALE,BLACK,1929-05-05,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,
470164,544410,BC,POSSESS WITS SCHEDULE VI,400,500,2010-03-08,POSSESS WITS SCHEDULE VI,EXPIRED,2010-07-16,2010-07-16,ACTIVE,ACTIVE,2020-03-17,RECEIVED FROM,BD,REGULAR POPULATION RPOP,MALE,BLACK,1977-12-25,NORTH CAROLINA,AFRICAN,BORN IN U.S.,16.0
620870,872118,BA,BURGLARY 1ST DEGREE,50400,70200,2007-07-10,BURGLARY 1ST DEGREE,EXPIRED,2012-05-15,2012-05-15,INACTIVE,INACTIVE,2013-02-09,TERMINATED PAROLE,BA,INTENSIVE CONTROL ICON,MALE,BLACK,1988-02-17,KENTUCKY,AFRICAN,BORN IN U.S.,38.0
87199,94454,BE,FELONY B&E,1100,10200,2004-07-27,FELONY B&E,EXPIRED,2006-02-17,2006-02-17,INACTIVE,INACTIVE,2008-09-19,EXPIRATION,BF,REGULAR POPULATION RPOP,MALE,BLACK,1965-12-12,NORTH CAROLINA,AFRICAN,BORN IN U.S.,15.0
570566,745518,BA,SELL SCHEDULE II,10100,10400,2002-10-21,SELL SCHEDULE II,EXPIRED,2003-11-23,2003-11-23,INACTIVE,INACTIVE,2009-03-17,EXPIRATION,BC,REGULAR POPULATION RPOP,MALE,BLACK,1979-07-16,NORTH CAROLINA,AFRICAN,BORN IN U.S.,3.0


In [422]:
# Create a CSV
dataset_with_most_serious.to_csv('../data/most_serious_offenses.csv', index=False)

### Merge on dataset with most serious offense to dataset A before we do the rest of the processing on dataset A

In [None]:
# Part C: Puts together dataset A and B
datasetB_primary_offense = dataset_with_most_serious.loc[:,['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE']]

print(datasetB_primary_offense.shape)

# merging on datasetA (court commitment + sentence computation) with datasetB ("self constructed" primary offenses from
# sentence component)
data_A_B = data.merge(datasetB_primary_offense, on = ['ID','COMMITMENT_PREFIX'], how='left') 


In [423]:
datasetB_primary_offense = dataset_with_most_serious.loc[:,['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE']]

In [424]:
datasetB_primary_offense.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,PRIMARY_OFFENSE_CODE
0,4,AA,SELL SCHEDULE II
1,6,AA,WORTHLESS CHECK
2,6,AB,WORTHLESS CHECK
3,8,AA,DWI DRIVING WHILE IMPAIRED
4,8,AB,HABITUAL IMPAIRED DRIVING


In [425]:
datasetB_primary_offense.shape

(823711, 3)

In [300]:
#dataset_with_most_serious = dataset_with_most_serious[['ID','COMMITMENT_PREFIX','PRIMARY_OFFENSE_CODE','EARLIEST_SENTENCE_EFFECTIVE_DT','END_DATE']]


In [None]:
# Need to reconcile number of obs - why does dataset_with_most_serious have 823,711 whereas dataset A has around
# 888,000 observations?

# Checked back above and we should have 890k observations for dataset B


In [432]:
# merging on datasetA (court commitment + sentence computation) with datasetB ("self constructed" primary offenses from
# sentence component)
data_A_B = data.merge(datasetB_primary_offense, on = ['ID','COMMITMENT_PREFIX'], how='left') 


### As detailed above, here's where we stand with "most serious offense code"
    - 33% of dataset A is missing most_serious_offense
    - using sentence component, we created primary offense code for about 92% of the sentence component data (using minimum and maximum length) 
    - this variable (call it Offense_Constructed) has a 93% match rate with MosT Serious Offense in dataset A (where its available)
    - we're going to use Most Serious Offense where available (66% of the time), replace with Offense_Constructed where Most Serious Offense is unavailable and Offense_Constructed is available (32% of data). 
    - This will mean we are still missing Most Serious Offense for 4% of observations. Not all of these will be relevant to our outcome variable (only relevant when someone recidivates) but a) we want to use most serious offense as a predictor so missingness is relevant and b) how many of these are relevant for recidivating might keep changing depending on our # of years for recidivating. After we do all other changes to this dataset (e.g. dropping for weird dates) will check again how many are missing most serious offense. will drop those at that point (2.6% obs)
    - Finally, ~1% of the remaining data is missing our outcome variable once it is merged on because we only coded
       up offenses that took up 95% of all offenses. We drop these as well

In [433]:
data_A_B.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE
0,4,AA,1983-07-12,,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,,SELL SCHEDULE II
1,6,AA,1973-01-30,,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK
2,6,AB,1973-04-11,,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK
3,8,AA,1990-04-09,,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,DWI DRIVING WHILE IMPAIRED
4,8,AB,1993-08-30,,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,HABITUAL IMPAIRED DRIVING


In [434]:
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum()

324309

In [435]:
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0]

0.3590738079368278

In [436]:
data_A_B['PRIMARY_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0]

0.0879911247124057

In [437]:
# Replace Most Serious Offense with our constructed Primary Offense Code if missing
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].mask(data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull(), data_A_B['PRIMARY_OFFENSE_CODE'], inplace=True)


In [438]:
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum()

35076

In [439]:
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0]

0.03883602640442347

In [441]:
(324309 - 35076) / data_A_B.shape[0]

0.32023778153240434

In [442]:
# Part C broken down in more detail
    # 1. Deal with date issues (takes us from 903,000 obs to 888,120). For more details, see below:
        # a. replace end date with projected end date where END_DATE = 0001-01-01 (placeholder for missing)
        # b. drop observations still missing end_date (should be only about ~350 observations)
        # c. drop observations missing EARLIEST_SENTENCE_EFFECTIVE_DT (about 12k observations)
        # b and c are dropping those where the sentence is either only in court commitment or only in sentence comp
    # 2. Query the remaining dataset to get the the next commitment prefix, next sentence date, and most serious
        # offense code for the next observation - where all of these exist. for a sentence that does not result in 
        # recidivism, nextprefix, nextstart, nextoffense will be 0
    # 3. Clean up dates - turn them into date format, after recoding the top coded 9999 dates (for life sentences)
    # 4. Get recidivism flags. See decision rule below
    # 5. Hold out active sentences (~approx 32,000 obs)
    # 6. Drop observation with no recidivism flag (Takes us from 888,120 to 850970, i.e. dropping
        # 38,000 observations. 32k of those are active sentences, 6k are "out of universe" i.e.
        # sentences that are expired but the individual was never released (mostly death in prison)
    # 7. Sanity check - Merge on our coded offenses to most serious offenses and see how well we cover the offenses
        # Approx 5% of observations that have Most Serious Offense do NOT have "Decided Category" (our variable)
        # This makes sense because we only coded up offenses that made up 95% of the offenses
    # 8. Merge on our coded offenses to "NextOffense" - the relevant variable now is "Decided category".
    # 9. Replace Decided Category to 0 if recidivism = 0 ; leave it as NA otherwise
        # After holding out active sentences and dropping "out of universe observations", we have ~850k observations
        # of these, we are missing a "Decided Category" flag (as defined by our coded offenses) for 7% of the data
        # this is a lot better than missing it for 33% of the data (since we're missing "most serious offense" for 
        # 33% of the data) but its still not great - hopefully once we bring in most serious offense from sentence
        # component, we can reduce 7% down to something more negligible
    # 10. Understand the missingness of our possible features
    
# We now have two datasets that are ready for pre processing and feature engineering:
    # dataset_with_offenses = datasetA 
    # active_sentences = data on which we will apply our predictions
        # Next steps (I think): develop a list of features and functions that can clean up those features, which can 
        # be applied to both of the datasets above
        # Additionally - do we want to write both of these to csv that we push to github?
    
# More details on Dates
    # In addition to the dates that are null (see above) because some data exists in court commitment
    # that doesnt exist in sentence computation (and vice versa) we also have start and end
    # dates that are 0001-01-01 - based on looking up some offenders with these dates, these
    # are often just missing so 0001-01-01 is a placeholder for missing date

    # There are about 10k observations with end_date = 0001-01-01. These don't seem random -
    # 9k of these are for the commitment prefix BA, and on spot checking many of them look like
    # the sentences were categorized as "FAIR FELONS" - related to the fair sentencing act that
    # affects sentences from 1982 to October 1994 (before NC enacted structured sentencing which
    # abolished parole). It also seems like many of those sentences are missing an "actual release
    # date" from prison but have a release date from parole
    # 
    # Where available, the end date will be replaced with the projected release date. on spot
    # checking, this seems to be a reasonable proxy for when inmate was last moved
    # There are 397 observations missing both end date and projected end date - dropping these
    #
    # About 12k observations have start date = 0001-01-01. On spot checking, some of these
    # appear to be entirely missing from sentence component and from the offender's online
    # profile - as if the sentences were removed ex-post. Since there is no way to get a start
    # date for these, they will be dropped. Approx 1% of the data

# Note on "Active" Flag    
    # To get "Active" sentences, we should probably not trust the Inmate Commitment
    # status flag in court commitment. This often appears active even for sentences that
    # online show "service status" = "Expired"

    # instead, we should merge on information from INmate Profile. This has "inmate record status"
    # and "inmate admin status". After some exploration, it seems like admin status = active
    # means one is in prison; record status = active (if admin status = inactive) is mostly for
    # people on parole/probation.


# Decision rule for recidivism flag:
    # if NextPrefix != 0: if nextStart - endDate is less than XXX (make this a parameter) then recidivism = 1 else 0

    # if nextprefix = 0, inmate is inactive, and they did not die in prison 
    # (e.g. serving life sentence or  other wise) then 
    # recidivism = 0

    # if nextprefix = 0, inmate status code is not active or inactive (could be missing) and 
    # end date is not 2261-01-02 (life sentence), they were likely released from prison
    # recidivism = 0

In [257]:
data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT'].isnull().sum()

838

In [443]:
# sentences missing earliest effective date from datasetA
data_A_B['END_DATE'].isnull().sum()

1262

In [444]:
data_A_B[data_A_B['END_DATE']=='0001-01-01']['END_DATE'].count()

10110

In [445]:
#data[data['EARLIEST_SENTENCE_EFFECTIVE_DT']=='0001-01-01']['EARLIEST_SENTENCE_EFFECTIVE_DT'].count()
data_missing_start = data_A_B[data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT']=='0001-01-01']
data_missing_start.shape


(12595, 21)

In [446]:
data_merge_missing = data_missing_start.merge(sent_comp_small,  how='left',on = ['ID','COMMITMENT_PREFIX'])


In [447]:
# of the 12,595 observations with EARLIEST_SENTENCE_EFFECTIVE_DT == 0001-01-01, 
# 11,400 dont have a match in sentence component, from which we migtht be 
# able to get sentence start date. Not sure it seems worth it for 1,100 observations
data_merge_missing.isnull().sum()

ID                                    0
COMMITMENT_PREFIX                     0
EARLIEST_SENTENCE_EFFECTIVE_DT        0
MOST_SERIOUS_OFFENSE_CODE         12163
INMATE_COMPUTATION_STATUS_FLAG       32
END_DATE                             32
PROJ_END_DATE                        32
INMATE_RECORD_STATUS_CODE          1647
INMATE_ADMIN_STATUS_CODE           1647
DATE_OF_LAST_INMATE_MOVEMENT       1647
TYPE_OF_LAST_INMATE_MOVEMENT       1980
CURRENT_COMMITMENT_PREFIX          8484
CONTROL_STATUS                     1647
GENDER                                0
RACE                                  0
BIRTH_DATE                            0
STATE_BORN                          358
ETHNICITY                             0
CITIZENSHIP                           0
DISCIPLINARY_INFRACTION_COUNT      4425
PRIMARY_OFFENSE_CODE_x            12163
SENTENCE_COMPONENT_NUMBER         11428
PRIMARY_OFFENSE_CODE_y            11438
PRIMARY_FELONYMISDEMEANOR_CD      11428
SENTENCING_PENALTY_CLASS_CODE     11428


In [448]:
sent_comp_small.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,SENTENCE_COMPONENT_NUMBER,PRIMARY_OFFENSE_CODE,PRIMARY_FELONYMISDEMEANOR_CD,SENTENCING_PENALTY_CLASS_CODE,PRIOR_RECORD_LEVEL_CODE,MINIMUM_SENTENCE_LENGTH,MAXIMUM_SENTENCE_LENGTH,SENTENCE_TYPE_CODE,COUNTY_OF_CONVICTION_CODE
0,4,AA,1,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
1,4,AA,2,SELL SCHEDULE II,FELON,CLASS H,,0,30000,DEPT OF CORR DIV OF PRISONS,PERQUIMANS
2,6,AA,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,60,90,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
3,6,AB,1,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND
4,6,AB,2,WORTHLESS CHECK,MISD.,MISD.(PRE-STRUCTURE),,30,0,DEPT OF CORR DIV OF PRISONS,CUMBERLAND


In [449]:
# Step 1
# https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/
data_A_B['END_DATE'].mask(data_A_B['END_DATE'] == '0001-01-01', data_A_B['PROJ_END_DATE'], inplace=True)
data_A_B = data_A_B[data_A_B['END_DATE']!='0001-01-01']
data_A_B = data_A_B[data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT']!='0001-01-01']
data_A_B = data_A_B[data_A_B['END_DATE'].notna()]
data_A_B = data_A_B[data_A_B['EARLIEST_SENTENCE_EFFECTIVE_DT'].notna()]


In [451]:
data_A_B.shape

(888121, 21)

In [452]:
data_A_B['MOST_SERIOUS_OFFENSE_CODE'].isnull().sum() / data_A_B.shape[0]

0.02623966779301469

In [453]:
# Step 1.5 drop observations missing most serious offense code
data_A_B = data_A_B[data_A_B['MOST_SERIOUS_OFFENSE_CODE'].notna()]


In [454]:
data_A_B.shape


(864817, 21)

In [476]:
# Step 2
# write data to sqlite in memory so can query it to get next record
conn = sqlite3.connect(':memory:')
data_A_B.to_sql('data', conn, index=False)

start = datetime.datetime.now()
# https://stackoverflow.com/questions/37360901/sql-self-join-compare-current-record-with-the-record-of-the-previous-date
query_datasetA = '''
                        SELECT *, 
                        LEAD(COMMITMENT_PREFIX,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextPrefix,
                        LEAD(EARLIEST_SENTENCE_EFFECTIVE_DT,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextStart,
                        LEAD(MOST_SERIOUS_OFFENSE_CODE,1,0) OVER (
                                                    PARTITION BY ID
                                                    ORDER BY COMMITMENT_PREFIX
                                                    ) NextOffense                                                    
                                                    
                        FROM data ;

                        '''


dataset_flag = qd.query_db_notebook(conn,query_datasetA)
conn.close
stop = datetime.datetime.now()
print("Time Elapsed:", stop - start) 

Time Elapsed: 0:01:19.226138


In [478]:
dataset_flag.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense
0,4,AA,1983-07-12,SELL SCHEDULE II,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,,SELL SCHEDULE II,0,0,0
1,6,AA,1973-01-30,WORTHLESS CHECK,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK
2,6,AB,1973-04-11,WORTHLESS CHECK,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,0,0,0
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING


In [479]:
# Step 3. 
# call fix dates function to fix relevant dates
dataset_flag = fix_dates(dataset_flag,'EARLIEST_SENTENCE_EFFECTIVE_DT')
dataset_flag = fix_dates(dataset_flag,'END_DATE')
dataset_flag = fix_dates(dataset_flag,'NextStart')

In [480]:
# check that dates converted successfully
dataset_flag.dtypes

ID                                        object
COMMITMENT_PREFIX                         object
EARLIEST_SENTENCE_EFFECTIVE_DT    datetime64[ns]
MOST_SERIOUS_OFFENSE_CODE                 object
INMATE_COMPUTATION_STATUS_FLAG            object
END_DATE                          datetime64[ns]
PROJ_END_DATE                             object
INMATE_RECORD_STATUS_CODE                 object
INMATE_ADMIN_STATUS_CODE                  object
DATE_OF_LAST_INMATE_MOVEMENT              object
TYPE_OF_LAST_INMATE_MOVEMENT              object
CURRENT_COMMITMENT_PREFIX                 object
CONTROL_STATUS                            object
GENDER                                    object
RACE                                      object
BIRTH_DATE                                object
STATE_BORN                                object
ETHNICITY                                 object
CITIZENSHIP                               object
DISCIPLINARY_INFRACTION_COUNT            float64
PRIMARY_OFFENSE_CODE

In [481]:
# Step 4
# get recidivism flag - see decision rules and function above 
dataset_flag = get_recidivism_label(dataset_flag)
dataset_flag.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
0,4,AA,1983-07-12,SELL SCHEDULE II,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,,SELL SCHEDULE II,0,NaT,0,0,,0.0
1,6,AA,1973-01-30,WORTHLESS CHECK,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,1973,0.0,1.0
2,6,AB,1973-04-11,WORTHLESS CHECK,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,0,NaT,0,0,,0.0
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1993,3.0,0.0
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1995,1.0,1.0


In [519]:
dataset_flag = get_recidivism_label(dataset_flag)
dataset_flag.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
0,4,AA,1983-07-12,SELL SCHEDULE II,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,,SELL SCHEDULE II,0,NaT,0,0,,0.0
1,6,AA,1973-01-30,WORTHLESS CHECK,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,1973,0.0,1.0
2,6,AB,1973-04-11,WORTHLESS CHECK,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,WORTHLESS CHECK,0,NaT,0,0,,0.0
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1993,3.0,0.0
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1995,1.0,1.0


In [482]:
missing_recidivate = dataset_flag[dataset_flag['Recidivate'].isnull()]

In [483]:
missing_recidivate.shape

(37086, 27)

In [484]:
# 40k obs missing recidivism flag.
# 32,801 of those are active
# 4,261 of the inactive ones are those that died in prison
# so about 3,358 obs missing recidivate flag because they have no INMATE_ADMIN_STATUS_CODE
# let's look at just those
missing_recidivate.groupby('INMATE_ADMIN_STATUS_CODE').count()

Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
INMATE_ADMIN_STATUS_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
ACTIVE,32801,32801,32801,32801,32801,32801,32801,32801,32801,27587,32729,32801,32801,32801,32801,31123,32789,32790,28291,30766,32801,0,32801,32801,0,0
INACTIVE,4261,4261,4261,4261,4261,4261,4261,4261,4261,4261,2656,4261,4261,4261,4261,4008,4260,4260,2980,4068,4261,0,4261,4261,0,0
TEM.ABSENT,24,24,24,24,24,24,24,24,24,9,24,24,24,24,24,21,24,24,20,23,24,0,24,24,0,0


In [485]:
40420 - 32801 - 4261 

3358

In [486]:
missing_recidivate.groupby('TYPE_OF_LAST_INMATE_MOVEMENT').count()

Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
TYPE_OF_LAST_INMATE_MOVEMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
DEATH,4261,4261,4261,4261,4261,4261,4261,4261,4261,4261,2656,4261,4261,4261,4261,4008,4260,4260,2980,4068,4261,0,4261,4261,0,0
ESCAPED,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,5,5,2,4,5,0,5,5,0,0
ESCAPEE IN CUSTODY,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,0,0
NEW ADMISSION,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1109,1023,1100,1100,425,1035,1109,0,1109,1109,0,0
RE-ADMISSION,1078,1078,1078,1078,1078,1078,1078,1078,1078,1078,1010,1078,1078,1078,1078,1066,1078,1078,848,1004,1078,0,1078,1078,0,0
RECEIVE PSD,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,0,2,2,0,0
RECEIVED FROM,24186,24186,24186,24186,24186,24186,24186,24186,24186,24186,24183,24186,24186,24186,24186,22905,24183,24184,21323,22754,24186,0,24186,24186,0,0
RETURN FROM ESCAPE,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,3,0,0
RETURN FROM REL ERR,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0
RETURN FROM TEMP LVE,585,585,585,585,585,585,585,585,585,585,585,585,585,585,585,576,585,585,513,560,585,0,585,585,0,0


In [487]:
32801 + 4325 + 697

37823

In [488]:
40544 - 37823

2721

In [489]:
missing_recidivate_no_inmate_admin_record = missing_recidivate[missing_recidivate['INMATE_ADMIN_STATUS_CODE'].isnull()]


In [490]:
# All those missing INMATE_ADMIN_STATUS_CODE are also missing INMATE_RECORD_STATUS_CODE
missing_recidivate_no_inmate_admin_record.groupby('INMATE_RECORD_STATUS_CODE').count()


Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate
INMATE_RECORD_STATUS_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1


In [491]:
# on spot checking the IDs, the offenders show up as "inactive" for their parole/probation
# status, even though inmate_record_status in our data shows none
# they also have "current incarceration record not available"
# not really sure if this is just some weird data quirk 
# i'm okay with either dropping these ~3k observations, or assuming they were released
# and
missing_recidivate_no_inmate_admin_record.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate


In [492]:
# sentence with max end date is also inactive on offender search
missing_recidivate_no_inmate_admin_record[missing_recidivate_no_inmate_admin_record['END_DATE']==missing_recidivate_no_inmate_admin_record['END_DATE'].max()]



Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate


In [520]:
dataset_flag.groupby('Recidivate').count()

Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff
Recidivate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0.0,679080,679080,679080,679080,679080,679080,679080,674639,674639,674639,669057,515238,674639,679080,679079,679080,652115,678402,678417,403051,645902,679080,254782,679080,679080,254782
1.0,148651,148651,148651,148651,148651,148651,148651,147162,147162,147162,143900,123844,147162,148651,148650,148651,146667,148638,148645,124150,139931,148651,148651,148651,148651,148651


In [521]:
# Step 5
# Hold out active senteces
active_sentences = dataset_flag[(dataset_flag['INMATE_ADMIN_STATUS_CODE']=='ACTIVE') & (dataset_flag['NextPrefix']==0)]
active_sentences.shape

(0, 27)

In [522]:
# Step 6
# drop observations with no recidivism flag (this will also drop active sentences, but we've already separated those)
dataset_flag = dataset_flag[(dataset_flag['Recidivate'].notnull())]
dataset_flag.shape


(827731, 27)

In [512]:
267114 / (267114 + 560617)

0.3227062898453725

In [497]:
# Step 7
# Bring in coded offenses - sanity check

# this merges our coded offenses onto "most serious offense" to check how much coverage
# our variable is giving us. however, this not what we ultimately want - in the end, we want
# our codes to be merged onto "nextOffense" - i.e., the offense code for the next offense 
# someone committed that resulted in re-incarceration
# NextOffense can be missing for 2 reasons: because most serious offense is missing, or because
# the individual did not recidivate. after merging our codes onto "NextOffense", we can replace
# "Decided Category" with 0 if recidivism = 0, and leave it as NA otherwise
dataset_with_offenses_test = dataset_flag.merge(coded_offenses, how='left', left_on='MOST_SERIOUS_OFFENSE_CODE', right_on='Primary offense code')



In [498]:
dataset_with_offenses_test.sample()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate,Primary offense code,Description (if needed),Decided Category,Needed a check?,More lenient,More harsh
744667,1058849,BB,2013-02-14,RECEIVING STOLEN VEHICLE,EXPIRED,2015-03-16,2015-03-16,PAROLED,INACTIVE,2020-04-07,PAROLE/RETURN TO PAR,BC,REGULAR POPULATION RPOP,MALE,WHITE,1989-10-08,NORTH CAROLINA,EUROPEAN/N.AM./AUSTR,BORN IN U.S.,36.0,RECEIVING STOLEN VEHICLE,BC,2015-05-18,HABITUAL FELON,2015,0.0,1.0,RECEIVING STOLEN VEHICLE,0,1.0,NO,1.0,1.0


In [499]:
# Obs not missing Most Serious Offense
dataset_with_offenses_test['MOST_SERIOUS_OFFENSE_CODE'].notnull().sum()

827731

In [501]:
# Of obs not missing Most Serious Offense, how many are missing our coded value?
#dataset_with_offenses_test[dataset_with_offenses_test['MOST_SERIOUS_OFFENSE_CODE'].notnull()]['Primary offense code'].isnull().sum()
dataset_with_offenses_test['Primary offense code'].isnull().sum()


41678

In [502]:
41678/827731

0.050352107145920595

In [503]:
dataset_with_offenses_test.groupby('Decided Category').count()

Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate,Primary offense code,Description (if needed),Needed a check?,More lenient,More harsh
Decided Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1.0,223738,223738,223738,223738,223738,223738,223738,221827,221827,221827,219717,156393,221827,223738,223738,223738,217722,223635,223636,132007,212357,223738,119684,223738,223738,119684,223738,223738,223738,223738,223738,223738
2.0,211241,211241,211241,211241,211241,211241,211241,209782,209782,209782,207662,166963,209782,211241,211241,211241,206791,211106,211111,129363,197567,211241,113823,211241,211241,113823,211241,211241,211241,211241,211241,211241
3.0,249430,249430,249430,249430,249430,249430,249430,247991,247991,247991,244807,207870,247991,249430,249430,249430,237519,249050,249061,173964,238040,249430,111053,249430,249430,111053,249430,249430,249430,249430,249430,249430
4.0,83891,83891,83891,83891,83891,83891,83891,83164,83164,83164,82276,63959,83164,83891,83891,83891,80918,83858,83860,52369,80553,83891,39540,83891,83891,39540,83891,83891,83891,83891,83891,83891
5.0,17753,17753,17753,17753,17753,17753,17753,17653,17653,17653,17494,12757,17653,17753,17753,17753,16631,17752,17752,13995,17366,17753,5045,17753,17753,5045,17753,17753,17753,17753,17753,17753


In [504]:
dataset_with_offenses_test.shape

(827731, 33)

In [523]:
# Step 8 and 9
# Now, merge on coded offenses onto NextOffense, turn Decided Category, More Lenient, and more harsh = 0 if recidivism = 0
dataset_with_offenses = dataset_flag.merge(coded_offenses, how='left', left_on='NextOffense', right_on='Primary offense code')
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'Decided Category'] = 0
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'More lenient'] = 0
dataset_with_offenses.loc[dataset_with_offenses['Recidivate']==0,'More harsh'] = 0
dataset_with_offenses.shape

(827731, 33)

In [524]:
dataset_with_offenses.groupby('Decided Category').count()

Unnamed: 0_level_0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate,Primary offense code,Description (if needed),Needed a check?,More lenient,More harsh
Decided Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0.0,679080,679080,679080,679080,679080,679080,679080,674639,674639,674639,669057,515238,674639,679080,679079,679080,652115,678402,678417,403051,645902,679080,254782,679080,679080,254782,679080,242674,242674,242674,679080,679080
1.0,41638,41638,41638,41638,41638,41638,41638,41120,41120,41120,40430,31655,41120,41638,41638,41638,41194,41637,41638,33209,39492,41638,41638,41638,41638,41638,41638,41638,41638,41638,41638,41638
2.0,38067,38067,38067,38067,38067,38067,38067,37688,37688,37688,36995,33072,37688,38067,38067,38067,37718,38066,38067,31547,35620,38067,38067,38067,38067,38067,38067,38067,38067,38067,38067,38067
3.0,45758,45758,45758,45758,45758,45758,45758,45427,45427,45427,44231,40135,45427,45758,45758,45758,44997,45749,45752,40076,42785,45758,45758,45758,45758,45758,45758,45758,45758,45758,45758,45758
4.0,13644,13644,13644,13644,13644,13644,13644,13482,13482,13482,13202,11186,13482,13644,13644,13644,13451,13643,13644,11199,13016,13644,13644,13644,13644,13644,13644,13644,13644,13644,13644,13644
5.0,3046,3046,3046,3046,3046,3046,3046,3025,3025,3025,2831,2485,3025,3046,3046,3046,2930,3046,3046,2796,2953,3046,3046,3046,3046,3046,3046,3046,3046,3046,3046,3046


In [516]:
dataset_with_offenses['Decided Category'].isnull().sum()

11688

In [507]:
679080 + 41638 + 38067 + 45758 + 13644 + 3046

821233

In [508]:
1-((821233)/dataset_with_offenses.shape[0])

0.007850376511209567

In [509]:
827731 - 821233

6498

In [518]:
11688/dataset_with_offenses.shape[0]

0.014120529495693649

In [526]:
# less than 1% of the data is missing our outcome variable (when using the 1 year definition)
# of recidivism
# with the 3 year definition of recidivism, 1.4% of data is missing outcome variable
# i think we can drop this amount

In [234]:
# Step 10 (next two cells)

# demographic data is not missing in inmate profile, but its missing systematically for ~6000 observations
# in final dataset - which 6k observations are missing in inmate profile?

# based on spot checking many of the 6k obs missing from inmate profile, it appears that NC offender search says
# their incarceration record is currently unavailable - not sure for what reason

# so it makes sense to try and get a lot of these variables from other datasets where possible (e.g. demographics
# from offender profile, disciplinary infractions from the displinary infractions dataset)
inmt_profile.isnull().sum()

ID                                   0
INMATE_RECORD_STATUS_CODE            0
INMATE_ADMIN_STATUS_CODE             0
DATE_OF_LAST_INMATE_MOVEMENT         0
TYPE_OF_LAST_INMATE_MOVEMENT      5774
CURRENT_COMMITMENT_PREFIX       143128
CONTROL_STATUS                       0
dtype: int64

In [235]:
# by bringing in data from other sources, there are a lot fewer missings
# still - need to decide how to deal with features that are missing
dataset_with_offenses.isnull().sum()

ID                                     0
COMMITMENT_PREFIX                      0
EARLIEST_SENTENCE_EFFECTIVE_DT         0
MOST_SERIOUS_OFFENSE_CODE         308659
INMATE_COMPUTATION_STATUS_FLAG         0
END_DATE                               0
PROJ_END_DATE                          0
INMATE_RECORD_STATUS_CODE           6216
INMATE_ADMIN_STATUS_CODE            6216
DATE_OF_LAST_INMATE_MOVEMENT        6216
TYPE_OF_LAST_INMATE_MOVEMENT       15212
CURRENT_COMMITMENT_PREFIX         202165
CONTROL_STATUS                      6216
GENDER                                 0
RACE                                   2
BIRTH_DATE                             0
STATE_BORN                         29626
ETHNICITY                            692
CITIZENSHIP                          669
DISCIPLINARY_INFRACTION_COUNT     310201
NextPrefix                             0
NextStart                         430781
NextOffense                       111635
new_col                                0
Time_Diff       

In [236]:
#dataset_with_offenses.groupby('TYPE_OF_LAST_INMATE_MOVEMENT').count()

In [525]:
dataset_with_offenses.to_csv('../data/dataset_main.csv', index=False)
active_sentences.to_csv('../data/active_sentences.csv', index=False)