In [1]:
# Purpose of this to carry out the following:
    # explore features we already have and create new ones
    # once that is done, carry out an appropriate train/test/validate split - taking temporal validation into account
    # pre processing (impute missingness separately for train/test/validate, normalize, etc)

# The datasets used are the datasets created by build_dataset - they're
# too big for github, so they have been uploaded to google drive. They should
# be downloaded into your local data folder to be imported in this notebook

# The end goal of this notebook is to have our final datasets ready for analysis
# this code should then be moved into our .py files so the notebook can eventually be deleted

In [2]:
import sqlite3
from sqlite3 import Error
import pandas as pd
import config
import os.path
from os import path

#from create_db import create_connection, create_table, clean_column_names
#from populate_db import extract_data, insert_records
#import query_db as qd

import importlib

import datetime
import re
import numpy as np

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Import datasets created in build_dataset

dataset_main = pd.read_csv('../data/dataset_main.csv')
dataset_active_sentences = pd.read_csv('../data/active_sentences.csv')

In [5]:
dataset_main.head()

KeyboardInterrupt: 

In [None]:
dataset_main.isnull().sum()

In [None]:
# Decisions to be made:
    # Which variables to keep?
    # Which features need to be constructed from the available variables?
    # How do we want to deal with missings?
        # e.g. impute? choose majority? some notes here: https://towardsdatascience.com/working-with-missing-data-in-machine-learning-9c0a430df4ce
        
# A first pass, following this group: https://bucklerd.github.io/MUSA801_Recidivism_Markdown/#
    # Race - keep
    # Sex - keep
    # Age at each sentence - need to calculate using birth date and effective sentence begin date - more indifferent
    # Age category each sentence - keep, look at literature to decide how to categorize
    # Ethnicity - keep
    # Citizenship - keep, but just look at variation, don't include in model
    # Age at first incarceration? Could compute as age at first sentence - keep, similar information to age category
    # Most serious current offense (since there are so many categories, do we want to map on our coded 5 point scale
        # to this..?)
        # - different versions of this (most serious offense, and turn everything else to other), one-hot encoding
        # - with 5 point scale 
    # Current crime violent or not violent (not sure where they are gtting this from, or if its self constructed)
        # - our scale 4-5 to 1, our scale 1-3 is 0 (self-constructed)
        # - feel iffy about this, so also try leaving out
    # Total sentence count - can be computed - lots of bias baked in? - would be at the individual level?
        # - keep, and see how it affects the model (prior history context)
    # Juvenile Offense Flag - would need to construct using age at first incarceration - keep
    # total count of felony and misdemeanor charges - i think these can be calculated from sentence component
        # would be at the individual level not sentence level? 
        # - keep, for the sentence that got a recidivate flag, how many flags in either category
    # custody_class_code - i think this CONTROL_STATUS
        # individual level not sentence level - don't keep (probably adds more bias than value...)
    # special characteristics - i didn't really know how to make sense of this, so i didn't include it for now...
    # - - don't keep (probably adds more bias than value...)
    # total disciplinary infractions - would be at the individual level not the sentence level
        # although this comes from a file that has infraction by date so in theory
        #   we could calculate at the sentence level with some SQL maneouvering 
        # how many infractions between each start/end date of the sentence, but this would be a bit more complicated
    # Type of last inmate movement - we have this, but i'm not sure how much value it adds. also its at the
        # offender level, not sentence level - don't keep 
    

# Thoughts on missingness:
    # Race, Sex, Birth date - basically not missing, can drop or impute couple that are
    # Ethnicity and Citizenship - Majority impute?
    # Most serious current offense - already working on trying to make this less missing
    # Disciplinary infractions - the way this is constructed is by merging on from a file that contains
        # infractions. So i think it is safe to assume that if this variable is missing for an individual,
        # they did not commit any infractions. replace with 0?
    # 
        
    
    

### Decisions:

*First model is most parsimonious
* Bring in median household income and unemployment data so the predictor mechanism isn't just on individual (maybe NC, annual)

### Splitting the work:
* Damini: (pulling via SQL)
    - Disciplinary infractions
    - Most serious current offense
    - Current crime violent
    - Total count of felony and misdemeanor charges
    - Total sentence count
    
* Charmaine:
    - Median HH income
    - Unemployment
    - Age at each sentence - need to calculate using birth date and effective sentence begin date - more indifferent
    - Age category each sentence - keep, look at literature to decide how to categorize
    - Age at first incarceration? Could compute as age at first sentence - keep, similar information to age category
    - Juvenile Offense Flag

### Charmaine's WIP code:

In [None]:
# Check how many observations you have and make sure you don't drop any while creating new features
dataset_main.shape

In [None]:
# Create AGE_AT_SENTENCE
dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'] = pd.to_datetime(dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'], yearfirst=True)
dataset_main.loc[dataset_main['BIRTH_DATE'] == '0001-01-01', 'BIRTH_DATE'] = np.NaN
dataset_main['BIRTH_DATE'] = pd.to_datetime(dataset_main['BIRTH_DATE'], format='%Y/%m/%d')

dataset_main['age_at_sentence'] = (dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'] - dataset_main['BIRTH_DATE']).astype('<m8[Y]')
dataset_main['age_at_sentence'].describe()


In [None]:
# Check number of misisng
dataset_main['age_at_sentence'].isnull().sum()

In [None]:
# Check observations where age is negative
dataset_main.loc[dataset_main['age_at_sentence'] < 0, ['EARLIEST_SENTENCE_EFFECTIVE_DT', 'BIRTH_DATE']]

# Convert to NaN?
dataset_main.loc[dataset_main['age_at_sentence'] < 0, ['age_at_sentence']] = np.NaN

# Check number of missing
dataset_main['age_at_sentence'].isnull().sum()

In [None]:
# Create age categories
# Resources: https://www.ussc.gov/research/research-reports/effects-aging-recidivism-among-federal-offenders
dataset_main['age_cat'] = pd.cut(dataset_main['age_at_sentence'],
                                 bins=[0,17,21,24,29,34,39,44,49,54,59,64,90],
                                 labels=['Under 18', '18-21','22-24','25-29','30-34','35-39','40-44','45-49',
                                        '50-54','55-59','60-64','65 and older',])

In [None]:
dataset_main.groupby(['age_cat']).size()

In [None]:
# Compute age at first incarceration
first_incarceration = pd.DataFrame(dataset_main.groupby(['ID'])['EARLIEST_SENTENCE_EFFECTIVE_DT'].min().reset_index(name='first_incarceration_date'))
dataset_main = dataset_main.merge(first_incarceration, on='ID')

In [None]:
dataset_main[['ID','COMMITMENT_PREFIX','EARLIEST_SENTENCE_EFFECTIVE_DT','first_incarceration_date']].head()

In [None]:
# Flag for juvenile offense
dataset_main['age_first_offense'] = (dataset_main['first_incarceration_date'] - dataset_main['BIRTH_DATE']).astype('<m8[Y]')
dataset_main['age_first_offense'].describe()

In [None]:
# Check observations where age is negative
dataset_main.loc[dataset_main['age_first_offense'] < 0, ['EARLIEST_SENTENCE_EFFECTIVE_DT', 'BIRTH_DATE']]

In [None]:
#dataset_main[dataset_main['age_first_offense'] < 10].count()

In [None]:
import matplotlib.pyplot

dataset_main.hist(column=['age_first_offense'])

In [None]:
dataset_main[dataset_main['age_first_offense'] > 0].hist(column=['age_first_offense'])

In [None]:
# Convert to NaN?
dataset_main.loc[dataset_main['age_first_offense'] < 0, ['age_first_offense']] = np.NaN

# Check number of misisng
dataset_main['age_first_offense'].isnull().sum()

In [None]:
dataset_main['juv_first_offense'] = (dataset_main['age_first_offense'] < 18)
dataset_main.sort_values('age_first_offense')[['BIRTH_DATE','first_incarceration_date','age_first_offense', 'juv_first_offense']].head(10)

### TO DISCUSS
A lot of these dates don't make sense. How can a toddler be sentenced?

* Replace with NaN if below 10 (talked with Damini about this)
* Maybe impute to mean/median eventually.
* We could trim the data to start at 1976

In [None]:
dataset_main.sort_values('age_first_offense')[['BIRTH_DATE','first_incarceration_date','age_first_offense', 'juv_first_offense']].sample(10)

### Pull in unemployment data
* Source: BLS LAUS
* Link: https://beta.bls.gov/dataViewer/view/timeseries/LASST370000000000003

In [None]:
dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'].describe()

In [None]:
dataset_main.groupby([dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'].dt.year]).size().plot(kind="line")

The earliest data BLS has only goes to 1976...
### TO DISCUSS
Should we restrict our data to 1976? We would end up dropping 5% of our data. 

Or 1984? (see median HH income data limitations below...)

In [None]:
# Import downloaded CSV
unemployment = pd.read_csv('../data/unemployment_nc.csv')

In [None]:
unemployment.head()

In [None]:
unemployment['month'] = unemployment['Period'].str[1:]
unemployment['Year'] = unemployment['Year'].astype(str)
unemployment['date_to_merge'] = unemployment['Year'].str.cat(unemployment['month'], sep ="-")
unemployment['date_to_merge'].head()

In [None]:
# Create a str column to merge on
dataset_main['date_to_merge'] = dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'].dt.strftime('%Y-%m')

# Rename variables 
unemployment = unemployment.rename(columns={"Value": "unemp_rate"})
unemployment_limited = unemployment[['date_to_merge','unemp_rate']]

# Merge with unemployment data
dataset_main = dataset_main.merge(unemployment_limited, on='date_to_merge', how='left')
check_cols = ['EARLIEST_SENTENCE_EFFECTIVE_DT','date_to_merge','unemp_rate']
dataset_main[check_cols].sample(10)

In [None]:
# Check how many are missing
dataset_main['unemp_rate'].isnull().sum() / dataset_main.shape[0]

### Pull in median household income
* Source: Table H-8
* Links: 
  * https://www.census.gov/data/tables/time-series/demo/income-poverty/historical-income-households.html
  * https://fred.stlouisfed.org/series/MEHOINUSA672N
* Note: Only has 1984-2018?

In [None]:
hh_inc = pd.read_excel('../data/h08.xls', sheet_name='edited', usecols=['Year','Median HH Income'], nrows=38)

In [None]:
hh_inc

#### Source: https://www.census.gov/topics/income-poverty/income/guidance/cps-historic-footnotes.html

36.	Beginning with 2009 income data, the Census Bureau expanded the upper income interval used to calculate medians and Gini indexes to \\$250,000 or more. Medians falling in the upper open-ended interval are plugged with "\\$250,000." Before 2009, the upper open-ended interval was \\$100,000 and a plug of "\\$100,000" was used.

37.	Implementation of Census 2010-based population controls.

38.	The 2014 CPS ASEC included redesigned questions for income and health insurance coverage. All of the approximately 98,000 addresses were eligible to receive the redesigned set of health insurance coverage questions. The redesigned income questions were implemented to a subsample of the 98,000 addresses using a probability split panel design. Approximately 68,000 addresses were eligible to receive a set of income questions similar to those used in the 2013 CPS ASEC and the remaining 30,000 addresses were eligible to receive the redesigned income questions. The source of these 2013 estimates is the portion of the CPS ASEC sample which received the income questions consistent with the 2013 CPS ASEC, approximately 68,000 addresses.

39.	The source of these 2013 estimates is the portion of the CPS ASEC sample which received the redesigned income questions, approximately 30,000 addresses.

40.	Implementation of an updated CPS ASEC processing system.

We have duplicates of 2013 and 2017, so footnotes 38, 39, and 40 are the most relevant. 

#### Decision: Which duplicates do we use?
- Should we drop the (40) version of 2017? And use the updated version? 
- Should we drop the (39) version of 2013? To be consistent with the decision above to use the updated system.

In [None]:
# Drop duplicate rows and convert to string
hh_inc_edit = hh_inc.copy()
hh_inc_edit['Year'] = hh_inc_edit['Year'].astype(str)
hh_inc_edit = hh_inc_edit.drop(axis=0, index=[2,6]) # Drop (39) and (40) versions of 2013 and 2017

# Slice string to only keep numbers
hh_inc_edit['Year'] = hh_inc_edit['Year'].str[:4]
hh_inc_edit.head()

In [None]:
# Create a str column to merge on
dataset_main['year_to_merge'] = dataset_main['EARLIEST_SENTENCE_EFFECTIVE_DT'].dt.strftime('%Y')

# Rename variable
hh_inc_edit = hh_inc_edit.rename(columns={"Year": "year_to_merge"})

# Merge with dataset_main
dataset_main = dataset_main.merge(hh_inc_edit, on='year_to_merge', how='left')
check_cols = ['EARLIEST_SENTENCE_EFFECTIVE_DT','year_to_merge','Median HH Income']
dataset_main[check_cols].sample(10)

In [None]:
# Drop merging variables
dataset_main.drop(axis=1, columns=['year_to_merge', 'date_to_merge'], inplace=True)
dataset_main.head()

### Pipeline Steps

1. **Read Data.**
Load the data. Your function for reading in data can be as simple as calling pd.read_csv. If this step is more complicated (e.g. in your projects), you will want to write more detailed functions.

2. **Explore Data.**
Automate common exploratory tasks. This can include generating distributions of variables, correlations between them, identifying outliers, summarizing by groups, identifying the time range of the data, etc. Feel free to leverage your work from previous labs and Step 1 above.

3. **Create Training and Testing Sets.**
Create training and testing splits. You should use a separate training set, validation set (to tune hyperparameters), and testing set to perform cross-validation.

4. **Pre-Process Data.**
Automate pre-processing steps. One function should impute missing values of continuous variables using the median value and the other should normalize continuous variables.
    * No need to impute BIRTH_DATE, but we can impute AGE variables with median
    * Majority-vote for juvenile flag
    * Disciplinary infractions
        * Missing should be converted to 0
    * Most serious current offense - should not be missing
        * Limit to certain number categories (e.g., top vs other), before train/test/split becuase not imputing.
        * For version where we map on our scales, will be missing 5% of the time, imputed with most common category after train/test/split
    * Current crime violent - will be missing in places
        * Will either be missing or will be in our scale
        * Impute with most common after train/test split 
    * Total count of felony and misdemeanor charges - might be missing
        * Impute with median after train/test/split
    * Total sentence count - shouldn't be missing 


5. **Generate Features.**
Faciliate feature generation. One function should perform one-hot encoding of categorical variables (e.g. with pd.get_dummies) and one function should discretize continuous variables (e.g. with pd.cut). Discretizing continuous variables can be useful in cases where the variable has a significant cutoff value (for example, age could be discretized to distinguish between children under 18 and adults 18 and older).

6. **Build Classifiers.**
Apply machine learning models to a dataset. The function should also print the amount of time required to train each model. 

7. **Evaluate Classifiers.**
Calculate the accuracy of your models based on your testing set, and validate models. 

## Damini's WIP Code

In [154]:
import pandas as pd

In [155]:
dataset_main_active = pd.read_csv('../data/dataset_main_active.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [156]:
# temporary until re run build data
dataset_main_active.loc[dataset_main_active['NextPrefix']==0,'NextPrefix'] = "NONE"
dataset_main_active.loc[dataset_main_active['NextPrefix']=="0",'NextPrefix'] = "NONE"

In [157]:
dataset_main_active.dtypes

ID                                      int64
COMMITMENT_PREFIX                      object
EARLIEST_SENTENCE_EFFECTIVE_DT         object
MOST_SERIOUS_OFFENSE_CODE              object
INMATE_COMPUTATION_STATUS_FLAG         object
END_DATE                               object
PROJ_END_DATE                          object
INMATE_RECORD_STATUS_CODE              object
INMATE_ADMIN_STATUS_CODE               object
DATE_OF_LAST_INMATE_MOVEMENT           object
TYPE_OF_LAST_INMATE_MOVEMENT           object
CURRENT_COMMITMENT_PREFIX              object
CONTROL_STATUS                         object
GENDER                                 object
RACE                                   object
BIRTH_DATE                             object
STATE_BORN                             object
ETHNICITY                              object
CITIZENSHIP                            object
PRIMARY_OFFENSE_CODE                   object
NextPrefix                             object
NextStart                         

In [158]:
dataset_main_active.rename(columns={"('Count', 'FELON')":'felon_count',"('Count', 'MISD.')":'misd_count'}, inplace=True)


In [159]:
dataset_main_active.columns

Index(['ID', 'COMMITMENT_PREFIX', 'EARLIEST_SENTENCE_EFFECTIVE_DT',
       'MOST_SERIOUS_OFFENSE_CODE', 'INMATE_COMPUTATION_STATUS_FLAG',
       'END_DATE', 'PROJ_END_DATE', 'INMATE_RECORD_STATUS_CODE',
       'INMATE_ADMIN_STATUS_CODE', 'DATE_OF_LAST_INMATE_MOVEMENT',
       'TYPE_OF_LAST_INMATE_MOVEMENT', 'CURRENT_COMMITMENT_PREFIX',
       'CONTROL_STATUS', 'GENDER', 'RACE', 'BIRTH_DATE', 'STATE_BORN',
       'ETHNICITY', 'CITIZENSHIP', 'PRIMARY_OFFENSE_CODE', 'NextPrefix',
       'NextStart', 'NextOffense', 'new_col', 'Time_Diff', 'Recidivate',
       'INFRACTION_PER_SENT', 'felon_count', 'misd_count',
       'Primary offense code_x', 'Description (if needed)_x',
       'Recidivate_Risk_Level', 'Needed a check?_x',
       'Recidivate_Risk_Level_Lenient', 'Recidivate_Risk_Level_Harsh',
       'Primary offense code_y', 'Description (if needed)_y',
       'Current_Offense_Risk_Level', 'Needed a check?_y',
       'Current_Offense_Risk_Level_Lenient',
       'Current_Offense_Risk_Level_

In [160]:
dataset_main_active = dataset_main_active.loc[:,['ID', 'COMMITMENT_PREFIX', 'EARLIEST_SENTENCE_EFFECTIVE_DT','MOST_SERIOUS_OFFENSE_CODE','END_DATE', 'INMATE_RECORD_STATUS_CODE','INMATE_ADMIN_STATUS_CODE', 'DATE_OF_LAST_INMATE_MOVEMENT','TYPE_OF_LAST_INMATE_MOVEMENT','CONTROL_STATUS', 'GENDER', 'RACE', 'BIRTH_DATE', 'STATE_BORN','ETHNICITY', 'CITIZENSHIP', 'PRIMARY_OFFENSE_CODE', 'NextPrefix','NextStart', 'NextOffense', 'Time_Diff', 'Recidivate','INFRACTION_PER_SENT', 'misd_count','felon_count','Recidivate_Risk_Level', 'Recidivate_Risk_Level_Lenient', 'Recidivate_Risk_Level_Harsh','Current_Offense_Risk_Level','Current_Offense_Risk_Level_Lenient','Current_Offense_Risk_Level_Harsh']]


In [161]:
pd.set_option('display.max_columns', None)

In [162]:
dataset_main_active.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh
0,4,AA,1983-07-12,SELL SCHEDULE II,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,SELL SCHEDULE II,NONE,,0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0,4.0
1,6,AA,1973-01-30,WORTHLESS CHECK,1973-03-28,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2,6,AB,1973-04-11,WORTHLESS CHECK,1975-08-18,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,NONE,,0,,0.0,0.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,1990-05-17,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1994-01-26,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1.0,1.0,0.0,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0


## Steps to follow for features:
    - Stage 0: constructing variables, understanding weirdness and outliers
    - Stage 1: train/test/validate/active split
    - Stage 2: pre-process - including imputing messiness for vars created in Stage 0, and creating variables based on those (e.g. once Age has been imputed, construct Age_cat)
    

### Disciplinary infractions
- Stage 0: constructed in build_dataset, missings replaced with 0 - no infractions assumed if ID not found in   infractions dataset
- Stage 2: Normalization of continuous var required


### Most serious current offense
- Stage 0 (v1): Construct more generalized var which captures X% of offenses, turning all else to others
- Stage 2 (v1): There should be no missingness (double check this) - will need one-hot encoding
    
- Stage 0 (v2): (alt var - scale coded by us): Already merged, nothing further needed
- Stage 2 (v2): there will be missings (since we only coded 95% of offenses) so fill NA with majority then one-hot


In [163]:
# Most serious current offense v1
#most_offenses = dataset_main_active.groupby("MOST_SERIOUS_OFFENSE_CODE").size().reset_index(name="count")
most_offenses = dataset_main_active.groupby("MOST_SERIOUS_OFFENSE_CODE")['ID'].size().reset_index(name="count")
most_offenses['PCT'] = most_offenses['count'] / dataset_main_active.shape[0]
most_offenses = most_offenses.sort_values(by='PCT', ascending=False)
most_offenses['CUMSUM'] = most_offenses['PCT'].cumsum()

most_offenses['OFFENSE_CLEAN'] = most_offenses['MOST_SERIOUS_OFFENSE_CODE']
most_offenses.loc[most_offenses['CUMSUM'] > 0.9,'OFFENSE_CLEAN'] = "OTHER"
most_offenses = most_offenses.loc[:,['MOST_SERIOUS_OFFENSE_CODE','OFFENSE_CLEAN']]

# Merge this back onto main dataset
dataset_main_active = dataset_main_active.merge(most_offenses, how="left", on="MOST_SERIOUS_OFFENSE_CODE")

In [164]:
dataset_main_active.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,OFFENSE_CLEAN
0,4,AA,1983-07-12,SELL SCHEDULE II,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,SELL SCHEDULE II,NONE,,0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0,4.0,SELL SCHEDULE II
1,6,AA,1973-01-30,WORTHLESS CHECK,1973-03-28,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,WORTHLESS CHECK
2,6,AB,1973-04-11,WORTHLESS CHECK,1975-08-18,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,NONE,,0,,0.0,0.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,WORTHLESS CHECK
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,1990-05-17,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,DWI DRIVING WHILE IMPAIRED
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1994-01-26,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1.0,1.0,0.0,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,HABITUAL IMPAIRED DRIVING


### Current Crime Violent Flag
- Stage 0 (v1): don't include variable (we dont have it)
- Stage 2 (v1): dont include variable (we dont have it)
    
- Stage 0 (v2 - scale coded by us): Already merged, nothing further needed
- Stage 2: there will be missings (since we only coded 95% of offenses) so fill NA with majority then categorize as 1 for receiving score 4 or 5 and 0 otherwise


### Total count of felony and misdemeanor charges
- Stage 0: merged and created, nothing further needed
- Stage 2: impute NAs with median (very few missing) then normalize since continuous var


### Total sentence count
- Stage 0: construct
- Stage 2: shouldnt be missing (check) - then normalize since continuous var


In [165]:
count = dataset_main_active.groupby(['ID','COMMITMENT_PREFIX']).count().groupby(level=0).cumsum().reset_index()
count['sentence_count'] = count['EARLIEST_SENTENCE_EFFECTIVE_DT'] - 1
count = count.loc[:,['ID','COMMITMENT_PREFIX','sentence_count']]
dataset_main_active = dataset_main_active.merge(count, how="left", on = ['ID','COMMITMENT_PREFIX'])


In [166]:
dataset_main_active.head()

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,OFFENSE_CLEAN,sentence_count
0,4,AA,1983-07-12,SELL SCHEDULE II,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,SELL SCHEDULE II,NONE,,0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0,4.0,SELL SCHEDULE II,0
1,6,AA,1973-01-30,WORTHLESS CHECK,1973-03-28,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,WORTHLESS CHECK,0
2,6,AB,1973-04-11,WORTHLESS CHECK,1975-08-18,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,NONE,,0,,0.0,0.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,WORTHLESS CHECK,1
3,8,AA,1990-04-09,DWI DRIVING WHILE IMPAIRED,1990-05-17,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,DWI DRIVING WHILE IMPAIRED,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,DWI DRIVING WHILE IMPAIRED,0
4,8,AB,1993-08-30,HABITUAL IMPAIRED DRIVING,1994-01-26,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,HABITUAL IMPAIRED DRIVING,BA,1995-01-02,HABITUAL IMPAIRED DRIVING,1.0,1.0,0.0,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,HABITUAL IMPAIRED DRIVING,1


In [167]:
dataset_main_active.isnull().sum()

ID                                         0
COMMITMENT_PREFIX                          0
EARLIEST_SENTENCE_EFFECTIVE_DT             0
MOST_SERIOUS_OFFENSE_CODE                  0
END_DATE                                   0
INMATE_RECORD_STATUS_CODE               5853
INMATE_ADMIN_STATUS_CODE                5853
DATE_OF_LAST_INMATE_MOVEMENT            5853
TYPE_OF_LAST_INMATE_MOVEMENT           19702
CONTROL_STATUS                          5853
GENDER                                     0
RACE                                       1
BIRTH_DATE                                 0
STATE_BORN                             30517
ETHNICITY                                702
CITIZENSHIP                              680
PRIMARY_OFFENSE_CODE                   43502
NextPrefix                                 0
NextStart                             457099
NextOffense                                0
Time_Diff                             457099
Recidivate                             32801
INFRACTION

# Next Steps:
- Train/test/validate/active split
- Write functions to conduct stage 2 for the above vars

## Stratified Sampling For Train Test Validate Split

In [168]:
# hold out active sentences
active_sentences = dataset_main_active[(dataset_main_active['INMATE_ADMIN_STATUS_CODE']=='ACTIVE') & (dataset_main_active['NextPrefix']=="NONE") ]
print("Size of active sentences dataset: ",active_sentences.shape[0])

Size of active sentences dataset:  32801


In [181]:
# Drop those missing decided category
dataset_no_active = dataset_main_active[(dataset_main_active['Recidivate_Risk_Level'].notnull())]
print("Dataset size: " , dataset_no_active.shape[0])

Dataset size:  821314


In [182]:
dataset_no_active.head(2)

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,PRIMARY_OFFENSE_CODE,NextPrefix,NextStart,NextOffense,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,OFFENSE_CLEAN,sentence_count
0,4,AA,1983-07-12,SELL SCHEDULE II,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,SELL SCHEDULE II,NONE,,0,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,2.0,4.0,SELL SCHEDULE II,0
1,6,AA,1973-01-30,WORTHLESS CHECK,1973-03-28,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,WORTHLESS CHECK,AB,1973-04-11,WORTHLESS CHECK,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,WORTHLESS CHECK,0


In [183]:
import datetime

In [184]:
holdOut = 0.2
randomState = 1234


In [185]:
#dataset_no_active = dataset_no_active_backup
#dataset_no_active_backup = dataset_no_active

In [186]:
# Train, val, test split:
start = datetime.datetime.now()

# get number of unique ids and the uniqe IDs
n_ID = len(dataset_no_active.ID.unique())
ids = pd.DataFrame(dataset_no_active.ID.unique())

# sample from IDs
train_index = ids.sample(round(n_ID*(1-holdOut)),random_state = randomState ).values.tolist()
train_index = [item for sublist in train_index for item in sublist]
# train data is data from any IDs that show up in train index
train_val = dataset_no_active[dataset_no_active['ID'].isin(train_index)]
# test data is data from any IDs that don't show up in train index
test_data = dataset_no_active[~dataset_no_active['ID'].isin(train_index)]

# repeat similar process for validate data
n_ID = len(train_val.ID.unique())
ids = pd.DataFrame(train_val.ID.unique())

# sample from IDs
train_index = ids.sample(round(n_ID*(1-holdOut)),random_state = randomState ).values.tolist()
train_index = [item for sublist in train_index for item in sublist]
# train data is data from any IDs that show up in train index
train_data = train_val[train_val['ID'].isin(train_index)]
# test data is data from any IDs that don't show up in train index
validate_data = train_val[~train_val['ID'].isin(train_index)]

stop = datetime.datetime.now()
print("Time Elapsed:", stop - start)  


Time Elapsed: 0:00:03.815541


In [187]:
train_data.shape

(524854, 33)

In [188]:
test_data.shape

(164921, 33)

In [189]:
validate_data.shape

(131539, 33)

In [190]:
# Sanity check

print("Total Number of Unique IDs:" , len(dataset_no_active.ID.unique()))
print("Total Number of IDs in Test Data:" , len(test_data.ID.unique()))
print("Total Number of IDs in Train Data:" , len(train_data.ID.unique()))
print("Total Number of IDs in Validate Data:" , len(validate_data.ID.unique()))

print("Do the IDs add up?" , len(test_data.ID.unique()) + len(train_data.ID.unique()) +  len(validate_data.ID.unique()) ==  len(dataset_no_active.ID.unique()))

print("Does Test Represent 20% of the data?", (len(test_data.ID.unique())/len(dataset_no_active.ID.unique())) == holdOut)
print("Test Represents X% of the data:", (len(test_data.ID.unique())/len(dataset_no_active.ID.unique())))
print("Does Train+Validate Represent 80% of the data?", len(train_data.ID.unique())+len(validate_data.ID.unique())/len(dataset_no_active.ID.unique()) == (1-holdOut))
print("Train+Validate Represents X% of the data:", (len(train_data.ID.unique())+len(validate_data.ID.unique()))/len(dataset_no_active.ID.unique()))
print("Does Validate Represent 20% of the Train+Validate Data?:", len(validate_data.ID.unique())/(len(train_data.ID.unique())+len(validate_data.ID.unique())))
print("Does Train Represent 20% of the Train+Validate Data?:", len(train_data.ID.unique())/(len(train_data.ID.unique())+len(validate_data.ID.unique())))


Total Number of Unique IDs: 444316
Total Number of IDs in Test Data: 88863
Total Number of IDs in Train Data: 284362
Total Number of IDs in Validate Data: 71091
Do the IDs add up? True
Does Test Represent 20% of the data? False
Test Represents X% of the data: 0.1999995498699124
Does Train+Validate Represent 80% of the data? False
Train+Validate Represents X% of the data: 0.8000004501300876
Does Validate Represent 20% of the Train+Validate Data?: 0.2000011253245858
Does Train Represent 20% of the Train+Validate Data?: 0.7999988746754142


In [191]:
# Sanity Check #2 - how representative are our datasets compared to the overall dataset
dataset_no_active.describe()

Unnamed: 0,ID,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,sentence_count
count,821314.0,397016.0,821314.0,821314.0,821271.0,821271.0,821314.0,821314.0,821314.0,781307.0,781307.0,781307.0,821314.0
mean,488843.2,3.624128,0.171411,3.249752,0.873464,1.011789,0.390327,0.323152,0.467961,2.313956,1.961114,2.723273,1.019291
std,370885.4,4.158104,0.376868,11.627107,1.548466,1.177446,0.961667,0.814176,1.145918,1.064602,0.979385,1.214607,1.614028
min,4.0,-265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,198942.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
50%,399746.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0
75%,704239.0,5.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,2.0,4.0,1.0
max,1638741.0,45.0,1.0,1009.0,125.0,78.0,5.0,5.0,5.0,5.0,5.0,5.0,42.0


In [192]:
test_data.describe()

Unnamed: 0,ID,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,sentence_count
count,164921.0,80157.0,164921.0,164921.0,164907.0,164907.0,164921.0,164921.0,164921.0,156790.0,156790.0,156790.0,164921.0
mean,489021.9,3.621493,0.173404,3.298876,0.875797,1.009848,0.395165,0.326944,0.473784,2.307673,1.954168,2.718177,1.024588
std,370774.4,4.135034,0.378598,11.580602,1.530071,1.173411,0.967567,0.818879,1.152613,1.063567,0.97716,1.215263,1.603956
min,6.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,199036.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
50%,399706.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0
75%,704759.0,5.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,2.0,4.0,1.0
max,1638691.0,42.0,1.0,590.0,67.0,48.0,5.0,5.0,5.0,5.0,5.0,5.0,26.0


In [193]:
train_data.describe()

Unnamed: 0,ID,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,sentence_count
count,524854.0,253263.0,524854.0,524854.0,524831.0,524831.0,524854.0,524854.0,524854.0,499345.0,499345.0,499345.0,524854.0
mean,488571.8,3.623846,0.170697,3.222081,0.87132,1.0126,0.388668,0.321973,0.465773,2.315545,1.962939,2.724609,1.017138
std,370963.1,4.168319,0.376244,11.542112,1.540026,1.182059,0.959678,0.813035,1.143334,1.064989,0.98026,1.214526,1.617431
min,4.0,-265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,198644.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
50%,399177.5,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0
75%,704033.0,5.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,2.0,4.0,1.0
max,1638741.0,44.0,1.0,1009.0,103.0,78.0,5.0,5.0,5.0,5.0,5.0,5.0,42.0


In [194]:
validate_data.describe()

Unnamed: 0,ID,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,sentence_count
count,131539.0,63596.0,131539.0,131539.0,131533.0,131533.0,131539.0,131539.0,131539.0,125172.0,125172.0,125172.0,131539.0
mean,489701.9,3.628577,0.171759,3.298573,0.879095,1.010986,0.39088,0.323098,0.469389,2.315486,1.962532,2.724323,1.021241
std,370715.8,4.146419,0.377172,12.016407,1.603973,1.163983,0.962164,0.812803,1.147778,1.06433,0.978646,1.214104,1.613017
min,8.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,199723.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
50%,401918.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0
75%,704783.0,5.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,2.0,4.0,1.0
max,1638726.0,45.0,1.0,670.0,125.0,40.0,5.0,5.0,5.0,5.0,5.0,5.0,21.0


In [195]:
active_sentences.describe()

Unnamed: 0,ID,Time_Diff,Recidivate,INFRACTION_PER_SENT,misd_count,felon_count,Recidivate_Risk_Level,Recidivate_Risk_Level_Lenient,Recidivate_Risk_Level_Harsh,Current_Offense_Risk_Level,Current_Offense_Risk_Level_Lenient,Current_Offense_Risk_Level_Harsh,sentence_count
count,32801.0,0.0,0.0,32801.0,32801.0,32801.0,0.0,0.0,0.0,26557.0,26557.0,26557.0,32801.0
mean,896960.9,,,20.133563,0.104082,2.236731,,,,3.44798,2.976315,3.781263,1.521935
std,479394.2,,,41.599285,0.48062,2.013134,,,,1.181862,1.175837,1.097834,2.047846
min,80.0,,,0.0,0.0,0.0,,,,1.0,1.0,1.0,0.0
25%,512314.0,,,1.0,0.0,1.0,,,,3.0,2.0,3.0,0.0
50%,893441.0,,,6.0,0.0,2.0,,,,3.0,3.0,4.0,1.0
75%,1333759.0,,,22.0,0.0,3.0,,,,5.0,4.0,5.0,2.0
max,1638770.0,,,989.0,20.0,87.0,,,,5.0,5.0,5.0,27.0


In [None]:
# Distributions look pretty good across train,test, and validate compared to dataset minus active
# Active sentences don't look as close (most variables are fine, not Infractions though) but there's no reason 
# currently incarcerated people would be a random sample of historical sentences 

# Next need to 