In [8]:
# The purpose of this notebook is to carry out the following:
    # explore features we already have and create new ones
    # once that is done, carry out an appropriate train/test/validate split - taking temporal validation into account
    # pre processing (impute missingness separately for train/test/validate, normalize, etc)

# The datasets used here are the ones created by build_dataset. They are
# too big for GitHub, so they have been uploaded to Google Drive. They should
# be downloaded into your local data folder so they can be imported in this notebook.

# The end goal of this notebook is to have our final datasets ready for analysis.
# This code should then be moved into our .py files so the notebook can eventually be deleted.

In [1]:
import sqlite3
from sqlite3 import Error
import pandas as pd
import config
import os.path
from os import path

from create_db import create_connection, create_table, clean_column_names
from populate_db import extract_data, insert_records
import query_db as qd

import importlib

import datetime
import re
import numpy as np

In [2]:
# Remove pandas' display limits on both axes so that frames and Series are
# rendered in full instead of being elided.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
# Load the CSV files produced by build_dataset from the local data folder.
# The two reads are kept as separate statements so that a failure on the
# second file still leaves dataset_main defined.

dataset_main = pd.read_csv(filepath_or_buffer='../data/dataset_main.csv')
dataset_active_sentences = pd.read_csv(filepath_or_buffer='../data/active_sentences.csv')

In [6]:
# Preview the first five rows of dataset_main.
dataset_main.head(n=5)

Unnamed: 0,ID,COMMITMENT_PREFIX,EARLIEST_SENTENCE_EFFECTIVE_DT,MOST_SERIOUS_OFFENSE_CODE,INMATE_COMPUTATION_STATUS_FLAG,END_DATE,PROJ_END_DATE,INMATE_RECORD_STATUS_CODE,INMATE_ADMIN_STATUS_CODE,DATE_OF_LAST_INMATE_MOVEMENT,TYPE_OF_LAST_INMATE_MOVEMENT,CURRENT_COMMITMENT_PREFIX,CONTROL_STATUS,GENDER,RACE,BIRTH_DATE,STATE_BORN,ETHNICITY,CITIZENSHIP,DISCIPLINARY_INFRACTION_COUNT,NextPrefix,NextStart,NextOffense,new_col,Time_Diff,Recidivate,Primary offense code,Description (if needed),Decided Category,Needed a check?,More lenient,More harsh
0,4,AA,1983-07-12,,EXPIRED,1984-07-11,1984-07-11,INACTIVE,INACTIVE,1984-07-11,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1961-10-15,ALASKA,UNKNOWN,BORN IN U.S.,,0,,0.0,0,,0.0,,,0.0,,0.0,0.0
1,6,AA,1973-01-30,,EXPIRED,1973-03-28,0001-01-01,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,AB,1973-04-11,,1973,0.0,1.0,,,,,,
2,6,AB,1973-04-11,,EXPIRED,1975-08-18,1974-08-10,INACTIVE,INACTIVE,1975-08-18,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1951-07-17,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,0,,0.0,0,,0.0,,,0.0,,0.0,0.0
3,8,AA,1990-04-09,,EXPIRED,1990-05-17,1990-10-09,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,AB,1993-08-30,,1993,3.0,0.0,,,0.0,,0.0,0.0
4,8,AB,1993-08-30,,EXPIRED,1994-01-26,1994-02-18,INACTIVE,INACTIVE,1995-09-14,TERMINATED PAROLE,,REGULAR POPULATION RPOP,MALE,WHITE,1963-12-29,NORTH CAROLINA,UNKNOWN,BORN IN U.S.,,BA,1995-01-02,,1995,1.0,1.0,,,,,,


In [4]:
# Count the missing values in each column of dataset_main.
dataset_main.isna().sum()

ID                                     0
COMMITMENT_PREFIX                      0
EARLIEST_SENTENCE_EFFECTIVE_DT         0
MOST_SERIOUS_OFFENSE_CODE         308659
INMATE_COMPUTATION_STATUS_FLAG         0
END_DATE                               0
PROJ_END_DATE                          0
INMATE_RECORD_STATUS_CODE           6216
INMATE_ADMIN_STATUS_CODE            6216
DATE_OF_LAST_INMATE_MOVEMENT        6216
TYPE_OF_LAST_INMATE_MOVEMENT       15212
CURRENT_COMMITMENT_PREFIX         202165
CONTROL_STATUS                      6216
GENDER                                 0
RACE                                   2
BIRTH_DATE                             0
STATE_BORN                         29626
ETHNICITY                            692
CITIZENSHIP                          669
DISCIPLINARY_INFRACTION_COUNT     310201
NextPrefix                             0
NextStart                         430781
NextOffense                       111635
new_col                                0
Time_Diff       

In [7]:
# Decisions to be made:
    # Which variables to keep?
    # Which features need to be constructed from the available variables?
    # How do we want to deal with missings?
        # e.g. impute? choose majority? some notes here: https://towardsdatascience.com/working-with-missing-data-in-machine-learning-9c0a430df4ce
        
# A first pass, following this group: https://bucklerd.github.io/MUSA801_Recidivism_Markdown/#
    # Race
    # Sex
    # Age at each sentence - need to calculate using birth date and effective sentence begin date
    # Age category each sentence
    # Ethnicity
    # Citizenship
    # Age at first incarceration? Could compute as age at first sentence
    # Most serious current offense (since there are so many categories, do we want to map our coded 5-point scale
        # onto this?)
    # Current crime violent or not violent (not sure where they are getting this from, or if it's self-constructed)
    # Total sentence count - can be computed - lots of bias baked in? - would be at the individual level?
    # Juvenile Offense Flag - would need to construct using age at first incarceration 
    # total count of felony and misdemeanor charges - i think these can be calculated from sentence component
        # would be at the individual level not sentence level?
    # custody_class_code - I think this is CONTROL_STATUS
        # individual level not sentence level
    # special characteristics - i didn't really know how to make sense of this, so i didn't include it for now...
    # total disciplinary infractions - would be at the individual level not the sentence level
        # although this comes from a file that has infraction by date so in theory we could calculate
        # how many infractions between each start/end date of the sentence, but this would be a bit more complicated
    # Type of last inmate movement - we have this, but I'm not sure how much value it adds. Also, it's at the
        # offender level, not sentence level
    

# Thoughts on missingness:
    # Race, Sex, Birth date - basically not missing; we can drop or impute the couple that are
    # Ethnicity and Citizenship - Majority impute?
    # Most serious current offense - already working on trying to make this less missing
    # Disciplinary infractions - the way this is constructed is by merging on from a file that contains
        # infractions. So i think it is safe to assume that if this variable is missing for an individual,
        # they did not commit any infractions. replace with 0?
    # 
        
    
    