# Expungement Eligibility Classification - Parallel Featurization

## Dask Transformations

Loading up the Dask client is necessary to run processes on multiple workers

In [1]:
from datetime import datetime
import os

import sqlalchemy as sa
from sqlalchemy.sql import select
from sqlalchemy import (
    Table, 
    Column, 
    Integer, 
    String, 
    MetaData, 
    DateTime,
    or_
)
import pandas as pd
import numpy as np
import dask.dataframe as dd

In [2]:
from distributed import Client

client = Client(n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 117.93 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33095,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 117.93 GiB

0,1
Comm: tcp://127.0.0.1:43113,Total threads: 4
Dashboard: http://127.0.0.1:45343/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:42859,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-syesmqwr,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-syesmqwr

0,1
Comm: tcp://127.0.0.1:46523,Total threads: 4
Dashboard: http://127.0.0.1:46565/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:33087,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-sb8mpit0,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-sb8mpit0

0,1
Comm: tcp://127.0.0.1:33481,Total threads: 4
Dashboard: http://127.0.0.1:39121/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:34201,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-er371qi_,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-er371qi_

0,1
Comm: tcp://127.0.0.1:45755,Total threads: 4
Dashboard: http://127.0.0.1:39397/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:41459,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-p58dd_sn,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-p58dd_sn


### Data Loading

In [3]:
USER = 'jupyter'
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

DATABASE_URI = f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}"
engine = sa.create_engine(DATABASE_URI)

Loading SQL extension for useful spot-checking

In [4]:
%load_ext sql
%sql {DATABASE_URI}

Dask DataFrame does not accept raw SQL, but will accept a SQLAlchemy ORM query object. We are using this to read data from `expunge` sorted by both `person_id` and `HearingDate`. This ordering is important for some of the partitioned aggregations. 

In [5]:
metadata_obj = MetaData()
expunge = Table('expunge_clean', metadata_obj, # Full Dataset
# expunge = Table('expunge_10k_clean', metadata_obj, # ~26K records
# expunge = Table('expunge_1k_clean', metadata_obj, # ~26K records
     Column('person_id', Integer),
     Column('HearingDate', DateTime),
     Column('CodeSection', String),
     Column('ChargeType', String),
     Column('Class', String),
     Column('DispositionCode', String),
     Column('Plea', String),
     Column('Race', String),
     Column('Sex', String),
     Column('fips', Integer),
)

Here you can see the raw query string to which the `query` SQLAlchemy object translates

In [6]:
query = (
    select(expunge)
    # Where clause just for testing, comment out for full run
    .where(
        or_(
            expunge.c.person_id == 127051000000102, 
            expunge.c.person_id == 224010000000817,
            expunge.c.person_id == 1000000000362
        )
    )
    .order_by(expunge.c.person_id, expunge.c.HearingDate)
)
print(str(query))

SELECT expunge_clean.person_id, expunge_clean."HearingDate", expunge_clean."CodeSection", expunge_clean."ChargeType", expunge_clean."Class", expunge_clean."DispositionCode", expunge_clean."Plea", expunge_clean."Race", expunge_clean."Sex", expunge_clean.fips 
FROM expunge_clean 
WHERE expunge_clean.person_id = :person_id_1 OR expunge_clean.person_id = :person_id_2 OR expunge_clean.person_id = :person_id_3 ORDER BY expunge_clean.person_id, expunge_clean."HearingDate"


In [7]:
meta_dict = {
    'HearingDate': 'datetime64[ns]',
    'CodeSection': str,
    'ChargeType': str,
    'Class': str,
    'DispositionCode': str,
    'Plea': str,
    'Race': str,
    'Sex': str,
    'fips': 'int64'
}

meta_frame = pd.DataFrame(columns=meta_dict.keys()).astype(meta_dict)

meta_frame.dtypes

HearingDate        datetime64[ns]
CodeSection                object
ChargeType                 object
Class                      object
DispositionCode            object
Plea                       object
Race                       object
Sex                        object
fips                        int64
dtype: object

In [8]:
%%time
df = dd.read_sql_table(
    table=query,
    index_col='person_id',
    uri=DATABASE_URI,
#     npartitions=32,
#     npartitions=8
    meta=meta_frame
)

CPU times: user 31.9 ms, sys: 3.96 ms, total: 35.9 ms
Wall time: 45.3 ms


In [9]:
df

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000000000000.0,datetime64[ns],object,object,object,object,object,object,object,int64
224010000000000.0,...,...,...,...,...,...,...,...,...


In [10]:
pd.set_option('max_columns', None)

In [11]:
df.head()

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87


Number of partitions that the data is split into. Essentially, `npartitions` is equal to the number of separate Pandas DataFrames that Dask is operating on under the hood

In [12]:
df.npartitions

1

These divisions are the cutoffs for the various partitions. Dask automatically generates the divisions, splitting data into ~100-250mb Pandas DataFrames. 

Since `person_id` is the index, Dask will guarantee that a given `person_id` always falls entirely within a single partition. This is important for performing aggregations on a single person_id without shuffling records across nodes.

In [13]:
df.divisions[:5]

(1000000000362.0, 224010000000817.0)

### Data Cleaning & Featurization

In [14]:
df['CodeSection'] = df['CodeSection'].fillna('MISSING')

In [15]:
VALID_DISPOSITIONS = [
    'Guilty',
    'Guilty In Absentia',
    'Dismissed',
    'Nolle Prosequi',
    'Not Guilty',
    'Not Guilty/Acquitted',
    'No Indictment Presented',
    'Not True Bill',
    'Dismissed/Other'
]

df = df[
    (~df['DispositionCode'].isna())
    & (df['DispositionCode'].isin(VALID_DISPOSITIONS))
]

In [16]:
%%time
DISPOSITION_MAP = {
    'Nolle Prosequi': 'Dismissed',
    'No Indictment Presented': 'Dismissed',
    'Not True Bill': 'Dismissed',
    'Dismissed/Other': 'Dismissed',
    'Not Guilty': 'Dismissed',
    'Not Guilty/Acquitted': 'Dismissed',
    'Guilty In Absentia': 'Conviction',
    'Guilty': 'Conviction',
}

df['disposition'] = df['DispositionCode'].replace(DISPOSITION_MAP)

df.head()

CPU times: user 56.8 ms, sys: 851 µs, total: 57.6 ms
Wall time: 493 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction


In [17]:
%%time
deferral_pleas = [
    'Alford',
    'Guilty',
    'Nolo Contendere'
]

deferral_conditions = (
    (df['Plea'].isin(deferral_pleas))
    & (df['disposition']=='Dismissed')
)

df['disposition'] = df['disposition'].mask(deferral_conditions, 'Deferral Dismissal')

df[df['disposition']=='Deferral Dismissal'].head()

CPU times: user 46.2 ms, sys: 2.83 ms, total: 49 ms
Wall time: 113 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [18]:
%%time
df['chargetype'] = df['ChargeType']

df.head()

CPU times: user 62.9 ms, sys: 7.15 ms, total: 70.1 ms
Wall time: 512 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony


In [19]:
COVERED_SECTIONS_A = [
    '4.1-305', 
    '18.2-250.1'
]

COVERED_SECTIONS_B = [
    '4.1-305',
    '18.2-96',
    '18.2-103',
    '18.2-119',
    '18.2-120',
    '18.2-134',
    '18.2-250.1',
    '18.2-415'
]

COVERED_SECTIONS_B_MISDEMEANOR = [
    '18.2-248.1'
]

EXCLUDED_SECTIONS_TWELVE = [
    '18.2-36.1',
    '18.2-36.2',
    '18.2-51.4',
    '18.2-51.5',
    '18.2-57.2',
    '18.2-266',
    '46.2-341.24'
]

In [20]:
def assign_code_section(row):
    if (
        row['CodeSection'] in COVERED_SECTIONS_A 
        and row['disposition']=='Deferral Dismissal'
    ):
        return 'covered in 19.2-392.6 - A'
    
    elif (
        row['CodeSection'] in COVERED_SECTIONS_B
        or (
            row['CodeSection'] in COVERED_SECTIONS_B_MISDEMEANOR
            and row['chargetype']=='Misdemeanor'
        )
    ):
        return 'covered in 19.2-392.6 - B'
    
    elif row['CodeSection'] in EXCLUDED_SECTIONS_TWELVE:
        return 'excluded by 19.2-392.12'
    
    else:
        return 'covered elsewhere'

In [21]:
%%time
df['codesection'] = df.map_partitions(
    lambda df: df.apply(assign_code_section, axis=1),
    meta=pd.Series(dtype=str)
)

df.head()

CPU times: user 59.7 ms, sys: 937 µs, total: 60.6 ms
Wall time: 126 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere


In [22]:
def has_conviction(df):
    conviction_map = (df['disposition']
              .apply(lambda x: x=='Conviction')
              .groupby('person_id')
              .any())
    
    return df.index.map(conviction_map)

In [23]:
%%time
df['convictions'] = df.map_partitions(
    has_conviction,
    meta=pd.Series(dtype=bool)
)

df.head()

CPU times: user 41.9 ms, sys: 6.72 ms, total: 48.6 ms
Wall time: 107 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True


**Question** - What about same day hearings?

In [24]:
def shift_hearing_date(df, shift_by):
    return (
        df.groupby('person_id')['HearingDate']
          .shift(periods=shift_by)
    )

In [25]:
%%time
df['last_hearing_date'] = df.map_partitions(
    shift_hearing_date,
    shift_by=1,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 51.9 ms, sys: 627 µs, total: 52.5 ms
Wall time: 116 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,NaT
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23


In [26]:
%%time
df['next_hearing_date'] = df.map_partitions(
    shift_hearing_date,
    shift_by=-1,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 41.8 ms, sys: 6.47 ms, total: 48.3 ms
Wall time: 112 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-04-21
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22,2009-05-28
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21,2010-11-23
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28,2011-03-02
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23,2016-05-16


In [27]:
def get_felony_conviction_dates(df):
    return np.where(
        (df['disposition']=='Conviction') & (df['chargetype']=='Felony'), 
        df['HearingDate'],
        np.datetime64('NaT')
    )

In [28]:
%%time
df['felony_conviction_date'] = df.map_partitions(
    get_felony_conviction_dates,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 44.1 ms, sys: 7.85 ms, total: 52 ms
Wall time: 116 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-04-21,NaT
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22,2009-05-28,NaT
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21,2010-11-23,2009-05-28
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28,2011-03-02,NaT
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23,2016-05-16,2011-03-02


In [29]:
def get_last_felony_conviction_date(df):
    return (
        df['felony_conviction_date']
            .groupby('person_id')
            .shift(1)
            .groupby('person_id')
            .ffill()
            .fillna(pd.NaT)
    )

In [30]:
%%time
df['last_felony_conviction_date'] = df.map_partitions(
    get_last_felony_conviction_date,
    meta=pd.Series(dtype='datetime64[ns]')
)
df = df.drop('felony_conviction_date', axis='columns')

df.head()

CPU times: user 65.9 ms, sys: 2.14 ms, total: 68.1 ms
Wall time: 140 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,last_felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-04-21,NaT
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22,2009-05-28,NaT
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21,2010-11-23,NaT
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28,2011-03-02,2009-05-28
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23,2016-05-16,2009-05-28


In [31]:
%%time
df['days_since_last_hearing'] = df['HearingDate'] - df['last_hearing_date']
df['days_until_next_hearing'] = df['next_hearing_date'] - df['HearingDate']
df['days_since_last_felony_conviction'] = df['HearingDate'] - df['last_felony_conviction_date']

df.head()

CPU times: user 77.2 ms, sys: 1.64 ms, total: 78.9 ms
Wall time: 151 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,last_felony_conviction_date,days_since_last_hearing,days_until_next_hearing,days_since_last_felony_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-04-21,NaT,NaT,1672 days,NaT
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1.0,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22,2009-05-28,NaT,1672 days,37 days,NaT
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21,2010-11-23,NaT,37 days,544 days,NaT
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28,2011-03-02,2009-05-28,544 days,99 days,544 days
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23,2016-05-16,2009-05-28,99 days,1902 days,643 days


In [32]:
df.head(20)[['disposition','chargetype','HearingDate','last_felony_conviction_date','days_since_last_felony_conviction']]

Unnamed: 0_level_0,disposition,chargetype,HearingDate,last_felony_conviction_date,days_since_last_felony_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000000362,Conviction,Misdemeanor,2004-09-22,NaT,NaT
1000000000362,Conviction,Misdemeanor,2009-04-21,NaT,NaT
1000000000362,Conviction,Felony,2009-05-28,NaT,NaT
1000000000362,Dismissed,Felony,2010-11-23,2009-05-28,544 days
1000000000362,Conviction,Felony,2011-03-02,2009-05-28,643 days
1000000000362,Conviction,Felony,2016-05-16,2011-03-02,1902 days
1000000000362,Dismissed,Felony,2016-05-17,2016-05-16,1 days
1000000000362,Conviction,Felony,2016-07-08,2016-05-16,53 days
1000000000362,Conviction,Felony,2016-07-08,2016-07-08,0 days
127051000000102,Dismissed,Misdemeanor,2008-12-01,NaT,NaT


## To Do - Features
1. `days_until_next_conviction`
2. `class_3_or_4_last_20`
3. `class_1_or_2`

### Writing and Loading Data
1. Write data to csv in `/tmp` directory
2. Load data to PostGres via `COPY` statements

This approach is *much* faster than loading via `df.to_sql`, since PostGres will help us load many records at once, instead of loading 1 by 1 via `INSERT` statements

In [33]:
target_dir = '/tmp/expunge_data'
target_glob = f'{target_dir}/expunge_features-*.csv'

return_val = os.system(f'rm -rf {target_glob}')

In [34]:
%%time
file_paths = df.to_csv(target_glob)

file_paths[:5]

CPU times: user 52.3 ms, sys: 9.62 ms, total: 62 ms
Wall time: 139 ms


['/tmp/expunge_data/expunge_features-0.csv']

Useful pandas functionality to approximate the SQL statement to create a table

In [35]:
from pandas.io.sql import get_schema

In [36]:
print(get_schema(df.head(), 'expunge_features'))

CREATE TABLE "expunge_features" (
"HearingDate" TIMESTAMP,
  "CodeSection" TEXT,
  "ChargeType" TEXT,
  "Class" TEXT,
  "DispositionCode" TEXT,
  "Plea" TEXT,
  "Race" TEXT,
  "Sex" TEXT,
  "fips" INTEGER,
  "disposition" TEXT,
  "chargetype" TEXT,
  "codesection" TEXT,
  "convictions" INTEGER,
  "last_hearing_date" TIMESTAMP,
  "next_hearing_date" TIMESTAMP,
  "last_felony_conviction_date" TIMESTAMP,
  "days_since_last_hearing" INTEGER,
  "days_until_next_hearing" INTEGER,
  "days_since_last_felony_conviction" INTEGER
)


  frame, name, keys=keys, dtype=dtype, schema=schema


We're truncating before loading to avoid duplicate rows on re-runs

In [37]:
%%sql
DROP TABLE expunge_features;

 * postgresql://jupyter:***@localhost:5432/expunge
Done.


[]

In [38]:
engine.execute("""
    CREATE TABLE IF NOT EXISTS expunge_features (
        person_id BIGINT,
        "HearingDate" DATE,
        "CodeSection" TEXT,
        "ChargeType" TEXT,
        "Class" TEXT,
        "DispositionCode" TEXT,
        "Plea" TEXT,
        "Race" TEXT,
        "Sex" TEXT,
        "fips" INTEGER,
        "disposition" TEXT,
        "chargetype" TEXT,
        "codesection" TEXT,
        "convictions" BOOLEAN,
        "last_hearing_date" DATE,
        "next_hearing_date" DATE,
        "last_felony_conviction_date" DATE,
        "days_since_last_hearing" TEXT,
        "days_until_next_hearing" TEXT,
        "days_since_last_felony_conviction" TEXT
    );
    
    TRUNCATE TABLE expunge_features;
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f5ee1261c50>

These `COPY` statements do all of the data loading from CSVs

In [39]:
for path in file_paths:
    engine.execute(f"""
        COPY expunge_features
        FROM '{path}'
        WITH CSV HEADER;
        commit;
    """)

Make sure the data made it to the database

In [40]:
%%sql
SELECT COUNT(*)
FROM expunge_features

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
23


### Notes/Questions

- `ChargeType` and `chargetype` appear the same in `expunge` - is that because of cleaning done post-load?

### Added Columns
- `last_hearing_date`
- `last_felony_conviction_date`
- `next_hearing_date`
- `days_since_last_hearing`
- `days_since_last_felony_conviction`
- `days_until_next_hearing`

In [41]:
%%sql
SELECT *
FROM expunge_features
LIMIT 10

 * postgresql://jupyter:***@localhost:5432/expunge
10 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,last_felony_conviction_date,days_since_last_hearing,days_until_next_hearing,days_since_last_felony_conviction
1000000000362,2004-09-22,C.18.2-266,Misdemeanor,,Guilty,Guilty,Black,Male,760,Conviction,Misdemeanor,covered elsewhere,True,,2009-04-21,,,1672 days,
1000000000362,2009-04-21,18.2-268.3,Misdemeanor,1,Guilty,Guilty,Black,Male,87,Conviction,Misdemeanor,covered elsewhere,True,2004-09-22,2009-05-28,,1672 days,37 days,
1000000000362,2009-05-28,C.18.2-266,Felony,,Guilty,Guilty,Black,Male,87,Conviction,Felony,covered elsewhere,True,2009-04-21,2010-11-23,,37 days,544 days,
1000000000362,2010-11-23,A.46.2-391D,Felony,,Not Guilty/Acquitted,Not Guilty,Black,Male,87,Dismissed,Felony,covered elsewhere,True,2009-05-28,2011-03-02,2009-05-28,544 days,99 days,544 days
1000000000362,2011-03-02,C.18.2-266,Felony,,Guilty,,Black,Male,87,Conviction,Felony,covered elsewhere,True,2010-11-23,2016-05-16,2009-05-28,99 days,1902 days,643 days
1000000000362,2016-05-16,C.18.2-266,Felony,6,Guilty,,Black,Male,760,Conviction,Felony,covered elsewhere,True,2011-03-02,2016-05-17,2011-03-02,1902 days,1 days,1902 days
1000000000362,2016-05-17,18.2-272A,Felony,6,Nolle Prosequi,,Black,Male,730,Dismissed,Felony,covered elsewhere,True,2016-05-16,2016-07-08,2016-05-16,1 days,52 days,1 days
1000000000362,2016-07-08,F.18.2-266,Felony,6,Guilty,,Black,Male,730,Conviction,Felony,covered elsewhere,True,2016-05-17,2016-07-08,2016-05-16,52 days,0 days,53 days
1000000000362,2016-07-08,46.2-391D2,Felony,U,Guilty,,Black,Male,730,Conviction,Felony,covered elsewhere,True,2016-07-08,,2016-07-08,0 days,,0 days
127051000000102,2008-12-01,4.1-308,Misdemeanor,4,Nolle Prosequi,,White,Female,1,Dismissed,Misdemeanor,covered elsewhere,True,,2012-01-30,,,1155 days,


## Tables for Testing

Move 10k person_id's from clean table into materialized view for testing

In [52]:
%%sql
CREATE MATERIALIZED VIEW expunge_10k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 10000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

 * postgresql://jupyter:***@localhost:5432/expunge
(psycopg2.errors.DuplicateTable) relation "expunge_10k_clean" already exists

[SQL: CREATE MATERIALIZED VIEW expunge_10k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 10000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [21]:
%%sql
CREATE MATERIALIZED VIEW expunge_1k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 1000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

 * postgresql://jupyter:***@localhost:5432/expunge
2422 rows affected.


[]

In [29]:
%%sql
SELECT *
FROM expunge_1k_clean
WHERE person_id = 1000000000003

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
1000000000003,2015-10-21,A.46.2-853,covered elsewhere,Misdemeanor,Misdemeanor,,Nolle Prosequi,Dismissed,,White,Male,153,False,False,False,False,False,True,True,False,False,Automatic,True,Automatic,Dismissal of misdemeanor charges with no arrests or charges in the past 3 years and no convictions on the person's record,False,False


In [13]:
%%sql
SELECT *
FROM expunge_10k_clean
ORDER BY person_id, "HearingDate"
LIMIT 10

 * postgresql://jupyter:***@localhost:5432/expunge
10 rows affected.


person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
