# Expungement Eligibility Classification - Parallel Featurization

## Dask Transformations

Loading up the Dask client is necessary to run processes on multiple workers

In [1]:
from datetime import datetime
import os

import sqlalchemy as sa
from sqlalchemy.sql import select
from sqlalchemy import (
    Table, 
    Column, 
    Integer, 
    String, 
    MetaData, 
    DateTime,
    or_
)
import pandas as pd
import numpy as np
import dask.dataframe as dd

In [2]:
from distributed import Client

client = Client(n_workers=4)
client

distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter-isaak-a/dask-worker-space/worker-dezj0vs1', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter-isaak-a/dask-worker-space/worker-fe_mii34', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter-isaak-a/dask-worker-space/worker-thtwkaiz', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/jupyter-isaak-a/dask-worker-space/worker-yj4yt2ef', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 117.93 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36105,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 117.93 GiB

0,1
Comm: tcp://127.0.0.1:38035,Total threads: 4
Dashboard: http://127.0.0.1:36089/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:42301,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-cffn81mg,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-cffn81mg

0,1
Comm: tcp://127.0.0.1:40149,Total threads: 4
Dashboard: http://127.0.0.1:37299/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:45353,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-oad603o1,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-oad603o1

0,1
Comm: tcp://127.0.0.1:44383,Total threads: 4
Dashboard: http://127.0.0.1:46481/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:38453,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-dtrs92b5,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-dtrs92b5

0,1
Comm: tcp://127.0.0.1:44113,Total threads: 4
Dashboard: http://127.0.0.1:40675/status,Memory: 29.48 GiB
Nanny: tcp://127.0.0.1:32871,
Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-gqv9id_t,Local directory: /home/jupyter-isaak-a/dask-worker-space/worker-gqv9id_t


### Data Loading

In [3]:
USER = 'jupyter'
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

DATABASE_URI = f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}"
engine = sa.create_engine(DATABASE_URI)

Loading SQL extension for useful spot-checking

In [4]:
%load_ext sql
%sql {DATABASE_URI}

Dask DataFrame does not accept raw SQL, but will accept a SQLAlchemy ORM query object. We are using this to read data from `expunge` sorted by both `person_id` and `HearingDate`. This ordering is important for some of the partitioned aggregations. 

In [5]:
metadata_obj = MetaData()
expunge = Table('expunge_clean', metadata_obj, # Full Dataset
# expunge = Table('expunge_10k_clean', metadata_obj, # ~26K records
# expunge = Table('expunge_1k_clean', metadata_obj, # ~26K records
     Column('person_id', Integer),
     Column('HearingDate', DateTime),
     Column('CodeSection', String),
     Column('ChargeType', String),
     Column('Class', String),
     Column('DispositionCode', String),
     Column('Plea', String),
     Column('Race', String),
     Column('Sex', String),
     Column('fips', Integer),
)

Here you can see the raw query string to which the `query` SQLAlchemy object translates

In [6]:
query = (
    select(expunge)
    # Where clause just for testing, comment out for full run
#     .where(
#         or_(
#             expunge.c.person_id == 127051000000102, 
#             expunge.c.person_id == 224010000000817,
#             expunge.c.person_id == 1000000000362
#         )
#     )
    .order_by(expunge.c.person_id, expunge.c.HearingDate)
)
print(str(query))

SELECT expunge_clean.person_id, expunge_clean."HearingDate", expunge_clean."CodeSection", expunge_clean."ChargeType", expunge_clean."Class", expunge_clean."DispositionCode", expunge_clean."Plea", expunge_clean."Race", expunge_clean."Sex", expunge_clean.fips 
FROM expunge_clean ORDER BY expunge_clean.person_id, expunge_clean."HearingDate"


In [7]:
meta_dict = {
    'HearingDate': 'datetime64[ns]',
    'CodeSection': str,
    'ChargeType': str,
    'Class': str,
    'DispositionCode': str,
    'Plea': str,
    'Race': str,
    'Sex': str,
    'fips': 'int64'
}

meta_frame = pd.DataFrame(columns=meta_dict.keys()).astype(meta_dict)

meta_frame.dtypes

HearingDate        datetime64[ns]
CodeSection                object
ChargeType                 object
Class                      object
DispositionCode            object
Plea                       object
Race                       object
Sex                        object
fips                        int64
dtype: object

In [8]:
%%time
df = dd.read_sql_table(
    table=query,
    index_col='person_id',
    uri=DATABASE_URI,
    npartitions=32,
#     npartitions=8
    meta=meta_frame
)

CPU times: user 661 ms, sys: 96.5 ms, total: 757 ms
Wall time: 13.9 s


In [9]:
df

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
npartitions=32,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.000000e+12,datetime64[ns],object,object,object,object,object,object,object,int64
1.241409e+13,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
3.548369e+14,...,...,...,...,...,...,...,...,...
3.662510e+14,...,...,...,...,...,...,...,...,...


In [10]:
pd.set_option('max_columns', None)

In [11]:
df.head()

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740


Number of partitions that the data is split into. Essentially, `npartitions` is equal to the number of separate Pandas DataFrames that Dask is operating on under the hood

In [12]:
df.npartitions

32

These divisions are the cutoffs for the various partitions. Dask automatically generates the divisions, splitting data into ~100-250mb Pandas DataFrames. 

Since `person_id` is the index, Dask will guarantee that a given `person_id` always falls entirely within a single partition. This is important for performing aggregations on a single person_id without shuffling records across nodes.

In [13]:
df.divisions[:5]

(1000000000002.0,
 12414093750002.969,
 23828187500003.938,
 35242281250004.91,
 46656375000005.875)

### Data Cleaning & Featurization

In [14]:
df['CodeSection'] = df['CodeSection'].fillna('MISSING')

In [15]:
VALID_DISPOSITIONS = [
    'Guilty',
    'Guilty In Absentia',
    'Dismissed',
    'Nolle Prosequi',
    'Not Guilty',
    'Not Guilty/Acquitted',
    'No Indictment Presented',
    'Not True Bill',
    'Dismissed/Other'
]

df = df[
    (~df['DispositionCode'].isna())
    & (df['DispositionCode'].isin(VALID_DISPOSITIONS))
]

In [16]:
%%time
DISPOSITION_MAP = {
    'Nolle Prosequi': 'Dismissed',
    'No Indictment Presented': 'Dismissed',
    'Not True Bill': 'Dismissed',
    'Dismissed/Other': 'Dismissed',
    'Not Guilty': 'Dismissed',
    'Not Guilty/Acquitted': 'Dismissed',
    'Guilty In Absentia': 'Conviction',
    'Guilty': 'Conviction',
}

df['disposition'] = df['DispositionCode'].replace(DISPOSITION_MAP)

df.head()

CPU times: user 285 ms, sys: 53.6 ms, total: 339 ms
Wall time: 5.89 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed


In [17]:
%%time
deferral_pleas = [
    'Alford',
    'Guilty',
    'Nolo Contendere'
]

deferral_conditions = (
    (df['Plea'].isin(deferral_pleas))
    & (df['disposition']=='Dismissed')
)

df['disposition'] = df['disposition'].mask(deferral_conditions, 'Deferral Dismissal')

df[df['disposition']=='Deferral Dismissal'].head()

CPU times: user 300 ms, sys: 53 ms, total: 353 ms
Wall time: 6.06 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000000000037,2012-06-01,46.2-2812,Misdemeanor,1,Dismissed,Guilty,White,Male,107,Deferral Dismissal
1000000000565,2017-05-26,46.2-300,Misdemeanor,,Dismissed,Guilty,Hispanic,Male,153,Deferral Dismissal
1000000001176,2010-06-15,18.2-250.1,Misdemeanor,U,Not Guilty,Nolo Contendere,Unknown,Male,59,Deferral Dismissal
1000000001290,2014-08-06,41.1-2-2,Misdemeanor,,Nolle Prosequi,Guilty,White,Male,59,Deferral Dismissal
1000000001537,2011-05-17,18.2-53.1,Felony,U,Nolle Prosequi,Nolo Contendere,Black,Male,760,Deferral Dismissal


In [18]:
%%time
df['chargetype'] = df['ChargeType']

df.head()

CPU times: user 285 ms, sys: 72.6 ms, total: 358 ms
Wall time: 6.17 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor


In [19]:
COVERED_SECTIONS_A = [
    '4.1-305', 
    '18.2-250.1'
]

COVERED_SECTIONS_B = [
    '4.1-305',
    '18.2-96',
    '18.2-103',
    '18.2-119',
    '18.2-120',
    '18.2-134',
    '18.2-250.1',
    '18.2-415'
]

COVERED_SECTIONS_B_MISDEMEANOR = [
    '18.2-248.1'
]

EXCLUDED_SECTIONS_TWELVE = [
    '18.2-36.1',
    '18.2-36.2',
    '18.2-51.4',
    '18.2-51.5',
    '18.2-57.2',
    '18.2-266',
    '46.2-341.24'
]

In [20]:
def assign_code_section(row):
    if (
        row['CodeSection'] in COVERED_SECTIONS_A 
        and row['disposition']=='Deferral Dismissal'
    ):
        return 'covered in 19.2-392.6 - A'
    
    elif (
        row['CodeSection'] in COVERED_SECTIONS_B
        or (
            row['CodeSection'] in COVERED_SECTIONS_B_MISDEMEANOR
            and row['chargetype']=='Misdemeanor'
        )
    ):
        return 'covered in 19.2-392.6 - B'
    
    elif row['CodeSection'] in EXCLUDED_SECTIONS_TWELVE:
        return 'excluded by 19.2-392.12'
    
    else:
        return 'covered elsewhere'

In [21]:
%%time
df['codesection'] = df.map_partitions(
    lambda df: df.apply(assign_code_section, axis=1),
    meta=(None, str)
)

df.head()

CPU times: user 587 ms, sys: 119 ms, total: 707 ms
Wall time: 13.6 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere


In [22]:
def has_conviction(df):
    conviction_map = (df['disposition']
              .apply(lambda x: x=='Conviction')
              .groupby('person_id')
              .any())
    
    return df.index.map(conviction_map)

In [23]:
%%time
df['convictions'] = df.map_partitions(
    has_conviction,
    meta=(None, bool)
)

df.head()

CPU times: user 621 ms, sys: 76.7 ms, total: 698 ms
Wall time: 13.9 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False


**Question** - What about same day hearings?

In [24]:
def shift_hearing_date(df, shift_by):
    return (
        df.groupby('person_id')['HearingDate']
          .shift(periods=shift_by)
    )

In [25]:
%%time
df['last_hearing_date'] = df.map_partitions(
    shift_hearing_date,
    shift_by=1,
    meta=(None, 'datetime64[ns]')
)

df.head()

CPU times: user 610 ms, sys: 66 ms, total: 676 ms
Wall time: 13.8 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,NaT
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,NaT
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,NaT
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,NaT
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False,NaT


In [26]:
%%time
df['next_hearing_date'] = df.map_partitions(
    shift_hearing_date,
    shift_by=-1,
    meta=(None, 'datetime64[ns]')
)

df.head()

CPU times: user 612 ms, sys: 120 ms, total: 732 ms
Wall time: 14.1 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False,NaT,2014-02-05


In [27]:
def get_felony_conviction_dates(df):
    return np.where(
        (df['disposition']=='Conviction') & (df['chargetype']=='Felony'), 
        df['HearingDate'], 
        pd.NaT
    )

In [28]:
%%time
df['felony_conviction_date'] = df.map_partitions(
    get_felony_conviction_dates,
    meta=(None, 'datetime64[ns]')
)

df.head()

CPU times: user 651 ms, sys: 110 ms, total: 761 ms
Wall time: 14.1 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,NaT
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT,NaT
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,NaT
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT,NaT
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False,NaT,2014-02-05,NaT


In [29]:
def get_last_felony_conviction_date(df):
    return (
        df['felony_conviction_date']
            .groupby('person_id')
            .shift(1)
            .ffill()
            .fillna(pd.NaT)
    )

In [31]:
%%time
df['last_felony_conviction_date'] = df.map_partitions(
    get_last_felony_conviction_date,
    meta=(None, 'datetime64[ns]')
)

df.head()

CPU times: user 789 ms, sys: 145 ms, total: 934 ms
Wall time: 15.2 s


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date,felony_conviction_date,last_felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,NaT,NaT
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT,NaT,NaT
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,NaT,NaT
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,NaT,NaT,NaT,NaT
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False,NaT,2014-02-05,NaT,NaT


In [38]:
df['days_since_last_hearing'] = df.map_partitions(
    lambda df: df['HearingDate'] - df['last_hearing_date'],
    meta=(None, 'timedelta64[ns]')
)

df.head()

Function:  subgraph_callable-30a950b2-071d-4106-a489-1b8e97f6
args:      (['Alford', 'Guilty', 'Nolo Contendere'], ['Guilty', 'Guilty In Absentia', 'Dismissed', 'Nolle Prosequi', 'Not Guilty', 'Not Guilty/Acquitted', 'No Indictment Presented', 'Not True Bill', 'Dismissed/Other'],                HearingDate  CodeSection   ChargeType  ...    Plea     Race     Sex
person_id                                             ...                         
1000000000002   2013-04-04   C.46.2-862  Misdemeanor  ...  Guilty  Unknown    Male
1000000000003   2015-10-21   A.46.2-853  Misdemeanor  ...      NA    White    Male
1000000000004   2009-10-07   A.46.2-862  Misdemeanor  ...  Guilty  Unknown    Male
1000000000006   2003-04-14      MISSING  Misdemeanor  ...      NA    Black    Male
1000000000008   2014-02-05        23-26  Misdemeanor  ...      NA  Unknown    Male
...                    ...          ...          ...  ...     ...      ...     ...
12251000000031  2018-04-23   A.46.2-707  Misdemeanor  .

TypeError: unsupported operand type(s) for -: 'Timestamp' and 'float'

In [34]:
df['days_since_last_hearing'] = df['HearingDate'] - df['last_hearing_date']

df.head()

Function:  subgraph_callable-0dcef092-0014-4522-bf97-d7207143
args:      (['Alford', 'Guilty', 'Nolo Contendere'], ['Guilty', 'Guilty In Absentia', 'Dismissed', 'Nolle Prosequi', 'Not Guilty', 'Not Guilty/Acquitted', 'No Indictment Presented', 'Not True Bill', 'Dismissed/Other'],                HearingDate  CodeSection   ChargeType  ...    Plea     Race     Sex
person_id                                             ...                         
1000000000002   2013-04-04   C.46.2-862  Misdemeanor  ...  Guilty  Unknown    Male
1000000000003   2015-10-21   A.46.2-853  Misdemeanor  ...      NA    White    Male
1000000000004   2009-10-07   A.46.2-862  Misdemeanor  ...  Guilty  Unknown    Male
1000000000006   2003-04-14      MISSING  Misdemeanor  ...      NA    Black    Male
1000000000008   2014-02-05        23-26  Misdemeanor  ...      NA  Unknown    Male
...                    ...          ...          ...  ...     ...      ...     ...
12251000000031  2018-04-23  46.2-613(2)  Misdemeanor  .

TypeError: unsupported operand type(s) for -: 'Timestamp' and 'float'

### Writing and Loading Data
1. Write data to csv in `/tmp` directory
2. Load data to PostGres via `COPY` statements

This approach is *much* faster than loading via `df.to_sql`, since PostGres will help us load many records at once, instead of loading 1 by 1 via `INSERT` statements

In [26]:
target_dir = '/tmp/expunge_data'
target_glob = f'{target_dir}/expunge_features-*.csv'

return_val = os.system(f'rm -rf {target_glob}')

In [27]:
%%time
file_paths = df.to_csv(target_glob)

file_paths[:5]

CPU times: user 7.98 s, sys: 1.46 s, total: 9.44 s
Wall time: 3min 42s


['/tmp/expunge_data/expunge_features-00.csv',
 '/tmp/expunge_data/expunge_features-01.csv',
 '/tmp/expunge_data/expunge_features-02.csv',
 '/tmp/expunge_data/expunge_features-03.csv',
 '/tmp/expunge_data/expunge_features-04.csv']

Useful pandas functionality to approximate the SQL statement to create a table

In [27]:
from pandas.io.sql import get_schema

In [28]:
print(get_schema(df.head(), 'expunge_features'))

CREATE TABLE "expunge_features" (
"HearingDate" DATE,
  "CodeSection" TEXT,
  "ChargeType" TEXT,
  "Class" TEXT,
  "DispositionCode" TEXT,
  "Plea" TEXT,
  "Race" TEXT,
  "Sex" TEXT,
  "disposition" TEXT,
  "chargetype" TEXT,
  "codesection" TEXT,
  "convictions" INTEGER,
  "last_hearing_date" DATE,
  "next_hearing_date" DATE
)


We're truncating before loading to avoid duplicate rows on re-runs

In [29]:
engine.execute("""
    CREATE TABLE IF NOT EXISTS expunge_features (
        person_id BIGINT,
        "HearingDate" DATE,
        "CodeSection" TEXT,
        "ChargeType" TEXT,
        "Class" TEXT,
        "DispositionCode" TEXT,
        "Plea" TEXT,
        "Race" TEXT,
        "Sex" TEXT,
        "disposition" TEXT,
        "chargetype" TEXT,
        "codesection" TEXT,
        "convictions" BOOLEAN,
        "last_hearing_date" DATE,
        "next_hearing_date" DATE
    );
    
    TRUNCATE TABLE expunge_features;
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f96952fba90>

These `COPY` statements do all of the data loading from CSVs

In [30]:
for path in file_paths:
    engine.execute(f"""
        COPY expunge_features
        FROM '{path}'
        WITH CSV HEADER;
        commit;
    """)

Make sure the data made it to the database

In [31]:
%%sql
SELECT COUNT(*)
FROM expunge_features

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
29346


### Notes/Questions

- `ChargeType` and `chargetype` appear the same in `expunge` - is that because of cleaning done post-load?

### Added Columns
- `last_hearing_date`
- `last_felony_conviction_date`
- `next_hearing_date`
- `days_since_last_hearing`
- `days_since_last_felony_conviction`
- `days_until_next_hearing`

In [37]:
%%sql
SELECT *
FROM expunge_features
LIMIT 10

 * postgresql://jupyter:***@localhost:5432/expunge
10 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,disposition,chargetype,codesection,convictions,last_hearing_date,next_hearing_date
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11
1001000000103,2008-07-11,18.2-172,Felony,5,Nolle Prosequi,,Black,Female,Dismissed,Felony,covered elsewhere,True,2008-07-11,2008-07-11


## Tables for Testing

Move 10k person_id's from clean table into materialized view for testing

In [52]:
%%sql
CREATE MATERIALIZED VIEW expunge_10k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 10000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

 * postgresql://jupyter:***@localhost:5432/expunge
(psycopg2.errors.DuplicateTable) relation "expunge_10k_clean" already exists

[SQL: CREATE MATERIALIZED VIEW expunge_10k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 10000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [21]:
%%sql
CREATE MATERIALIZED VIEW expunge_1k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 1000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

 * postgresql://jupyter:***@localhost:5432/expunge
2422 rows affected.


[]

In [29]:
%%sql
SELECT *
FROM expunge_1k_clean
WHERE person_id = 1000000000003

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
1000000000003,2015-10-21,A.46.2-853,covered elsewhere,Misdemeanor,Misdemeanor,,Nolle Prosequi,Dismissed,,White,Male,153,False,False,False,False,False,True,True,False,False,Automatic,True,Automatic,Dismissal of misdemeanor charges with no arrests or charges in the past 3 years and no convictions on the person's record,False,False


In [13]:
%%sql
SELECT *
FROM expunge_10k_clean
ORDER BY person_id, "HearingDate"
LIMIT 10

 * postgresql://jupyter:***@localhost:5432/expunge
10 rows affected.


person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
1001000000103,2008-07-11,18.2-172,covered elsewhere,Felony,Felony,5,Nolle Prosequi,Dismissed,,Black,Female,195,True,False,False,True,True,False,False,False,False,Not eligible,True,Petition,"Dismissal of felony charges; HOWEVER, the outcome is changed to not eligible because the lifetime limit of two expungements has been exceeded",False,True
