# Expungement Eligibility Classification - Parallel Featurization

## Configuration

## Dask Transformations

Starting a Dask client is necessary to run computations on multiple workers.

In [38]:
import os

import sqlalchemy as sa
import pandas as pd
import numpy as np
import dask.dataframe as dd

In [39]:
from distributed import Client

# Start a Dask distributed client with default arguments; the recorded output
# below shows it backed by a LocalCluster with 4 single-threaded workers.
client = Client()
# Bare expression so the notebook renders the client/cluster summary.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 62377 instead
  f"Port {expected} is already in use.\n"


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:62377/status,

0,1
Dashboard: http://127.0.0.1:62377/status,Workers: 4
Total threads: 4,Total memory: 8.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:62378,Workers: 4
Dashboard: http://127.0.0.1:62377/status,Total threads: 4
Started: Just now,Total memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:62389,Total threads: 1
Dashboard: http://127.0.0.1:62392/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:62381,
Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-3ahbx0xi,Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-3ahbx0xi

0,1
Comm: tcp://127.0.0.1:62390,Total threads: 1
Dashboard: http://127.0.0.1:62391/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:62383,
Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-8srzaozk,Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-8srzaozk

0,1
Comm: tcp://127.0.0.1:62398,Total threads: 1
Dashboard: http://127.0.0.1:62399/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:62382,
Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-av06wzl1,Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-av06wzl1

0,1
Comm: tcp://127.0.0.1:62395,Total threads: 1
Dashboard: http://127.0.0.1:62396/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:62384,
Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-_8lmv3ct,Local directory: /Users/isaak/repos/LAJC-expungement/dask-worker-space/worker-_8lmv3ct


### Data Loading

In [40]:
# Connection settings for the Postgres database used throughout the notebook.
USER = 'jupyter'
# Read from the environment so the password is not hardcoded here
# (raises KeyError if POSTGRES_PASS is not set).
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

# NOTE(review): the password is interpolated verbatim; a password containing
# URL-special characters would presumably need escaping - confirm.
# The full URI (password included) is handed to Dask and appears in task
# tracebacks, so clear cell outputs before sharing the notebook.
DATABASE_URI = f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}"
engine = sa.create_engine(DATABASE_URI)

Loading SQL extension for useful spot-checking

In [28]:
%load_ext sql
%sql {DATABASE_URI}

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


Dask DataFrame does not accept a raw SQL string, but it will accept a SQLAlchemy selectable (a `select()` built from a `Table`). We use one to read data from the `expunge` database sorted by `person_id` and then `HearingDate`. This ordering matters because the per-person shift/fill features computed below assume each person's rows are in chronological order.

In [29]:
# Source table to read; switch to one of the smaller test views for quick runs.
_EXPUNGE_TABLE = 'expunge_clean' # Full Dataset
# _EXPUNGE_TABLE = 'expunge_10k_clean' # ~26K records
# _EXPUNGE_TABLE = 'expunge_1k_clean' # ~2.6K records

# Table definition listing only the columns this notebook selects; it is used
# below to build the SELECT that is passed to Dask.
expunge_model = sa.Table(_EXPUNGE_TABLE, sa.MetaData(),
    sa.Column('person_id', sa.Integer),
    sa.Column('HearingDate', sa.DateTime),
    sa.Column('CodeSection', sa.String),
    sa.Column('ChargeType', sa.String),
    sa.Column('Class', sa.String),
    sa.Column('DispositionCode', sa.String),
    sa.Column('Plea', sa.String),
    sa.Column('Race', sa.String),
    sa.Column('Sex', sa.String),
    sa.Column('fips', sa.Integer),
)

Here you can see the raw query string to which the `query` SQLAlchemy object translates

In [30]:
# SELECT over the modelled columns, ordered by person and then by hearing date.
query = (
    sa.sql.select(expunge_model)
    # Where clause just for testing, comment out for full run
    .where(
        sa.or_(
            # expunge_model.c.person_id == 127051000000102, 
            # expunge_model.c.person_id == 224010000000817,
            # expunge_model.c.person_id == 1000000000362

            expunge_model.c.person_id == 1021000000606,
            expunge_model.c.person_id == 1070000000994
        )
    )
    .order_by(expunge_model.c.person_id, expunge_model.c.HearingDate)
)
# Print the SQL text that the SQLAlchemy object compiles to.
print(str(query))

SELECT expunge_clean.person_id, expunge_clean."HearingDate", expunge_clean."CodeSection", expunge_clean."ChargeType", expunge_clean."Class", expunge_clean."DispositionCode", expunge_clean."Plea", expunge_clean."Race", expunge_clean."Sex", expunge_clean.fips 
FROM expunge_clean 
WHERE expunge_clean.person_id = :person_id_1 OR expunge_clean.person_id = :person_id_2 ORDER BY expunge_clean.person_id, expunge_clean."HearingDate"


In [31]:
# Target dtype for each non-index column returned by the query.
dask_types = {
    'HearingDate': 'datetime64[ns]',
    'CodeSection': str,
    'ChargeType': str,
    'Class': str,
    'DispositionCode': str,
    'Plea': str,
    'Race': str,
    'Sex': str,
    'fips': 'int64'
}
# Empty DataFrame carrying those column names and dtypes; passed to Dask as
# `meta` when the table is read below.
meta_frame = pd.DataFrame(columns=dask_types.keys()).astype(dask_types)

# Display the dtypes as a sanity check.
meta_frame.dtypes

HearingDate        datetime64[ns]
CodeSection                object
ChargeType                 object
Class                      object
DispositionCode            object
Plea                       object
Race                       object
Sex                        object
fips                        int64
dtype: object

In [34]:
%%time
# Leave as None to omit `npartitions` from the call below; set an int to pass
# it explicitly.
n_partitions = None

kwargs = {'npartitions': n_partitions} if n_partitions else {}

# Build the Dask DataFrame from the SQLAlchemy `query`, indexed by person_id;
# `meta` supplies the expected column dtypes.
df = dd.read_sql_table(
    table=query,
    index_col='person_id',
    uri=DATABASE_URI,
    meta=meta_frame,
    **kwargs
)

CPU times: user 66.4 ms, sys: 11.7 ms, total: 78.1 ms
Wall time: 224 ms


In [35]:
# Show the Dask DataFrame's structure (divisions and dtypes), not its rows.
df

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1021000000000.0,datetime64[ns],object,object,object,object,object,object,object,int64
1037333000000.0,...,...,...,...,...,...,...,...,...
1053667000000.0,...,...,...,...,...,...,...,...,...
1070000000000.0,...,...,...,...,...,...,...,...,...


In [36]:
# Show every column when displaying DataFrames. The fully-qualified option name
# is required: the bare 'max_columns' pattern can match more than one pandas
# option (e.g. 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', None)

In [37]:
# Preview up to 20 rows. The "Insufficient elements for `head`" warning in the
# recorded output means fewer than 20 rows were available to this call.
df.head(20)

  f"Insufficient elements for `head`. {n} elements requested, only {len(r)} "


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520
1021000000606,2019-07-10,18.2-119,Misdemeanor,,Dismissed,,White,Female,191


Number of partitions that the data is split into. Essentially, `npartitions` is equal to the number of separate Pandas DataFrames that Dask is operating on under the hood

In [None]:
# Number of partitions backing the Dask DataFrame.
df.npartitions

1

These divisions are the index cutoffs between partitions. Dask generates them automatically, aiming to split the data into Pandas DataFrames of roughly 100-250 MB each.

Since `person_id` is the index, Dask will guarantee that a given `person_id` always falls entirely within a single partition. This is important for performing aggregations on a single person_id without shuffling records across nodes.

In [14]:
# First few division boundaries (person_id values) of the partitioning.
df.divisions[:5]

(1021000000606.0, 1070000000994.0)

### Data Cleaning & Featurization

In [15]:
# Replace missing code sections with the sentinel string 'MISSING'.
df['CodeSection'] = df['CodeSection'].fillna('MISSING')

In [16]:
# Disposition codes that are kept; rows with a missing or any other
# disposition are dropped from the analysis.
VALID_DISPOSITIONS = [
    'Guilty',
    'Guilty In Absentia',
    'Dismissed',
    'Nolle Prosequi',
    'Not Guilty',
    'Not Guilty/Acquitted',
    'No Indictment Presented',
    'Not True Bill',
    'Dismissed/Other'
]

# Build the two row masks separately, then keep rows satisfying both.
has_disposition = ~df['DispositionCode'].isna()
is_valid_disposition = df['DispositionCode'].isin(VALID_DISPOSITIONS)
df = df[has_disposition & is_valid_disposition]

In [17]:
%%time
# Collapse raw disposition codes into 'Dismissed' or 'Conviction'.
# The raw code 'Dismissed' is not a key here, so `replace` leaves it unchanged.
DISPOSITION_MAP = {
    'Nolle Prosequi': 'Dismissed',
    'No Indictment Presented': 'Dismissed',
    'Not True Bill': 'Dismissed',
    'Dismissed/Other': 'Dismissed',
    'Not Guilty': 'Dismissed',
    'Not Guilty/Acquitted': 'Dismissed',
    'Guilty In Absentia': 'Conviction',
    'Guilty': 'Conviction',
}

# New column holding the simplified disposition; the raw code is kept as-is.
df['disposition'] = df['DispositionCode'].replace(DISPOSITION_MAP)

df.head()

Function:  execute_task
args:      ((<function apply at 0x10d53b950>, <function _read_sql_chunk at 0x119d6b440>, [<sqlalchemy.sql.selectable.Select object at 0x1196c5890>, 'postgresql://jupyter:********@localhost:5432/expunge', [(<class 'tuple'>, ['HearingDate', 'datetime64[ns]']), (<class 'tuple'>, ['CodeSection', <class 'str'>]), (<class 'tuple'>, ['ChargeType', <class 'str'>]), (<class 'tuple'>, ['Class', <class 'str'>]), (<class 'tuple'>, ['DispositionCode', <class 'str'>]), (<class 'tuple'>, ['Plea', <class 'str'>]), (<class 'tuple'>, ['Race', <class 'str'>]), (<class 'tuple'>, ['Sex', <class 'str'>]), (<class 'tuple'>, ['fips', 'int64'])]], (<class 'dict'>, [['engine_kwargs', (<class 'dict'>, [])], ['index_col', 'person_id']])))
kwargs:    {}
Exception: 'AttributeError("\'list\' object has no attribute \'dtypes\'")'



AttributeError: 'list' object has no attribute 'dtypes'

In [None]:
%%time
# Pleas that, combined with a dismissal, are relabelled as a deferral dismissal.
deferral_pleas = [
    'Alford',
    'Guilty',
    'Nolo Contendere'
]

# Rows that ended in a dismissal even though one of the pleas above was entered.
deferral_conditions = (
    (df['Plea'].isin(deferral_pleas))
    & (df['disposition']=='Dismissed')
)

# `mask` overwrites the value where the condition is True and keeps it elsewhere.
df['disposition'] = df['disposition'].mask(deferral_conditions, 'Deferral Dismissal')

# Spot-check a few relabelled rows.
df[df['disposition']=='Deferral Dismissal'].head()

CPU times: user 41.1 ms, sys: 11.7 ms, total: 52.7 ms
Wall time: 390 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1070000000994,2016-07-11,18.2-248.1,Felony,5,Nolle Prosequi,Guilty,Black,Male,770,Deferral Dismissal


In [18]:
%%time
# Lower-case copy of ChargeType; assign_code_section and
# get_felony_conviction_dates read the column under this name.
df['chargetype'] = df['ChargeType']

df.head()

Function:  execute_task
args:      ((<function apply at 0x108aca950>, <function _read_sql_chunk at 0x115362b00>, [<sqlalchemy.sql.selectable.Select object at 0x114c380d0>, 'postgresql://jupyter:********@localhost:5432/expunge', [(<class 'tuple'>, ['HearingDate', 'datetime64[ns]']), (<class 'tuple'>, ['CodeSection', <class 'str'>]), (<class 'tuple'>, ['ChargeType', <class 'str'>]), (<class 'tuple'>, ['Class', <class 'str'>]), (<class 'tuple'>, ['DispositionCode', <class 'str'>]), (<class 'tuple'>, ['Plea', <class 'str'>]), (<class 'tuple'>, ['Race', <class 'str'>]), (<class 'tuple'>, ['Sex', <class 'str'>]), (<class 'tuple'>, ['fips', 'int64'])]], (<class 'dict'>, [['engine_kwargs', (<class 'dict'>, [])], ['index_col', 'person_id']])))
kwargs:    {}
Exception: 'AttributeError("\'list\' object has no attribute \'dtypes\'")'



AttributeError: 'list' object has no attribute 'dtypes'

In [None]:
# Code-section lookup lists consumed by assign_code_section.

# Labelled 'covered in 19.2-392.6 - A' when the disposition is a deferral dismissal.
COVERED_SECTIONS_A = [
    '4.1-305', 
    '18.2-250.1'
]

# Labelled 'covered in 19.2-392.6 - B' regardless of disposition or charge type.
COVERED_SECTIONS_B = [
    '4.1-305',
    '18.2-96',
    '18.2-103',
    '18.2-119',
    '18.2-120',
    '18.2-134',
    '18.2-250.1',
    '18.2-415'
]

# Labelled 'covered in 19.2-392.6 - B' only when the charge is a misdemeanor.
COVERED_SECTIONS_B_MISDEMEANOR = [
    '18.2-248.1'
]

# Labelled 'excluded by 19.2-392.12'.
EXCLUDED_SECTIONS_TWELVE = [
    '18.2-36.1',
    '18.2-36.2',
    '18.2-51.4',
    '18.2-51.5',
    '18.2-57.2',
    '18.2-266',
    '46.2-341.24'
]

In [19]:
def assign_code_section(row):
    """Return the code-section category label for a single charge row.

    Rules are checked in order and the first match wins:

    1. section in COVERED_SECTIONS_A and disposition is 'Deferral Dismissal'
       -> 'covered in 19.2-392.6 - A'
    2. section in COVERED_SECTIONS_B, or section in
       COVERED_SECTIONS_B_MISDEMEANOR with chargetype 'Misdemeanor'
       -> 'covered in 19.2-392.6 - B'
    3. section in EXCLUDED_SECTIONS_TWELVE -> 'excluded by 19.2-392.12'
    4. anything else -> 'covered elsewhere'

    Parameters
    ----------
    row : mapping-like (e.g. a pandas row)
        Must provide 'CodeSection'; 'disposition' and 'chargetype' are read
        only when the corresponding rule needs them.

    Returns
    -------
    str
        One of the four labels listed above.
    """
    section = row['CodeSection']

    if section in COVERED_SECTIONS_A and row['disposition'] == 'Deferral Dismissal':
        return 'covered in 19.2-392.6 - A'

    if section in COVERED_SECTIONS_B:
        return 'covered in 19.2-392.6 - B'

    if section in COVERED_SECTIONS_B_MISDEMEANOR and row['chargetype'] == 'Misdemeanor':
        return 'covered in 19.2-392.6 - B'

    if section in EXCLUDED_SECTIONS_TWELVE:
        return 'excluded by 19.2-392.12'

    return 'covered elsewhere'

In [20]:
%%time
# Apply assign_code_section row by row within each partition.
# NOTE: the lambda's `df` parameter is the partition being processed; it
# shadows the outer Dask `df` only inside the lambda.
df['codesection'] = df.map_partitions(
    lambda df: df.apply(assign_code_section, axis=1),
    meta=pd.Series(dtype=str)
)

df.head()

Function:  execute_task
args:      ((<function apply at 0x10d53b950>, <function _read_sql_chunk at 0x119d6b440>, [<sqlalchemy.sql.selectable.Select object at 0x1196c5850>, 'postgresql://jupyter:********@localhost:5432/expunge', [(<class 'tuple'>, ['HearingDate', 'datetime64[ns]']), (<class 'tuple'>, ['CodeSection', <class 'str'>]), (<class 'tuple'>, ['ChargeType', <class 'str'>]), (<class 'tuple'>, ['Class', <class 'str'>]), (<class 'tuple'>, ['DispositionCode', <class 'str'>]), (<class 'tuple'>, ['Plea', <class 'str'>]), (<class 'tuple'>, ['Race', <class 'str'>]), (<class 'tuple'>, ['Sex', <class 'str'>]), (<class 'tuple'>, ['fips', 'int64'])]], (<class 'dict'>, [['engine_kwargs', (<class 'dict'>, [])], ['index_col', 'person_id']])))
kwargs:    {}
Exception: 'AttributeError("\'list\' object has no attribute \'dtypes\'")'



AttributeError: 'list' object has no attribute 'dtypes'

In [22]:
def has_conviction(df):
    """Flag every row whose person has at least one 'Conviction' disposition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by `person_id` with a 'disposition' column.

    Returns
    -------
    pandas.Index
        One boolean per row of `df`, in row order: True when any row sharing
        that row's `person_id` has disposition 'Conviction'.
    """
    is_conviction = df['disposition'].apply(lambda value: value == 'Conviction')
    # One boolean per person: does this person have any conviction row?
    any_conviction_by_person = is_conviction.groupby('person_id').any()
    # Look each row's person_id up in the per-person result.
    return df.index.map(any_conviction_by_person)

In [23]:
%%time
# Per-person conviction flag, computed independently inside each partition
# (relies on all of a person's rows living in the same partition).
df['convictions'] = df.map_partitions(
    has_conviction,
    meta=pd.Series(dtype=bool)
)

df.head()

CPU times: user 39.7 ms, sys: 4.62 ms, total: 44.3 ms
Wall time: 99.3 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True


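**Question** - What about same-day hearings? (Rows that share a `person_id` and `HearingDate` are reconciled further down by `fix_shifted_sameday_dates`.)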

In [24]:
def shift_hearing_date(df, shift_by):
    """Shift each person's 'HearingDate' values by `shift_by` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by `person_id` with a 'HearingDate' column.
    shift_by : int
        Number of rows to shift within each person; positive values pull
        dates from earlier rows, negative values from later rows.

    Returns
    -------
    pandas.Series
        Shifted hearing dates aligned with `df`; positions with no source row
        inside the same person are missing (NaT).
    """
    hearing_dates_by_person = df.groupby('person_id')['HearingDate']
    return hearing_dates_by_person.shift(periods=shift_by)

In [25]:
%%time
# Date of the previous row's hearing for the same person (NaT on a person's
# first row), computed within each partition.
df['last_hearing_date'] = df.map_partitions(
    shift_hearing_date,
    shift_by=1,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 44.8 ms, sys: 957 µs, total: 45.8 ms
Wall time: 103 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28


In [26]:
def get_conviction_dates(df):
    """Return each row's hearing date if the row is a conviction, else NaT.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'disposition' and 'HearingDate' columns.

    Returns
    -------
    numpy.ndarray
        One element per row: 'HearingDate' where disposition is 'Conviction',
        NaT everywhere else.
    """
    is_conviction = df['disposition'] == 'Conviction'
    not_a_date = np.datetime64('NaT')
    return np.where(is_conviction, df['HearingDate'], not_a_date)

In [27]:
%%time
# Helper column: the hearing date on conviction rows, NaT otherwise. It feeds
# the felony and next-conviction features and is dropped later.
df['date_if_conviction'] = df.map_partitions(
    get_conviction_dates,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 29 ms, sys: 21.3 ms, total: 50.3 ms
Wall time: 104 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,date_if_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-05-26
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26,NaT
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23,2011-09-08
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08,2014-08-28
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28,2018-09-18


In [28]:
def get_felony_conviction_dates(df):
    """Keep 'date_if_conviction' on felony rows and blank it out elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'chargetype' and 'date_if_conviction' columns.

    Returns
    -------
    numpy.ndarray
        One element per row: 'date_if_conviction' where chargetype is
        'Felony', NaT everywhere else.
    """
    is_felony = df['chargetype'] == 'Felony'
    not_a_date = np.datetime64('NaT')
    return np.where(is_felony, df['date_if_conviction'], not_a_date)

In [29]:
%%time
# Helper column: the conviction date on felony rows, NaT otherwise. It is
# consumed by get_last_felony_conviction_date and dropped afterwards.
df['date_if_felony_conviction'] = df.map_partitions(
    get_felony_conviction_dates,
    meta=pd.Series(dtype='datetime64[ns]')
)

df.head()

CPU times: user 49.7 ms, sys: 1.2 ms, total: 50.9 ms
Wall time: 108 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,date_if_conviction,date_if_felony_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-05-26,NaT
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26,NaT,NaT
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23,2011-09-08,NaT
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08,2014-08-28,NaT
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28,2018-09-18,2018-09-18


In [30]:
def get_last_felony_conviction_date(df):
    """For each row, find the most recent felony conviction date on an earlier row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by `person_id` with a 'date_if_felony_conviction' column,
        with each person's rows already in chronological order.

    Returns
    -------
    pandas.Series
        Aligned with `df`; NaT where no earlier row of the same person has a
        felony conviction date.
    """
    # Step back one row within each person so a row never sees its own date.
    previous_row_dates = (
        df['date_if_felony_conviction'].groupby('person_id').shift(1)
    )
    # Carry the latest known date forward across the person's later rows.
    carried_forward = previous_row_dates.groupby('person_id').ffill()
    return carried_forward.fillna(pd.NaT)

In [31]:
%%time
# Most recent felony conviction date on an earlier row of the same person.
df['last_felony_conviction_date'] = df.map_partitions(
    get_last_felony_conviction_date,
    meta=pd.Series(dtype='datetime64[ns]')
)
# The per-row helper column is no longer needed once the feature is built.
df = df.drop('date_if_felony_conviction', axis='columns')

df.head(20)

CPU times: user 91.8 ms, sys: 3.14 ms, total: 95 ms
Wall time: 158 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,date_if_conviction,last_felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT,2009-05-26,NaT
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26,NaT,NaT
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23,2011-09-08,NaT
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08,2014-08-28,NaT
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28,2018-09-18,NaT
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2018-09-18,2018-09-18,2018-09-18
1021000000606,2019-07-10,18.2-119,Misdemeanor,,Dismissed,,White,Female,191,Dismissed,Misdemeanor,covered in 19.2-392.6 - B,True,2018-09-18,NaT,2018-09-18
1070000000994,2004-02-10,Z.18.2-91,Felony,,Guilty,Guilty,Black,Male,19,Conviction,Felony,covered elsewhere,True,NaT,2004-02-10,NaT
1070000000994,2004-02-10,18.2-95,Felony,,Guilty,Guilty,Black,Male,19,Conviction,Felony,covered elsewhere,True,2004-02-10,2004-02-10,2004-02-10
1070000000994,2009-08-12,18.2-47,Felony,5,Guilty,Guilty,Black,Male,161,Conviction,Felony,covered elsewhere,True,2004-02-10,2009-08-12,2004-02-10


In [32]:
def get_next_conviction_date(df):
    """For each row, find the nearest conviction date on a later row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by `person_id` with a 'date_if_conviction' column, with
        each person's rows already in chronological order.

    Returns
    -------
    pandas.Series
        Aligned with `df`; NaT where no later row of the same person has a
        conviction date.
    """
    # Step forward one row within each person so a row never sees its own date.
    following_row_dates = df['date_if_conviction'].groupby('person_id').shift(-1)
    # Pull the next known date backward across the person's earlier rows.
    carried_backward = following_row_dates.groupby('person_id').bfill()
    return carried_backward.fillna(pd.NaT)

In [33]:
%%time
# Nearest conviction date on a later row of the same person.
df['next_conviction_date'] = df.map_partitions(
    get_next_conviction_date,
    meta=pd.Series(dtype='datetime64[ns]')
)
# The per-row helper column is no longer needed once the feature is built.
df = df.drop('date_if_conviction', axis='columns')

df[['HearingDate','disposition','next_conviction_date']].head(20)

CPU times: user 65.1 ms, sys: 0 ns, total: 65.1 ms
Wall time: 127 ms


Unnamed: 0_level_0,HearingDate,disposition,next_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1021000000606,2009-05-26,Conviction,2011-09-08
1021000000606,2009-07-23,Dismissed,2011-09-08
1021000000606,2011-09-08,Conviction,2014-08-28
1021000000606,2014-08-28,Conviction,2018-09-18
1021000000606,2018-09-18,Conviction,2018-09-18
1021000000606,2018-09-18,Conviction,NaT
1021000000606,2019-07-10,Dismissed,NaT
1070000000994,2004-02-10,Conviction,2004-02-10
1070000000994,2004-02-10,Conviction,2009-08-12
1070000000994,2009-08-12,Conviction,2010-09-02


In [34]:
def fix_shifted_sameday_dates(df, fix_column, is_backward_facing=True):
    """Give all same-day rows of a person the same value in `fix_column`.

    The row-shift based features treat rows that share a hearing date as if
    one happened before the other. This replaces the values within each
    (person_id, HearingDate) group with a single group-wide value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by `person_id` with 'HearingDate' and `fix_column`.
    fix_column : str
        Name of the date column to reconcile.
    is_backward_facing : bool, default True
        True for "last ..." features (group minimum is used); False for
        "next ..." features (group maximum is used).

    Returns
    -------
    pandas.Series
        Aligned with `df`. Because missing values are not skipped, a group
        containing any NaT yields NaT for every row of that group.
    """
    def earliest(values):
        return values.min(skipna=False)

    def latest(values):
        return values.max(skipna=False)

    reducer = earliest if is_backward_facing else latest
    same_day_groups = df.groupby(['person_id', 'HearingDate'])[fix_column]
    return same_day_groups.transform(reducer)

In [35]:
%%time
# Same-day rows would otherwise point at each other as the "last hearing";
# give them all the group's backward-facing value instead.
df['last_hearing_date'] = df.map_partitions(
    fix_shifted_sameday_dates,
    fix_column='last_hearing_date',
    meta=pd.Series(dtype='datetime64[ns]')
)

df[['HearingDate','last_hearing_date']].head()

CPU times: user 61.4 ms, sys: 7.96 ms, total: 69.3 ms
Wall time: 132 ms


Unnamed: 0_level_0,HearingDate,last_hearing_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1021000000606,2009-05-26,NaT
1021000000606,2009-07-23,2009-05-26
1021000000606,2011-09-08,2009-07-23
1021000000606,2014-08-28,2011-09-08
1021000000606,2018-09-18,2014-08-28


In [36]:
%%time
# Apply the same same-day reconciliation (backward-facing) to the last felony
# conviction date.
df['last_felony_conviction_date'] = df.map_partitions(
    fix_shifted_sameday_dates,
    fix_column='last_felony_conviction_date',
    meta=pd.Series(dtype='datetime64[ns]')
)

df[['HearingDate','last_felony_conviction_date']].head()

CPU times: user 65 ms, sys: 0 ns, total: 65 ms
Wall time: 152 ms


Unnamed: 0_level_0,HearingDate,last_felony_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1021000000606,2009-05-26,NaT
1021000000606,2009-07-23,NaT
1021000000606,2011-09-08,NaT
1021000000606,2014-08-28,NaT
1021000000606,2018-09-18,NaT


In [37]:
%%time
# Forward-facing variant: same-day rows all take the group's forward-facing
# value for the next conviction date (is_backward_facing=False).
df['next_conviction_date'] = df.map_partitions(
    fix_shifted_sameday_dates,
    fix_column='next_conviction_date',
    is_backward_facing=False,
    meta=pd.Series(dtype='datetime64[ns]')
)

df[['HearingDate','next_conviction_date']].head(10)

CPU times: user 74.6 ms, sys: 0 ns, total: 74.6 ms
Wall time: 157 ms


Unnamed: 0_level_0,HearingDate,next_conviction_date
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1021000000606,2009-05-26,2011-09-08
1021000000606,2009-07-23,2011-09-08
1021000000606,2011-09-08,2014-08-28
1021000000606,2014-08-28,2018-09-18
1021000000606,2018-09-18,NaT
1021000000606,2018-09-18,NaT
1021000000606,2019-07-10,NaT
1070000000994,2004-02-10,2009-08-12
1070000000994,2004-02-10,2009-08-12
1070000000994,2009-08-12,2010-09-02


In [38]:
%%time
# Elapsed-time features derived from the date columns. As the recorded output
# shows, the result is NaT wherever one of the two dates is missing.
df['days_since_last_hearing'] = df['HearingDate'] - df['last_hearing_date']
df['days_until_next_conviction'] = df['next_conviction_date'] - df['HearingDate']
df['days_since_last_felony_conviction'] = df['HearingDate'] - df['last_felony_conviction_date']

df.head()

CPU times: user 77.8 ms, sys: 8.1 ms, total: 85.9 ms
Wall time: 170 ms


Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,days_since_last_hearing,days_until_next_conviction,days_since_last_felony_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,2011-09-08,NaT,835 days,NaT
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26,NaT,2011-09-08,58 days,777 days,NaT
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23,NaT,2014-08-28,777 days,1085 days,NaT
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08,NaT,2018-09-18,1085 days,1482 days,NaT
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28,NaT,NaT,1482 days,NaT,NaT


In [39]:
# Time elapsed between each hearing and the day this cell is executed, so the
# values change from run to run. Written as the negation of (HearingDate - today).
df['days_passed_since_hearing'] = -(df['HearingDate'] - np.datetime64('today'))

df.head()

Unnamed: 0_level_0,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,days_since_last_hearing,days_until_next_conviction,days_since_last_felony_conviction,days_passed_since_hearing
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1021000000606,2009-05-26,46.2-300,Misdemeanor,2,Guilty,,White,Female,169,Conviction,Misdemeanor,covered elsewhere,True,NaT,NaT,2011-09-08,NaT,835 days,NaT,4588 days
1021000000606,2009-07-23,18.2-57,Misdemeanor,1,Dismissed,,White,Female,520,Dismissed,Misdemeanor,covered elsewhere,True,2009-05-26,NaT,2011-09-08,58 days,777 days,NaT,4530 days
1021000000606,2011-09-08,18.2-250.1,Misdemeanor,U,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,2009-07-23,NaT,2014-08-28,777 days,1085 days,NaT,3753 days
1021000000606,2014-08-28,18.2-56.1,Misdemeanor,1,Guilty,Guilty,White,Female,520,Conviction,Misdemeanor,covered elsewhere,True,2011-09-08,NaT,2018-09-18,1085 days,1482 days,NaT,2668 days
1021000000606,2018-09-18,18.2-248,Felony,U,Guilty,,White,Female,520,Conviction,Felony,covered elsewhere,True,2014-08-28,NaT,NaT,1482 days,NaT,NaT,1186 days


In [41]:
# Side-by-side view of each date feature next to its elapsed-time counterpart.
df.head(20)[[
    'disposition',
    'chargetype',
    'HearingDate',
    'days_passed_since_hearing',
    'last_hearing_date',
    'days_since_last_hearing',
    'last_felony_conviction_date',
    'days_since_last_felony_conviction',
    'next_conviction_date',
    'days_until_next_conviction',
]]

Unnamed: 0_level_0,disposition,chargetype,HearingDate,days_passed_since_hearing,last_hearing_date,days_since_last_hearing,last_felony_conviction_date,days_since_last_felony_conviction,next_conviction_date,days_until_next_conviction
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1021000000606,Conviction,Misdemeanor,2009-05-26,4588 days,NaT,NaT,NaT,NaT,2011-09-08,835 days
1021000000606,Dismissed,Misdemeanor,2009-07-23,4530 days,2009-05-26,58 days,NaT,NaT,2011-09-08,777 days
1021000000606,Conviction,Misdemeanor,2011-09-08,3753 days,2009-07-23,777 days,NaT,NaT,2014-08-28,1085 days
1021000000606,Conviction,Misdemeanor,2014-08-28,2668 days,2011-09-08,1085 days,NaT,NaT,2018-09-18,1482 days
1021000000606,Conviction,Felony,2018-09-18,1186 days,2014-08-28,1482 days,NaT,NaT,NaT,NaT
1021000000606,Conviction,Felony,2018-09-18,1186 days,2014-08-28,1482 days,NaT,NaT,NaT,NaT
1021000000606,Dismissed,Misdemeanor,2019-07-10,891 days,2018-09-18,295 days,2018-09-18,295 days,NaT,NaT
1070000000994,Conviction,Felony,2004-02-10,6520 days,NaT,NaT,NaT,NaT,2009-08-12,2010 days
1070000000994,Conviction,Felony,2004-02-10,6520 days,NaT,NaT,NaT,NaT,2009-08-12,2010 days
1070000000994,Conviction,Felony,2009-08-12,4510 days,2004-02-10,2010 days,2004-02-10,2010 days,2010-09-02,386 days


## To Do - Features
1. `class_3_or_4_last_20`
2. `class_1_or_2`

### Writing and Loading Data
1. Write the data to CSV files in a `/tmp` directory
2. Load the data into Postgres via `COPY` statements

This approach is *much* faster than loading via `df.to_sql`, since Postgres loads many records at once instead of one by one via `INSERT` statements.

In [None]:
# Output location and file-name pattern for the CSVs written below.
target_dir = '/tmp/expunge_data'
target_glob = f'{target_dir}/expunge_features-*.csv'

# Remove CSVs left over from a previous run; the shell expands the `*`.
# os.system's return value is stored in return_val but not checked here.
return_val = os.system(f'rm -rf {target_glob}')

In [None]:
%%time
# Write the Dask DataFrame to CSV files matching target_glob. The returned
# paths are reused by the COPY loop further down.
file_paths = df.to_csv(target_glob)

# Peek at the first few written paths.
file_paths[:5]

Useful pandas functionality to approximate the SQL statement to create a table

In [None]:
from pandas.io.sql import get_schema

In [None]:
# Print the CREATE TABLE statement pandas would generate for a sample of the
# features; used as a starting point for the hand-written DDL below.
print(get_schema(df.head(), 'expunge_features'))

We drop the table here, then re-create and truncate it in the next cell, so that re-runs start from an empty table and don't accumulate duplicate rows.

In [None]:
%%sql
-- IF EXISTS keeps this cell from erroring on a first run, when the table has
-- not been created yet.
DROP TABLE IF EXISTS expunge_features;

In [None]:
# Column order must match the CSV files written from `df`: the `person_id`
# index first, then the DataFrame columns in their final order. COPY ... WITH
# CSV HEADER skips the header line and assigns fields to table columns by
# position, not by name, so a mismatch fails or silently misplaces values.
# The elapsed-time features are stored as TEXT.
# CREATE TABLE IF NOT EXISTS does not alter an existing table, so a table
# with an older column layout must be dropped first.
engine.execute("""
    CREATE TABLE IF NOT EXISTS expunge_features (
        person_id BIGINT,
        "HearingDate" DATE,
        "CodeSection" TEXT,
        "ChargeType" TEXT,
        "Class" TEXT,
        "DispositionCode" TEXT,
        "Plea" TEXT,
        "Race" TEXT,
        "Sex" TEXT,
        "fips" INTEGER,
        "disposition" TEXT,
        "chargetype" TEXT,
        "codesection" TEXT,
        "convictions" BOOLEAN,
        "last_hearing_date" DATE,
        "last_felony_conviction_date" DATE,
        "next_conviction_date" DATE,
        "days_since_last_hearing" TEXT,
        "days_until_next_conviction" TEXT,
        "days_since_last_felony_conviction" TEXT,
        "days_passed_since_hearing" TEXT
    );
    
    TRUNCATE TABLE expunge_features;
""")

These `COPY` statements do all of the data loading from CSVs

In [None]:
# Load each CSV with its own COPY statement followed by an explicit commit.
# NOTE(review): COPY ... FROM '<path>' is read by the database server, so this
# presumably relies on the server sharing this machine's filesystem
# (HOST is 'localhost') - confirm before pointing at a remote database.
for path in file_paths:
    engine.execute(f"""
        COPY expunge_features
        FROM '{path}'
        WITH CSV HEADER;
        commit;
    """)

Make sure the data made it to the database

In [None]:
%%sql
-- Row count of the loaded feature table.
SELECT COUNT(*)
FROM expunge_features

### Notes/Questions

- `ChargeType` and `chargetype` appear the same in `expunge` - is that because of cleaning done post-load?

### Added Columns
- `last_hearing_date`
- `last_felony_conviction_date`
- `next_conviction_date`
- `days_since_last_hearing`
- `days_since_last_felony_conviction`
- `days_until_next_conviction`
- `days_passed_since_hearing`

In [None]:
%%sql
-- Preview a few loaded rows.
SELECT *
FROM expunge_features
LIMIT 10

## Tables for Testing

Copy the records for 10k (and, below, 1k) `person_id`s from the clean table into materialized views for testing

In [None]:
%%sql
-- Test fixture: every row of expunge_clean for up to 10,000 person_ids.
-- The id subquery has no ORDER BY, so which ids are selected is left to the
-- database.
CREATE MATERIALIZED VIEW expunge_10k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 10000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

In [None]:
%%sql
-- Test fixture: every row of expunge_clean for up to 1,000 person_ids.
-- The id subquery has no ORDER BY, so which ids are selected is left to the
-- database.
CREATE MATERIALIZED VIEW expunge_1k_clean AS
WITH ids AS (
    SELECT 
        DISTINCT person_id
    FROM expunge_clean
    LIMIT 1000
)
SELECT e.*
FROM expunge_clean e
WHERE EXISTS (
    SELECT 1
    FROM ids i
    WHERE i.person_id = e.person_id
)
ORDER BY e.person_id, e."HearingDate"

In [None]:
%%sql
-- Spot-check a single person's rows in the 1k test view.
SELECT *
FROM expunge_1k_clean
WHERE person_id = 1000000000003

In [None]:
%%sql
-- Preview the first rows of the 10k test view in person/date order.
SELECT *
FROM expunge_10k_clean
ORDER BY person_id, "HearingDate"
LIMIT 10