In [17]:
import logging
import os
import sys

import pandas as pd
import sqlalchemy as sa

In [2]:
# Root logger setup: INFO and above, with a time-of-day stamp and the name of
# the emitting module in every record.
log_settings = {
    'level': logging.INFO,
    'format': '%(asctime)s [%(levelname)s - %(module)s.py]: %(message)s',
    'datefmt': '%H:%M:%S',
}
logging.basicConfig(**log_settings)

In [3]:
# Postgres connection settings. Only the password is secret; it is read from
# the environment and a KeyError is raised if POSTGRES_PASS is not set.
USER, HOST, PORT, DB = 'jupyter', 'localhost', '5432', 'expunge'
PASSWORD = os.environ['POSTGRES_PASS']

# NOTE(review): the password is interpolated verbatim, so it must already be
# URL-safe (no unescaped '@', ':' or '/') -- confirm.
DATABASE_URI = f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}"

In [4]:
# Load the `sql` IPython extension and connect it to DATABASE_URI so that the
# %%sql cells below run against that database.
%load_ext sql
%sql {DATABASE_URI}

In [12]:
%%sql
-- Build the FIPS subset table. The inner query finds every person with at
-- least one record in the listed localities. The outer query then keeps all
-- records for those people, including records from other localities.
-- NOTE(review) CREATE TABLE fails if the table already exists, so this cell
-- cannot be re-run without first dropping fips_subset_20220211.
CREATE TABLE fips_subset_20220211 AS 
SELECT *
FROM expunge_clean
WHERE person_id IN (
    SELECT person_id
    FROM expunge_clean
    WHERE fips IN (
        143,
        89,
        690,

        3,
        79,
        65,
        165,

        163,
        9,
        15,
        530,
        17,
        99,
        193,
        133,
        103,
        159,
        33,
        67,
        141,
        -- NOTE(review) 89 was listed a second time at this position. A repeat
        -- is a no-op inside IN, so it was removed. Confirm that it was not a
        -- typo for a different locality code.
        510,
        59,
        107
    )
)

 * postgresql://jupyter:***@localhost:5432/expunge
2368006 rows affected.


[]

In [5]:
%%sql
-- Row count of the subset table, to compare with the rows-affected figure
-- reported when the table was created. An aggregate without GROUP BY always
-- returns exactly one row, so no LIMIT is needed.
SELECT COUNT(*)
FROM fips_subset_20220211

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
2368006


# Run Featurization

In [5]:
# Make the project checkout importable (needed for `classify.featurize`).
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
project_root = os.path.expanduser('~/LAJC-expungement')
if project_root not in sys.path:
    sys.path.append(project_root)

sys.path

['/home/jupyter-isaak-a/LAJC-expungement/classify',
 '/opt/tljh/user/lib/python37.zip',
 '/opt/tljh/user/lib/python3.7',
 '/opt/tljh/user/lib/python3.7/lib-dynload',
 '',
 '/home/jupyter-isaak-a/venvs/distributed/lib/python3.7/site-packages',
 '/home/jupyter-isaak-a/venvs/distributed/lib/python3.7/site-packages/IPython/extensions',
 '/home/jupyter-isaak-a/.ipython',
 '/home/jupyter-isaak-a/LAJC-expungement']

In [6]:
from classify.featurize import (
    run_featurization,
    ExpungeConfig
)

In [7]:
# Load the featurization settings for the baseline run from YAML and display
# the run identifier the config carries.
config = ExpungeConfig.from_yaml('expunge_config.yaml')

config.run_id

21:39:26 [INFO - config.py]: Loading config from file: expunge_config.yaml


'default-full-table'

In [8]:
# Run the featurization pipeline for the loaded config. Per the captured log,
# this reads expunge_clean, writes CSVs under /tmp/expunge_data and loads
# them into Postgres, replacing earlier rows with the same run_id.
run_featurization(config)

21:39:31 [INFO - featurize.py]: Featurization Run ID: default-full-table
21:39:31 [INFO - featurize.py]: Initializing Dask distributed client
21:39:33 [INFO - featurize.py]: Reading from table: expunge_clean
21:40:24 [INFO - featurize.py]: Building Dask task graph for feature construction
21:40:25 [INFO - featurize.py]: Expungement feature data will be written to: /tmp/expunge_data
21:40:25 [INFO - featurize.py]: Clearing any data from previous runs
21:40:25 [INFO - featurize.py]: Command 'rm -rf /tmp/expunge_data/expunge_features-*.csv' returned with exit value: 0
21:40:25 [INFO - featurize.py]: Executing Dask task graph and writing results to CSV...
21:47:43 [INFO - featurize.py]: File(s) written successfully
21:47:43 [INFO - featurize.py]: Opening connection to PostGres via Psycopg
21:47:43 [INFO - featurize.py]: Deleting any records with run_id: default-full-table
21:48:22 [INFO - featurize.py]: Loading from file: /tmp/expunge_data/expunge_features-00.csv
21:48:28 [INFO - featurize

'default-full-table'

In [10]:
%%sql
-- Preview a few featurized rows written for the baseline run.
SELECT * 
FROM expunge_features 
WHERE run_id = 'default-full-table'
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,last_hearing_delta,last_felony_conviction_delta,next_conviction_delta,from_present_delta,arrest_disqualifier,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,run_id
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,2828.0,False,False,False,False,False,True,False,False,default-full-table
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,,,,,,,1898.0,False,False,False,False,True,True,False,False,default-full-table
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,4103.0,False,False,False,False,False,False,False,False,default-full-table
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,,,,,,,6471.0,False,False,False,False,False,False,False,False,default-full-table
1000000000008,2014-02-05,23-26,Misdemeanor,1.0,Nolle Prosequi,,Unknown,Male,740,Dismissed,Misdemeanor,covered elsewhere,False,,,,,,,2521.0,False,False,False,False,True,True,False,False,default-full-table


# Running Classification

In [11]:
# Labelled training examples for the expungability classifier.
training_set_path = './training_set.csv'
train_df = pd.read_csv(training_set_path)

print(train_df.shape)
train_df.head()

(12288, 13)


Unnamed: 0,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,Misdemeanor,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
1,Felony,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
2,Misdemeanor,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
3,Felony,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
4,Misdemeanor,Deferral Dismissal,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic


In [12]:
# Distribution of the target labels in the training set.
train_df['expungability'].value_counts()

Not eligible           5920
Petition               4720
Automatic              1280
Automatic (pending)     256
Petition (pending)      112
Name: expungability, dtype: int64

In [13]:
%%sql
-- Number of feature rows stored per run, smallest run first.
SELECT run_id, COUNT(*)
FROM expunge_features
GROUP BY run_id
ORDER BY COUNT(*)

 * postgresql://jupyter:***@localhost:5432/expunge
4 rows affected.


run_id,count
default-10k,29344
notebook-testing,29344
fips-subset-2022-02-11-default,2367607
default-full-table,9052752


In [14]:
# SQLAlchemy engine for the same database; the pandas read_sql / to_sql calls
# below go through it.
engine = sa.create_engine(DATABASE_URI)

engine

Engine(postgresql://jupyter:***@localhost:5432/expunge)

In [15]:
# Identifier columns plus the model inputs for the baseline run. The model
# input columns are listed in the same order as the training set columns.
default_run_query = """
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = 'default-full-table'
"""
features_df = pd.read_sql(default_run_query, engine)

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4
0,1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False
1,1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False
2,1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False
3,1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False
4,1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False


In [19]:
# Number of rows loaded for the run.
features_df.shape[0]

9052752

In [18]:
# Split the training frame into model inputs (X) and the label column (Y).
target_column = 'expungability'
X = train_df.drop(columns=[target_column])
Y = train_df[target_column]

In [20]:
# String-valued feature columns.
categorical_columns = [
    'chargetype',
    'disposition',
    'codesection'
]

# Every remaining training column except the label.
# NOTE(review): neither list is referenced by the later cells visible in this
# notebook -- the encoder is fit on all columns of X.
other_columns = [
    col for col in train_df.columns
    if col not in (*categorical_columns, 'expungability')
]

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# One-hot encoder with default settings; fit on the training features below
# and reused for every scored run.
encoder = OneHotEncoder()

In [23]:
# Learn the categories of every column of X (all columns are encoded, not
# only the string-valued ones).
encoder.fit(X)

OneHotEncoder()

In [24]:
# Encoded training matrix; this is what the classifier is trained on.
X_encoded = encoder.transform(X)

In [25]:
from sklearn import tree

# Fit a decision tree on the one-hot encoded training set. A fixed
# random_state makes the fitted tree (and the plot drawn from it) reproducible
# across kernel restarts; scikit-learn documents that tree construction is
# otherwise not deterministic.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_encoded, Y)

clf

DecisionTreeClassifier()

In [26]:
# Identifier / demographic columns that are carried into the output table but
# are not model inputs.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# Select the model inputs by the training column list instead of dropping
# `extra_cols`. This pins the column order to what `encoder` was fit on, and
# stays correct if the cell is re-run after 'expungability' has been added to
# `features_df` by the prediction cell.
features_encoded = encoder.transform(features_df[list(X.columns)])

In [27]:
# Score every row and attach the predicted class as a new column.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
1,1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False,Automatic
2,1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
3,1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic
4,1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False,Automatic


In [29]:
# Persist the scored rows, overwriting any earlier table of the same name.
features_df.to_sql(
    name='full_table_default',
    con=engine,
    if_exists='replace',
    index=False,
)



In [30]:
%%sql
-- Spot-check the table that was just written.
SELECT *
FROM "full_table_default"
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False,Automatic
1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic
1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,True,True,False,False,Automatic


In [31]:
%%sql
-- Row count of the written table, to compare with the size of features_df.
SELECT COUNT(*)
FROM full_table_default

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


count
9052752


# No Lifetime Run

In [37]:
# Run identifier for this scenario; used below to filter expunge_features and
# as the name of the output table.
run_id = 'full_table_no_lifetime'

In [38]:
# Load the featurization settings for this scenario.
config = ExpungeConfig.from_yaml('expunge_config_full_table_no_lifetime.yaml')

# Featurization is driven by `config`, while the classification cells filter
# on the separately typed `run_id`. Fail fast if the two ever drift apart,
# otherwise the wrong (or no) rows would be scored.
assert config.run_id == run_id, (
    f"config.run_id {config.run_id!r} does not match notebook run_id {run_id!r}"
)

config.run_id

23:37:55 [INFO - config.py]: Loading config from file: expunge_config_full_table_no_lifetime.yaml


'full_table_no_lifetime'

In [39]:
# Featurize with the config loaded above; per the captured log the input is
# loaded into 32 partitions.
run_featurization(config, n_partitions=32)

23:38:04 [INFO - featurize.py]: Featurization Run ID: full_table_no_lifetime
23:38:04 [INFO - featurize.py]: Initializing Dask distributed client
Perhaps you already have a cluster running?
Hosting the HTTP server on port 37729 instead
  f"Port {expected} is already in use.\n"
23:38:09 [INFO - featurize.py]: Reading from table: expunge_clean
23:38:09 [INFO - featurize.py]: Loading into 32 partitions
23:38:40 [INFO - featurize.py]: Building Dask task graph for feature construction
23:38:41 [INFO - featurize.py]: Expungement feature data will be written to: /tmp/expunge_data
23:38:41 [INFO - featurize.py]: Clearing any data from previous runs
23:38:42 [INFO - featurize.py]: Command 'rm -rf /tmp/expunge_data/expunge_features-*.csv' returned with exit value: 0
23:38:42 [INFO - featurize.py]: Executing Dask task graph and writing results to CSV...
23:46:22 [INFO - featurize.py]: File(s) written successfully
23:46:22 [INFO - featurize.py]: Opening connection to PostGres via Psycopg
23:46:22 

'full_table_no_lifetime'

In [40]:
%%sql
-- Preview a few featurized rows written for this run.
SELECT * 
FROM expunge_features 
WHERE run_id = 'full_table_no_lifetime'
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,last_hearing_delta,last_felony_conviction_delta,next_conviction_delta,from_present_delta,arrest_disqualifier,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,run_id
7000000000244,2000-06-13,A.18.2-266,Misdemeanor,1.0,Guilty,Guilty,Black,Male,41,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,7506.0,False,False,False,False,False,False,False,False,full_table_no_lifetime
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,2828.0,False,False,False,False,False,True,False,False,full_table_no_lifetime
1000000000003,2015-10-21,A.46.2-853,Misdemeanor,,Nolle Prosequi,,White,Male,153,Dismissed,Misdemeanor,covered elsewhere,False,,,,,,,1898.0,False,False,False,False,True,True,False,False,full_table_no_lifetime
1000000000004,2009-10-07,A.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,87,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,4103.0,False,False,False,False,False,False,False,False,full_table_no_lifetime
1000000000006,2003-04-14,MISSING,Misdemeanor,,Dismissed,,Black,Male,700,Dismissed,Misdemeanor,covered elsewhere,False,,,,,,,6471.0,False,False,False,False,False,False,False,False,full_table_no_lifetime


# Running Classification

In [41]:
# Identifier columns plus the model inputs for the current run. `run_id` is
# passed as a bound parameter instead of being formatted into the SQL text, so
# quoting is handled by the driver.
features_query = sa.text("""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = :run_id
""")
features_df = pd.read_sql(features_query, engine, params={'run_id': run_id})

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4
0,1000000000140,59,Black,Misdemeanor,Dismissed,covered elsewhere,True,True,False,True,True,True,True,False,False
1,1000000000140,600,Black,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,True,True,False,False
2,1000000000146,59,Black,Felony,Dismissed,covered elsewhere,False,True,False,False,False,False,True,False,False
3,1000000000146,59,Black,Felony,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False
4,1000000000146,59,Black,Misdemeanor,Dismissed,covered elsewhere,False,True,True,False,False,True,True,False,False


In [42]:
# Identifier / demographic columns that are carried into the output table but
# are not model inputs.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# Select the model inputs by the training column list instead of dropping
# `extra_cols`. This pins the column order to what `encoder` was fit on, and
# stays correct if the cell is re-run after 'expungability' has been added to
# `features_df` by the prediction cell.
features_encoded = encoder.transform(features_df[list(X.columns)])

In [43]:
# Score every row and attach the predicted class as a new column.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,1000000000140,59,Black,Misdemeanor,Dismissed,covered elsewhere,True,True,False,True,True,True,True,False,False,Petition
1,1000000000140,600,Black,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,True,True,False,False,Petition (pending)
2,1000000000146,59,Black,Felony,Dismissed,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
3,1000000000146,59,Black,Felony,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition (pending)
4,1000000000146,59,Black,Misdemeanor,Dismissed,covered elsewhere,False,True,True,False,False,True,True,False,False,Petition


In [44]:
# Persist the scored rows to a table named after the run, overwriting any
# earlier table of the same name.
features_df.to_sql(
    name=run_id,
    con=engine,
    if_exists='replace',
    index=False,
)

In [45]:
%%sql
-- Spot-check the table that was just written.
SELECT *
FROM "full_table_no_lifetime"
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
1000000000140,59,Black,Misdemeanor,Dismissed,covered elsewhere,True,True,False,True,True,True,True,False,False,Petition
1000000000140,600,Black,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,True,True,False,False,Petition (pending)
1000000000146,59,Black,Felony,Dismissed,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
1000000000146,59,Black,Felony,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition (pending)
1000000000146,59,Black,Misdemeanor,Dismissed,covered elsewhere,False,True,True,False,False,True,True,False,False,Petition


# Shorter Wait Run (10 -> 7, 7 -> 3)

In [46]:
# Run identifier for this scenario; used below to filter expunge_features and
# as the name of the output table.
run_id = 'full_table_shorter_wait'

In [47]:
# Load the featurization settings for this scenario.
config = ExpungeConfig.from_yaml('expunge_config_full_table_shorter_wait.yaml')

# Featurization is driven by `config`, while the classification cells filter
# on the separately typed `run_id`. Fail fast if the two ever drift apart,
# otherwise the wrong (or no) rows would be scored.
assert config.run_id == run_id, (
    f"config.run_id {config.run_id!r} does not match notebook run_id {run_id!r}"
)

config.run_id

00:23:57 [INFO - config.py]: Loading config from file: expunge_config_full_table_shorter_wait.yaml


'full_table_shorter_wait'

In [48]:
# Featurize with the config loaded above; per the captured log the input is
# loaded into 32 partitions.
run_featurization(config, n_partitions=32)

00:23:58 [INFO - featurize.py]: Featurization Run ID: full_table_shorter_wait
00:23:58 [INFO - featurize.py]: Initializing Dask distributed client
Perhaps you already have a cluster running?
Hosting the HTTP server on port 40327 instead
  f"Port {expected} is already in use.\n"
00:24:04 [INFO - featurize.py]: Reading from table: expunge_clean
00:24:04 [INFO - featurize.py]: Loading into 32 partitions
00:24:37 [INFO - featurize.py]: Building Dask task graph for feature construction
00:24:39 [INFO - featurize.py]: Expungement feature data will be written to: /tmp/expunge_data
00:24:39 [INFO - featurize.py]: Clearing any data from previous runs
00:24:40 [INFO - featurize.py]: Command 'rm -rf /tmp/expunge_data/expunge_features-*.csv' returned with exit value: 0
00:24:40 [INFO - featurize.py]: Executing Dask task graph and writing results to CSV...
00:32:13 [INFO - featurize.py]: File(s) written successfully
00:32:13 [INFO - featurize.py]: Opening connection to PostGres via Psycopg
00:32:13

'full_table_shorter_wait'

In [49]:
%%sql
-- Preview a few featurized rows written for this run.
SELECT * 
FROM expunge_features 
WHERE run_id = 'full_table_shorter_wait'
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,last_hearing_delta,last_felony_conviction_delta,next_conviction_delta,from_present_delta,arrest_disqualifier,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,run_id
5020000000893,2012-05-17,18.2-96,Misdemeanor,1,Nolle Prosequi,,White,Male,169,Dismissed,Misdemeanor,covered in 19.2-392.6 - B,True,,,,,,,3150.0,False,False,False,False,False,False,False,False,full_table_shorter_wait
5020000000893,2012-05-17,18.2-119,Misdemeanor,1,Guilty,,White,Male,169,Conviction,Misdemeanor,covered in 19.2-392.6 - B,True,,,,,,,3150.0,False,False,False,False,False,False,False,False,full_table_shorter_wait
5020000000895,2013-06-25,18.2-58,Felony,U,Guilty,Guilty,Black,Male,760,Conviction,Felony,covered elsewhere,True,,,2018-07-11,,,1842.0,2746.0,False,False,True,True,False,False,False,False,full_table_shorter_wait
5020000000895,2013-06-25,18.2-58,Felony,U,Nolle Prosequi,,Black,Male,760,Dismissed,Felony,covered elsewhere,True,,,2018-07-11,,,1842.0,2746.0,False,False,True,True,False,False,False,False,full_table_shorter_wait
5020000000895,2013-06-25,18.2-58,Felony,U,Nolle Prosequi,,Black,Male,760,Dismissed,Felony,covered elsewhere,True,,,2018-07-11,,,1842.0,2746.0,False,False,True,True,False,False,False,False,full_table_shorter_wait


# Running Classification

In [50]:
# Identifier columns plus the model inputs for the current run. `run_id` is
# passed as a bound parameter instead of being formatted into the SQL text, so
# quoting is handled by the driver.
features_query = sa.text("""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = :run_id
""")
features_df = pd.read_sql(features_query, engine, params={'run_id': run_id})

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4
0,5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False
1,5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False
2,5020000001086,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False
3,5020000001087,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False
4,5020000001095,650,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False


In [51]:
# Identifier / demographic columns that are carried into the output table but
# are not model inputs.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# Select the model inputs by the training column list instead of dropping
# `extra_cols`. This pins the column order to what `encoder` was fit on, and
# stays correct if the cell is re-run after 'expungability' has been added to
# `features_df` by the prediction cell.
features_encoded = encoder.transform(features_df[list(X.columns)])

In [52]:
# Score every row and attach the predicted class as a new column.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
1,5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
2,5020000001086,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
3,5020000001087,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
4,5020000001095,650,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic


In [53]:
# Persist the scored rows to a table named after the run, overwriting any
# earlier table of the same name.
features_df.to_sql(
    name=run_id,
    con=engine,
    if_exists='replace',
    index=False,
)

In [54]:
%%sql
-- Spot-check the table that was just written.
SELECT *
FROM "full_table_shorter_wait"
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
5020000000895,760,Black,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
5020000001086,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,True,False,False,Petition
5020000001087,197,White,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
5020000001095,650,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic


# No Lifetime, Shorter Wait

In [55]:
# Run identifier for this scenario; used below to filter expunge_features and
# as the name of the output table.
run_id = 'full_table_no_lifetime_shorter_wait'

In [56]:
# Load the featurization settings for this scenario.
config = ExpungeConfig.from_yaml('expunge_config_full_table_no_lifetime_shorter_wait.yaml')

# Featurization is driven by `config`, while the classification cells filter
# on the separately typed `run_id`. Fail fast if the two ever drift apart,
# otherwise the wrong (or no) rows would be scored.
assert config.run_id == run_id, (
    f"config.run_id {config.run_id!r} does not match notebook run_id {run_id!r}"
)

config.run_id

01:24:22 [INFO - config.py]: Loading config from file: expunge_config_full_table_no_lifetime_shorter_wait.yaml


'full_table_no_lifetime_shorter_wait'

In [57]:
# Featurize with the config loaded above; per the captured log the input is
# loaded into 32 partitions.
run_featurization(config, n_partitions=32)

01:24:22 [INFO - featurize.py]: Featurization Run ID: full_table_no_lifetime_shorter_wait
01:24:22 [INFO - featurize.py]: Initializing Dask distributed client
Perhaps you already have a cluster running?
Hosting the HTTP server on port 35171 instead
  f"Port {expected} is already in use.\n"
01:24:30 [INFO - featurize.py]: Reading from table: expunge_clean
01:24:30 [INFO - featurize.py]: Loading into 32 partitions
01:25:04 [INFO - featurize.py]: Building Dask task graph for feature construction
01:25:07 [INFO - featurize.py]: Expungement feature data will be written to: /tmp/expunge_data
01:25:07 [INFO - featurize.py]: Clearing any data from previous runs
01:25:10 [INFO - featurize.py]: Command 'rm -rf /tmp/expunge_data/expunge_features-*.csv' returned with exit value: 0
01:25:10 [INFO - featurize.py]: Executing Dask task graph and writing results to CSV...
01:33:09 [INFO - featurize.py]: File(s) written successfully
01:33:09 [INFO - featurize.py]: Opening connection to PostGres via Psyc

'full_table_no_lifetime_shorter_wait'

In [58]:
%%sql
-- Preview a few featurized rows written for this run.
SELECT * 
FROM expunge_features 
WHERE run_id = 'full_table_no_lifetime_shorter_wait'
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,HearingDate,CodeSection,ChargeType,Class,DispositionCode,Plea,Race,Sex,fips,disposition,chargetype,codesection,convictions,last_hearing_date,last_felony_conviction_date,next_conviction_date,last_hearing_delta,last_felony_conviction_delta,next_conviction_delta,from_present_delta,arrest_disqualifier,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,run_id
1000000000002,2013-04-04,C.46.2-862,Misdemeanor,1.0,Guilty,Guilty,Unknown,Male,107,Conviction,Misdemeanor,covered elsewhere,True,,,,,,,2828.0,False,False,False,False,False,False,False,False,full_table_no_lifetime_shorter_wait
1000000000118,2013-05-28,4.1-308,Misdemeanor,,Guilty In Absentia,Tried In Absentia,Black,Male,13,Conviction,Misdemeanor,covered elsewhere,True,2011-03-25,,2013-12-04,795.0,,190.0,2774.0,True,False,True,True,False,False,False,False,full_table_no_lifetime_shorter_wait
1000000000118,2013-12-04,14.2-81,Misdemeanor,,Guilty In Absentia,Tried In Absentia,Black,Male,13,Conviction,Misdemeanor,covered elsewhere,True,2013-05-28,,2015-07-15,190.0,,588.0,2584.0,True,False,True,True,False,False,False,False,full_table_no_lifetime_shorter_wait
1000000000118,2015-07-15,18.2-388,Misdemeanor,4.0,Guilty,Not Guilty,Black,Male,13,Conviction,Misdemeanor,covered elsewhere,True,2013-12-04,,2016-11-22,588.0,,496.0,1996.0,True,False,True,True,False,True,False,False,full_table_no_lifetime_shorter_wait
1000000000118,2015-07-15,17-21,Misdemeanor,1.0,Guilty,Not Guilty,Black,Male,13,Conviction,Misdemeanor,covered elsewhere,True,2013-12-04,,2016-11-22,588.0,,496.0,1996.0,True,False,True,True,False,True,False,False,full_table_no_lifetime_shorter_wait


# Running Classification

In [59]:
# Identifier columns plus the model inputs for the current run. `run_id` is
# passed as a bound parameter instead of being formatted into the SQL text, so
# quoting is handled by the driver.
features_query = sa.text("""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = :run_id
""")
features_df = pd.read_sql(features_query, engine, params={'run_id': run_id})

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4
0,1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False
1,1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False
2,1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False
3,1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False
4,1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False


In [60]:
# Identifier / demographic columns that are carried into the output table but
# are not model inputs.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# Select the model inputs by the training column list instead of dropping
# `extra_cols`. This pins the column order to what `encoder` was fit on, and
# stays correct if the cell is re-run after 'expungability' has been added to
# `features_df` by the prediction cell.
features_encoded = encoder.transform(features_df[list(X.columns)])

In [61]:
# Score every row and attach the predicted class as a new column.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

Unnamed: 0,person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
1,1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False,Automatic
2,1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
3,1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic
4,1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False,Automatic


In [62]:
# Persist the scored rows to a table named after the run, overwriting any
# earlier table of the same name.
features_df.to_sql(
    name=run_id,
    con=engine,
    if_exists='replace',
    index=False,
)

In [64]:
%%sql
-- Spot-check the table that was just written.
SELECT *
FROM "full_table_no_lifetime_shorter_wait"
LIMIT 5

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,fips,Race,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
1000000000002,107,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
1000000000003,153,White,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False,Automatic
1000000000004,87,Unknown,Misdemeanor,Conviction,covered elsewhere,False,True,False,False,False,False,False,False,False,Petition
1000000000006,700,Black,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,False,False,False,Automatic
1000000000008,740,Unknown,Misdemeanor,Dismissed,covered elsewhere,False,False,False,False,False,False,True,False,False,Automatic


# Decision Tree Visualization

In [39]:
import matplotlib.pyplot as plt

Plot decision tree, save to .png image

In [None]:
# `clf` was trained on the one-hot encoded matrix, so it has one feature per
# (column, category) pair rather than one per column of `X`. Node labels must
# therefore come from the encoder; passing `X.columns` gives a name list that
# does not line up with the tree's feature indices.
# `get_feature_names_out` is the current scikit-learn API; fall back to the
# older `get_feature_names` when it is not available.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_feature_names = encoder.get_feature_names_out(X.columns)
else:
    encoded_feature_names = encoder.get_feature_names(X.columns)

plt.figure()
tree.plot_tree(
    clf,
    filled=True,
    feature_names=list(encoded_feature_names),
    impurity=False,
)
# High dpi so individual nodes stay legible when zooming into the saved image.
plt.savefig('tree.png', format='png', bbox_inches="tight", dpi=700)