In [1]:
import logging
import os
from urllib.parse import quote

import pandas as pd
import sqlalchemy as sa

In [2]:
# Root-logger setup: INFO and above, each record prefixed with a wall-clock
# time, the level name and the emitting module's file name.
LOG_FORMAT = '%(asctime)s [%(levelname)s - %(module)s.py]: %(message)s'
LOG_TIME_FORMAT = '%H:%M:%S'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_TIME_FORMAT)

In [3]:
# Connection settings for the PostgreSQL database used throughout the notebook.
USER = 'jupyter'
# Taken from the environment so the secret is not stored in the notebook;
# raises KeyError when POSTGRES_PASS is not set.
PASSWORD = os.environ['POSTGRES_PASS']
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

# Percent-encode the password before embedding it: characters such as '@', ':'
# or '/' would otherwise be read as URL delimiters and corrupt the connection
# string. safe='' makes sure '/' is encoded too.
DATABASE_URI = f"postgresql://{USER}:{quote(PASSWORD, safe='')}@{HOST}:{PORT}/{DB}"

In [4]:
# Load the `sql` IPython extension and connect it to the same database URI,
# which is what the %sql / %%sql cells in this notebook run against.
%load_ext sql
%sql {DATABASE_URI}

In [5]:
# SQLAlchemy engine handed to the pandas read_sql / to_sql calls in later cells.
engine = sa.create_engine(DATABASE_URI)

# Display the engine; the recorded output shows the password masked as ***.
engine

Engine(postgresql://jupyter:***@localhost:5432/expunge)

In [6]:
import sys
import os

# Local checkout added to the import path.
# NOTE(review): presumably this is where the `pipeline` package imported in the
# next cell lives - confirm.
PROJECT_ROOT = os.path.expanduser('~/LAJC-expungement')

# Guard the append so re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [8]:
from pipeline.featurize import (
    run_featurization,
    ExpungeConfig
)

# Examine Previous Runs

In [None]:
%%sql
-- Number of rows stored in expunge_features per run_id, smallest run first.
SELECT 
    run_id,
    COUNT(*)
FROM expunge_features
GROUP BY 1
ORDER BY 2

# Prep Encoder

In [9]:
# Labelled training table, read from a CSV next to the notebook.
training_csv_path = './training_set.csv'
train_df = pd.read_csv(training_csv_path)

# Print the (rows, columns) shape, then display the first rows.
print(train_df.shape)
train_df.head()

(12288, 13)


Unnamed: 0,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,Misdemeanor,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
1,Felony,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
2,Misdemeanor,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
3,Felony,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
4,Misdemeanor,Deferral Dismissal,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic


In [10]:
# Split the training table into model inputs (X) and the target label (Y).
target_column = 'expungability'
X = train_df.drop(columns=target_column)
Y = train_df[target_column]

In [11]:
# Training columns that hold string categories.
categorical_columns = ['chargetype', 'disposition', 'codesection']

# Every other training column except the target label, in the frame's own
# column order.
excluded_columns = {*categorical_columns, 'expungability'}
other_columns = [col for col in train_df.columns if col not in excluded_columns]

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One-hot encoder created with scikit-learn's default settings.
encoder = OneHotEncoder()

In [14]:
# Learn the set of distinct values for every column of X; all columns are
# encoded, the True/False flag columns included.
encoder.fit(X)

OneHotEncoder()

In [15]:
# Encode the training features with the fitted encoder.
X_encoded = encoder.transform(X)

In [17]:
# Check the container type returned by transform (the recorded output is a
# SciPy CSR sparse matrix).
type(X_encoded)

scipy.sparse.csr.csr_matrix

In [None]:
from sklearn import tree

# Fit a decision tree on the one-hot encoded training features.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so ties between equally good splits could otherwise
# be broken differently from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_encoded, Y)  # fit() returns the estimator itself; no reassignment needed

clf

# Classification Runs

## Misdemeanor Pending for 5

In [None]:
# Run identifier for this scenario; it selects rows in expunge_features and
# names the output table written further down.
run_id = 'full_table_misd_5'

In [None]:
# Load the scenario settings from the YAML file named after the run.
config_path = f'expunge_config_{run_id}.yaml'
config = ExpungeConfig.from_yaml(config_path)

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for this config with n_partitions=32.
# NOTE(review): presumably this is what populates expunge_features for
# config.run_id - confirm in pipeline.featurize.
run_featurization(config, n_partitions=32)

In [None]:
# Spot-check a few rows of expunge_features stored under this run_id.
%sql SELECT * FROM expunge_features WHERE run_id = '{run_id}' LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the current run.
features_query = f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
"""
features_df = pd.read_sql(sql=features_query, con=engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to a table named after the run.
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql(run_id, engine, if_exists='replace', index=False)

In [None]:
# Preview the table named after run_id.
%sql SELECT * FROM "{run_id}" LIMIT 5

## Misdemeanor Pending for 5, Lifetime false

In [None]:
# Run identifier for this scenario; it selects rows in expunge_features and
# names the output table written further down.
run_id = 'full_table_misd_5_no_lifetime'

In [None]:
# Load the scenario settings from the YAML file named after the run.
config_path = f'expunge_config_{run_id}.yaml'
config = ExpungeConfig.from_yaml(config_path)

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for this config with n_partitions=32.
# NOTE(review): presumably this is what populates expunge_features for
# config.run_id - confirm in pipeline.featurize.
run_featurization(config, n_partitions=32)

In [None]:
# Spot-check a few rows of expunge_features stored under this run_id.
%sql SELECT * FROM expunge_features WHERE run_id = '{run_id}' LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the current run.
features_query = f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
"""
features_df = pd.read_sql(sql=features_query, con=engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to a table named after the run.
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql(run_id, engine, if_exists='replace', index=False)

In [None]:
# Preview the table named after run_id.
%sql SELECT * FROM "{run_id}" LIMIT 5

## Default Run

In [None]:
# Run identifier for the default scenario; rows in expunge_features are
# filtered on this value.
run_id = 'default-full-table'

In [None]:
# Load the default scenario settings from the unsuffixed YAML file.
config_path = 'expunge_config.yaml'
config = ExpungeConfig.from_yaml(config_path)

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for the default config. No n_partitions is
# passed here (the other scenario sections pass 32), so the function's own
# default applies.
run_featurization(config)

In [None]:
# Spot-check a few rows of expunge_features stored under this run_id.
%sql SELECT * FROM expunge_features WHERE run_id = '{run_id}' LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the default run.
# The filter reuses run_id (set at the top of this section) as the other
# scenario sections do, instead of repeating the 'default-full-table' literal,
# so the identifier is defined in one place only.
features_df = pd.read_sql(f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
""", engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to the fixed table name 'full_table_default'
# (not run_id, which is 'default-full-table' in this section).
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql('full_table_default', engine, if_exists='replace', index=False)

In [None]:
%%sql
-- Preview the first rows of the default-run output table.
SELECT *
FROM "full_table_default"
LIMIT 5

In [None]:
%%sql
-- Total number of rows in the default-run output table.
SELECT COUNT(*)
FROM full_table_default

## No Lifetime Run

In [None]:
# Run identifier for this scenario; it selects rows in expunge_features and
# names the output table written further down.
run_id = 'full_table_no_lifetime'

In [None]:
# Load the scenario settings from the YAML file named after the run.
config_path = f'expunge_config_{run_id}.yaml'
config = ExpungeConfig.from_yaml(config_path)

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for this config with n_partitions=32.
# NOTE(review): presumably this is what populates expunge_features for
# config.run_id - confirm in pipeline.featurize.
run_featurization(config, n_partitions=32)

In [None]:
# Spot-check a few rows of expunge_features stored under this run_id.
%sql SELECT * FROM expunge_features WHERE run_id = '{run_id}' LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the current run.
features_query = f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
"""
features_df = pd.read_sql(sql=features_query, con=engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to a table named after the run.
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql(run_id, engine, if_exists='replace', index=False)

In [None]:
# Preview the table named after run_id.
%sql SELECT * FROM "{run_id}" LIMIT 5

## Shorter Wait Run (10 -> 7, 7 -> 3)

In [None]:
# Run identifier for this scenario; it selects rows in expunge_features and
# names the output table written further down.
run_id = 'full_table_shorter_wait'

In [None]:
# Load the scenario settings from the YAML file named after the run.
# The file name is built from run_id, as in the other scenario sections, so
# the identifier is spelled out in one place only. With the run_id set in this
# section it resolves to the same file as the former literal.
config = ExpungeConfig.from_yaml(f'expunge_config_{run_id}.yaml')

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for this config with n_partitions=32.
# NOTE(review): presumably this is what populates expunge_features for
# config.run_id - confirm in pipeline.featurize.
run_featurization(config, n_partitions=32)

In [None]:
%%sql
SELECT * 
FROM expunge_features 
WHERE run_id = 'full_table_shorter_wait'
LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the current run.
features_query = f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
"""
features_df = pd.read_sql(sql=features_query, con=engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to a table named after the run.
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql(run_id, engine, if_exists='replace', index=False)

In [None]:
%%sql
SELECT *
FROM "full_table_shorter_wait"
LIMIT 5

## No Lifetime, Shorter Wait

In [None]:
# Run identifier for this scenario; it selects rows in expunge_features and
# names the output table written further down.
run_id = 'full_table_no_lifetime_shorter_wait'

In [None]:
# Load the scenario settings from the YAML file named after the run.
# The file name is built from run_id, as in the other scenario sections, so
# the identifier is spelled out in one place only. With the run_id set in this
# section it resolves to the same file as the former literal.
config = ExpungeConfig.from_yaml(f'expunge_config_{run_id}.yaml')

# Show the run_id carried by the config so it can be compared with run_id.
config.run_id

In [None]:
# Run the featurization pipeline for this config with n_partitions=32.
# NOTE(review): presumably this is what populates expunge_features for
# config.run_id - confirm in pipeline.featurize.
run_featurization(config, n_partitions=32)

In [None]:
%%sql
SELECT * 
FROM expunge_features 
WHERE run_id = 'full_table_no_lifetime_shorter_wait'
LIMIT 5

In [None]:
# Identifier columns plus the model input columns for the current run.
features_query = f"""
    SELECT 
        person_id,
        fips,
        "Race",
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = '{run_id}'
"""
features_df = pd.read_sql(sql=features_query, con=engine)

features_df.head()

In [None]:
# Columns kept in the output table but not fed to the model.
extra_cols = [
    'person_id',
    'fips',
    'Race'
]

# The prediction cell adds an 'expungability' column to features_df. Dropping
# it here when present (errors='ignore') keeps this cell re-runnable; without
# that, a second run would hand the encoder one column more than it was fitted on.
model_inputs = (
    features_df
    .drop(columns=extra_cols)
    .drop(columns='expungability', errors='ignore')
)
features_encoded = encoder.transform(model_inputs)

In [None]:
# Label every feature row with the class predicted by the fitted tree.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

In [None]:
# Write features plus predictions to a table named after the run.
# if_exists='replace' overwrites any existing table of that name; the
# DataFrame index is not written.
features_df.to_sql(run_id, engine, if_exists='replace', index=False)

In [None]:
%%sql
SELECT *
FROM "full_table_no_lifetime_shorter_wait"
LIMIT 5

# Boolean Flags for Districts

In [None]:
%%sql
-- Preview the first ten rows of the default-run output table.
SELECT *
FROM full_table_default
LIMIT 10

# Decision Tree Visualization

In [None]:
import matplotlib.pyplot as plt

Plot the fitted decision tree and save it as a `.png` image.

In [None]:
# The tree was fitted on the one-hot encoded matrix, so its feature indices
# refer to the encoder's output columns, not to the raw columns of X. Passing
# X.columns as feature_names would label nodes with the wrong names (or run
# past the end of the list), so build the names from the encoder instead.
# get_feature_names_out exists from scikit-learn 1.0 on; older releases only
# provide get_feature_names, hence the fallback.
_encoded_names_fn = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_feature_names = list(_encoded_names_fn(list(X.columns)))

# Draw the tree (nodes coloured by class, impurity omitted) and save it as a
# high-resolution PNG next to the notebook.
plt.figure()
tree.plot_tree(clf, filled=True, feature_names=encoded_feature_names, impurity=False)
plt.savefig('tree.png', format='png', bbox_inches="tight", dpi=700)