In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import sqlalchemy as sa

In [2]:
# Load the labelled training examples: feature columns plus the
# 'expungability' outcome for each row.
training_set_path = './training_set.csv'
train_df = pd.read_csv(training_set_path)

print(train_df.shape)
train_df.head()

(12288, 13)


Unnamed: 0,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,Misdemeanor,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
1,Felony,Conviction,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
2,Misdemeanor,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic
3,Felony,Dismissed,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Petition
4,Misdemeanor,Deferral Dismissal,covered in 19.2-392.6 - A,False,False,False,False,False,False,False,False,False,Automatic


In [3]:
# Class balance of the training labels.
train_df['expungability'].value_counts()

Not eligible           5920
Petition               4720
Automatic              1280
Automatic (pending)     256
Petition (pending)      112
Name: expungability, dtype: int64

In [4]:
# Connection settings for the Postgres database that holds the expunge tables.
# The password comes from the environment so it never appears in the notebook.
USER = 'jupyter'
PASSWORD = os.environ['POSTGRES_PASS']  # raises KeyError if POSTGRES_PASS is unset
HOST = 'localhost'
PORT = '5432'
DB = 'expunge'

# URL-encode the password: characters such as '@', ':' or '/' would otherwise
# be read as URL delimiters and corrupt the connection string.
DATABASE_URI = f"postgresql://{USER}:{quote_plus(PASSWORD)}@{HOST}:{PORT}/{DB}"

In [26]:
# Load the SQL magic extension and point it at the configured database,
# so later cells can run queries with %%sql.
%load_ext sql
%sql {DATABASE_URI}

In [28]:
%%sql
SELECT 
    run_id,
    COUNT(*)
FROM expunge_features
GROUP BY 1  -- positional reference: group by run_id
ORDER BY 2  -- positional reference: ascending by row count

 * postgresql://jupyter:***@localhost:5432/expunge
1 rows affected.


run_id,count
default-full-table,9052752


In [5]:
# SQLAlchemy engine used by the pandas read_sql / to_sql calls in this notebook.
engine = sa.create_engine(DATABASE_URI)

engine

Engine(postgresql://jupyter:***@localhost:5432/expunge)

In [30]:
# Pull the model input columns (plus person_id, kept as an identifier)
# for the 'default-10k' run of the feature table.
features_query = """
    SELECT 
        person_id,
        "chargetype",
        "disposition",
        "codesection",
        "arrest_disqualifier",
        "convictions",
        "felony_conviction_disqualifier",
        "next_conviction_disqualifier_after_misdemeanor",
        "next_conviction_disqualifier_after_felony",
        "pending_after_misdemeanor",
        "pending_after_felony",
        "class1_2",
        "class3_4"
    FROM expunge_features
    WHERE run_id = 'default-10k'
"""
features_df = pd.read_sql(features_query, engine)

features_df.head()

Unnamed: 0,person_id,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4
0,24091000000224,Misdemeanor,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False
1,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False
2,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False
3,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,False,False,False,False,False,False
4,24100000000431,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False


In [8]:
# Separate the label column (Y) from the model inputs (X).
target_column = 'expungability'
Y = train_df[target_column]
X = train_df.drop(columns=[target_column])

In [9]:
# String-valued feature columns.
categorical_columns = ['chargetype', 'disposition', 'codesection']

# Every remaining training column except the label.
# NOTE(review): neither list is referenced by the other cells visible here.
excluded_columns = {*categorical_columns, 'expungability'}
other_columns = [
    col for col in train_df.columns
    if col not in excluded_columns
]

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# One-hot encoder with scikit-learn's default settings.
encoder = OneHotEncoder()

In [12]:
# Learn the categories of every column in X; the whole frame is passed,
# so the boolean flag columns are one-hot encoded as well.
encoder.fit(X)

OneHotEncoder()

In [13]:
# Encode the training features with the fitted encoder.
X_encoded = encoder.transform(X)

In [14]:
from sklearn import tree

# Fit a decision tree on the one-hot encoded training features.
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits are broken randomly. Fixing random_state makes the
# fitted tree, and every prediction derived from it, reproducible on re-run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_encoded, Y)

clf

DecisionTreeClassifier()

In [33]:
# Encode exactly the columns the encoder was fitted on, in training order.
# Selecting by X.columns (rather than dropping 'person_id') keeps this cell
# re-runnable after an 'expungability' column has been added to features_df,
# and protects against the SQL column order drifting from the training set.
features_encoded = encoder.transform(features_df[X.columns])

In [35]:
# Label every feature row with the tree's predicted class and store the
# prediction next to the inputs.
predicted_labels = clf.predict(features_encoded)
features_df['expungability'] = predicted_labels

features_df.head()

Unnamed: 0,person_id,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
0,24091000000224,Misdemeanor,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
1,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False,Not eligible
2,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False,Not eligible
3,24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,False,False,False,False,False,False,Petition
4,24100000000431,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition


In [37]:
# Persist the labelled rows to Postgres.
# if_exists='replace' makes the cell re-runnable: the pandas default ('fail')
# raises ValueError once the table exists. Note that 'replace' drops and
# recreates the table, so earlier contents of expunge_results_10k are discarded.
features_df.to_sql('expunge_results_10k', engine, index=False, if_exists='replace')

In [41]:
%%sql
SELECT *
FROM expunge_results_10k
LIMIT 5  -- spot check of the written table; without ORDER BY the row order is not guaranteed

 * postgresql://jupyter:***@localhost:5432/expunge
5 rows affected.


person_id,chargetype,disposition,codesection,arrest_disqualifier,convictions,felony_conviction_disqualifier,next_conviction_disqualifier_after_misdemeanor,next_conviction_disqualifier_after_felony,pending_after_misdemeanor,pending_after_felony,class1_2,class3_4,expungability
24091000000224,Misdemeanor,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition
24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False,Not eligible
24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,True,True,False,False,False,False,Not eligible
24091000000224,Misdemeanor,Conviction,covered elsewhere,True,True,False,False,False,False,False,False,False,Petition
24100000000431,Felony,Dismissed,covered elsewhere,False,True,False,True,True,False,False,False,False,Petition


# Decision Tree Visualization

In [39]:
import matplotlib.pyplot as plt

Plot the decision tree and save it as a `.png` image.

In [None]:
# The tree was fitted on the one-hot encoded matrix, so its features are the
# encoder's output columns (one per category value), not the raw columns of X.
# Label the nodes with the encoder's own output names.
# get_feature_names_out exists from scikit-learn 1.0 onwards; older releases
# only provide get_feature_names.
_feature_name_fn = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_feature_names = list(_feature_name_fn(list(X.columns)))

plt.figure()
tree.plot_tree(clf, filled=True, feature_names=encoded_feature_names, impurity=False)
plt.savefig('tree.png', format='png', bbox_inches="tight", dpi=700)

# Results Comparison with the 100k Dataset

In [55]:
# Per-row results: a rowID, Race, and one expungability label column per scenario.
df_results = pd.read_csv('results.csv')

df_results.head()

Unnamed: 0,rowID,Race,expungable,expungable_no_lifetimelimit,expungable_7_to_5,expungable_7_to_5_and_10_to_5,expungableB462301,expungableA462862,expungable462300,expungableC462862,expungable18295
0,135513,Black (Non-Hispanic),Petition,Petition,Petition,Petition,Petition,Petition,Petition,Petition,Petition
1,135514,Black (Non-Hispanic),Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic
2,135515,Black (Non-Hispanic),Petition,Petition,Petition,Petition,Petition,Petition,Petition,Petition,Petition
3,282217,Black (Non-Hispanic),Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible
4,282218,Black (Non-Hispanic),Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic,Automatic


In [51]:
# Charge-level records for the 100k data set (one row per hearing/charge).
df_100k = pd.read_csv('data100k.csv')

df_100k.head()

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within7,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime
0,102090000000110,2019-02-28,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False
1,343221000000125,2009-12-07,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,False,False,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False
2,343221000000125,2011-01-20,A.46.2-707,covered elsewhere,Misdemeanor,Misdemeanor,3,Guilty,Conviction,,...,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False
3,343221000000125,2011-07-01,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False
4,343221000000125,2012-10-15,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False


In [52]:
# Build a 1-based row identifier from the positional index to use as join key.
# NOTE(review): assumes rowID in results.csv refers to the 1-based row position
# in data100k.csv -- confirm against whatever produced results.csv.
df_100k['rowID'] = df_100k.index + 1

df_100k.head()

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime,rowID
0,102090000000110,2019-02-28,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,1
1,343221000000125,2009-12-07,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,False,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,2
2,343221000000125,2011-01-20,A.46.2-707,covered elsewhere,Misdemeanor,Misdemeanor,3,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,3
3,343221000000125,2011-07-01,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,4
4,343221000000125,2012-10-15,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,5


In [56]:
# Size check before joining on rowID.
df_results.shape

(293598, 11)

In [53]:
# Size check before joining on rowID.
df_100k.shape

(293598, 29)

In [60]:
# Show every column when displaying wide frames. The fully qualified key is
# required: on newer pandas the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' and set_option raises OptionError.
pd.set_option('display.max_columns', None)

In [61]:
# Attach the results columns to each 100k row by matching rowID.
# Column names present in both frames keep the plain name for the results
# copy; the df_100k copy is renamed with an '_orig' suffix.
results_by_row_id = df_results.set_index('rowID')
df_join = df_100k.join(results_by_row_id, on='rowID', lsuffix='_orig')

df_join.head()

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,Race_orig,Sex,fips,convictions,arrests,felony10,sevenyear,tenyear,within7,within10,class1_2,class3_4,expungable_orig,old_expungable,expungable_no_lifetimelimit_orig,reason,sameday,lifetime,rowID,Race,expungable,expungable_no_lifetimelimit,expungable_7_to_5,expungable_7_to_5_and_10_to_5,expungableB462301,expungableA462862,expungable462300,expungableC462862,expungable18295
0,102090000000110,2019-02-28,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,Black(Non-Hispanic),Male,25,True,False,False,False,False,True,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,1,Black (Non-Hispanic),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending),Automatic (pending)
1,343221000000125,2009-12-07,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,Black(Non-Hispanic),Female,540,True,False,False,True,True,False,False,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,2,Black (Non-Hispanic),Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible
2,343221000000125,2011-01-20,A.46.2-707,covered elsewhere,Misdemeanor,Misdemeanor,3,Guilty,Conviction,,Black(Non-Hispanic),Female,540,True,True,False,True,True,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,3,Black (Non-Hispanic),Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible
3,343221000000125,2011-07-01,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,Black(Non-Hispanic),Female,3,True,True,False,True,True,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,4,Black (Non-Hispanic),Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible
4,343221000000125,2012-10-15,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,Black(Non-Hispanic),Female,3,True,True,False,True,True,False,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,5,Black (Non-Hispanic),Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible,Not eligible


In [65]:
# Inspect a single joined record: the third row after ordering by person_id.
df_join.sort_values('person_id').iloc[2]

person_id                                                               1000000000013
HearingDate                                                                2010-02-23
CodeSection                                                                B.46.2-301
codesection                                                         covered elsewhere
ChargeType                                                                Misdemeanor
chargetype                                                                Misdemeanor
Class                                                                               1
DispositionCode                                                                Guilty
disposition                                                                Conviction
Plea                                                                              NaN
Race_orig                                                         Black(Non-Hispanic)
Sex                                                   

In [64]:
# Full 'reason' text for the third row by person_id (table views truncate it).
df_join.sort_values('person_id').iloc[2]['reason']

'Conviction of misdemeanor charges listed in 19.2-392.6 B with no convictions within 7 years from disposition date'