In [1]:
from sqlalchemy import create_engine
import psycopg2 as db
import pandas as pd
import numpy as np
import os
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import random
random.seed(42)  #use any integer, I'm partial to 42, 434, etc.

# Remove code_section and fips from the data first
# Keep race and gender and felony and misdemeanor 
# Change the misdeanor and charge class type and just combine them to like "Misdeanor 5"
# Look at how important race is in the conditioning of the model (feature importance)
# Model fit in the way that datacamp says how do it (accuracy fit)
# Look at accuracy 

In [2]:
postPass=os.environ["POSTGRES_PASS"]

In [3]:
engine = create_engine("postgresql+psycopg2://{user}:{pw}@localhost/{db}"
.format(user="jupyter", pw=postPass, db="expunge"))


In [4]:
new_query = """
SELECT *, charge_type || charge_class as charge_type
FROM charges
"""

charges = pd.read_sql(new_query, con=engine)

In [5]:
charges.disposition_code.value_counts()


Guilty                     4820209
Nolle Prosequi             1688688
Dismissed                  1314883
Guilty In Absentia          989686
Not Guilty                  176021
Not Guilty/Acquitted         50611
No Indictment Presented       6551
Not True Bill                 6158
Dismissed/Other                770
Name: disposition_code, dtype: int64

In [9]:
charges= charges.assign(conviction=charges['disposition_code'] == 'Guilty' or charges['disposition_code'] == 'Guilty In Absentia' )

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [6]:
charges['conviction']= [x in ['Guilty', 'Guilty In Absentia'] for x in charges['disposition_code']]

In [12]:
charges

Unnamed: 0,id,person_id,hearing_date,code_section,charge_type,charge_class,disposition_code,plea,race,sex,fips,charge_type.1,conviction
0,3048743,306100000000163,2016-08-12,28-12,Misdemeanor,,Guilty,,Black,Male,701,MisdemeanorNA,True
1,3048744,306100000000163,2016-11-04,28-29,Misdemeanor,,Dismissed,,Black,Male,700,MisdemeanorNA,False
2,3048745,88220000000471,2013-10-16,46.2-300,Misdemeanor,,Guilty,Guilty,Black,Male,059,MisdemeanorNA,True
3,3048746,11221000000319,2013-07-30,C.46.2-862,Misdemeanor,,Guilty,,White,Female,131,MisdemeanorNA,True
4,3048747,354170000000498,2010-09-28,46.2-308,Misdemeanor,,Guilty,Guilty,White,Male,013,MisdemeanorNA,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9053572,3048738,306100000000163,2012-07-20,28-29,Misdemeanor,,Dismissed,,Black,Male,701,MisdemeanorNA,False
9053573,3048739,306100000000163,2012-12-06,28-12,Misdemeanor,4,Guilty,,Black,Male,701,Misdemeanor4,True
9053574,3048740,306100000000163,2013-04-10,18.2-96,Misdemeanor,1,Guilty,,Black,Male,701,Misdemeanor1,True
9053575,3048741,306100000000163,2014-03-28,28-12,Misdemeanor,4,Dismissed,,Black,Male,701,Misdemeanor4,False


In [25]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

X=charges.drop(['conviction', 'hearing_date', 'code_section','fips', 'charge_type', 'charge_class', 'person_id', 'id', 'disposition_code'],  axis=1)  # Features
y=charges['conviction']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test
#Taking all 90 million rows and putting them into a lottery where 70% go into training set and other goes in test set
#Training data used to estimate random forest model and test used to test the model

In [26]:
from sklearn.preprocessing import OneHotEncoder
xohe = OneHotEncoder()
data = xohe.fit(X_train)
data.categories_

[array(['Alford', 'Guilty', 'NA', 'Nolo Contendere', 'Not Guilty',
        'Tried In Absentia'], dtype=object),
 array(['American Indian or Alaskan Native', 'Asian or Pacific Islander',
        'Black', 'Hispanic', 'Unknown', 'White'], dtype=object),
 array(['Female', 'Male'], dtype=object)]

In [27]:
xoheTrans = xohe.transform(X_train)

In [28]:
#For the actual test
xoheTest = xohe.transform(X_test)


In [29]:
%%time
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(xoheTrans,y_train)

CPU times: user 10min 45s, sys: 28.6 s, total: 11min 14s
Wall time: 11min 14s


RandomForestClassifier()

In [30]:
y_pred=clf.predict(xoheTrans)
y_pred_test=clf.predict(xoheTest)

In [32]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_train, y_pred))

Accuracy: 0.6420744889588218


In [31]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_test))

Accuracy: 0.6425664396478152


In [33]:

clf.feature_importances_

array([9.60398362e-04, 2.83597953e-01, 4.93210069e-01, 4.92738179e-02,
       6.18607087e-02, 5.92822340e-02, 3.87512204e-04, 2.07269756e-03,
       6.68432842e-03, 3.20673046e-02, 5.61979772e-03, 3.13909454e-03,
       8.96038180e-04, 9.48045884e-04])

In [41]:
import pandas as pd
feature_imp = pd.Series(clf.feature_importances_,index=np.concatenate(data.categories_)).sort_values(ascending=False)
feature_imp

NA                                   0.493210
Guilty                               0.283598
Not Guilty                           0.061861
Tried In Absentia                    0.059282
Nolo Contendere                      0.049274
Hispanic                             0.032067
Black                                0.006684
Unknown                              0.005620
White                                0.003139
Asian or Pacific Islander            0.002073
Alford                               0.000960
Male                                 0.000948
Female                               0.000896
American Indian or Alaskan Native    0.000388
dtype: float64

array(['Alford', 'Guilty', 'NA', 'Nolo Contendere', 'Not Guilty',
       'Tried In Absentia', 'American Indian or Alaskan Native',
       'Asian or Pacific Islander', 'Black', 'Hispanic', 'Unknown',
       'White', 'Female', 'Male'], dtype=object)