In [1]:
# Import libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Set styling parameters
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
plt.style.use('fivethirtyeight')

from poibin import PoiBin
import pytest

from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Import ML models 

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

import xgboost as xgb

In [3]:
cases = pd.read_csv('SCDB_2018_01_caseCentered_Citation.csv', encoding = 'ISO-8859-1')

In [4]:
cases.head()

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,...,authorityDecision1,authorityDecision2,lawType,lawSupp,lawMinor,majOpinWriter,majOpinAssigner,splitVote,majVotes,minVotes
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,4.0,,6.0,600.0,35 U.S.C. § 33,78.0,78.0,1,8,1
1,1946-002,1946-002-01,1946-002-01-01,1946-002-01-01-01,11/18/1946,1,329 U.S. 14,67 S. Ct. 13,91 L. Ed. 12,1946 U.S. LEXIS 1725,...,4.0,,6.0,600.0,18 U.S.C. § 398,81.0,87.0,1,6,3
2,1946-003,1946-003-01,1946-003-01-01,1946-003-01-01-01,11/18/1946,1,329 U.S. 29,67 S. Ct. 1,91 L. Ed. 22,1946 U.S. LEXIS 3037,...,1.0,,2.0,207.0,,84.0,78.0,1,5,4
3,1946-004,1946-004-01,1946-004-01-01,1946-004-01-01-01,11/25/1946,7,329 U.S. 40,67 S. Ct. 167,91 L. Ed. 29,1946 U.S. LEXIS 1696,...,4.0,,6.0,600.0,49 Stat. 801,87.0,87.0,1,5,3
4,1946-005,1946-005-01,1946-005-01-01,1946-005-01-01-01,11/25/1946,1,329 U.S. 64,67 S. Ct. 154,91 L. Ed. 44,1946 U.S. LEXIS 2997,...,7.0,,,,,78.0,87.0,1,6,3


In [5]:
og_data = pd.read_csv('SCDB_2018_01_justiceCentered_Citation.csv', encoding = 'ISO-8859-1')

In [6]:
og_data.head()

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,...,majVotes,minVotes,justice,justiceName,vote,opinion,direction,majority,firstAgreement,secondAgreement
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,86,HHBurton,2.0,1.0,1.0,1.0,,
1,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-02,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,84,RHJackson,1.0,1.0,2.0,2.0,,
2,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-03,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,81,WODouglas,1.0,1.0,2.0,2.0,,
3,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-04,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,80,FFrankfurter,4.0,2.0,2.0,2.0,,
4,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-05,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,79,SFReed,1.0,1.0,2.0,2.0,,


In [7]:
og_data.columns

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes', 'justice', 'justiceName', 'vote',
       'opinion', 'direction', 'majority', 'firstA

In [8]:
working_data = og_data.drop(columns = ['justice', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
                                        'usCite', 'sctCite', 'ledCite', 'lexisCite', 
                                        'docket', 'caseName', 'petitionerState', 'respondentState', 
                                        'adminActionState', 'caseOriginState',
                                        'caseSourceState', 'declarationUncon', 
                                        'caseDispositionUnusual', 'partyWinning', 'voteUnclear',
                                        'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
                                        'lawMinor', 'majOpinWriter', 'majOpinAssigner',
                                        'splitVote','firstAgreement', 'secondAgreement', 
                                        'dateArgument', 'dateRearg',
                                        'term', 'caseDisposition', 'decisionDirection',
                                        'majVotes', 'minVotes', 'majority', 'vote', 'opinion', 
                                       'precedentAlteration', 'issueArea'])

In [9]:
working_data.columns

Index(['caseId', 'decisionType', 'naturalCourt', 'chief', 'petitioner',
       'respondent', 'jurisdiction', 'adminAction', 'threeJudgeFdc',
       'caseOrigin', 'caseSource', 'lcDisagreement', 'certReason',
       'lcDisposition', 'lcDispositionDirection', 'issue', 'lawType',
       'lawSupp', 'justiceName', 'direction'],
      dtype='object')

In [10]:
working_data = working_data.dropna(subset=['direction'])
working_data = working_data.fillna(int(999))
d = {1: 0, 2: 1}
working_data['direction'] = working_data['direction'].map(d)

In [11]:
not_to_dummy = ['caseId', 'justiceName', 'direction']
wd_columns_to_dummy = list(working_data.columns)
for n in not_to_dummy:
    wd_columns_to_dummy.remove(n)
wd_columns_to_dummy

['decisionType',
 'naturalCourt',
 'chief',
 'petitioner',
 'respondent',
 'jurisdiction',
 'adminAction',
 'threeJudgeFdc',
 'caseOrigin',
 'caseSource',
 'lcDisagreement',
 'certReason',
 'lcDisposition',
 'lcDispositionDirection',
 'issue',
 'lawType',
 'lawSupp']

In [12]:
working_data = pd.get_dummies(working_data, columns = wd_columns_to_dummy)

In [13]:
working_data.head()

Unnamed: 0,caseId,justiceName,direction,decisionType_1,decisionType_2,decisionType_4,decisionType_6,decisionType_7,naturalCourt_1301,naturalCourt_1303,...,lawSupp_507.0,lawSupp_508.0,lawSupp_509.0,lawSupp_510.0,lawSupp_511.0,lawSupp_512.0,lawSupp_600.0,lawSupp_800.0,lawSupp_900.0,lawSupp_999.0
0,1946-001,HHBurton,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,1946-001,RHJackson,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,1946-001,WODouglas,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1946-001,FFrankfurter,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,1946-001,SFReed,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [14]:
working_data.justiceName.value_counts()

WJBrennan       5087
BRWhite         4701
WHRehnquist     4300
JPStevens       4019
WODouglas       3707
HABlackmun      3561
TMarshall       3537
PStewart        3429
HLBlack         3151
SDOConnor       2798
AScalia         2773
AMKennedy       2736
WEBurger        2692
LFPowell        2383
CThomas         2246
JHarlan2        2190
TCClark         2132
EWarren         2100
RBGinsburg      2032
SGBreyer        1916
FFrankfurter    1721
DHSouter        1690
HHBurton        1327
SFReed          1085
JGRoberts        981
SAAlito          942
RHJackson        804
FMVinson         762
SSotomayor       668
SMinton          658
CEWhittaker      641
EKagan           551
AFortas          525
AJGoldberg       440
WBRutledge       368
FMurphy          361
NMGorsuch         82
Name: justiceName, dtype: int64

In [15]:
working_data.isna().sum()

caseId               0
justiceName          0
direction            0
decisionType_1       0
decisionType_2       0
decisionType_4       0
decisionType_6       0
decisionType_7       0
naturalCourt_1301    0
naturalCourt_1303    0
naturalCourt_1401    0
naturalCourt_1402    0
naturalCourt_1403    0
naturalCourt_1404    0
naturalCourt_1405    0
naturalCourt_1406    0
naturalCourt_1407    0
naturalCourt_1408    0
naturalCourt_1409    0
naturalCourt_1410    0
naturalCourt_1411    0
naturalCourt_1501    0
naturalCourt_1502    0
naturalCourt_1503    0
naturalCourt_1504    0
naturalCourt_1505    0
naturalCourt_1506    0
naturalCourt_1507    0
naturalCourt_1601    0
naturalCourt_1602    0
                    ..
lawSupp_377.0        0
lawSupp_378.0        0
lawSupp_379.0        0
lawSupp_381.0        0
lawSupp_382.0        0
lawSupp_383.0        0
lawSupp_384.0        0
lawSupp_387.0        0
lawSupp_388.0        0
lawSupp_400.0        0
lawSupp_401.0        0
lawSupp_402.0        0
lawSupp_403

In [16]:
working_data['direction']

0        0
1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        0
10       1
11       0
12       0
13       0
14       1
15       0
16       1
17       0
18       0
19       1
20       0
21       0
22       0
23       1
24       1
25       1
26       1
27       0
29       1
30       1
        ..
79581    0
79582    0
79583    0
79584    0
79585    1
79586    1
79588    1
79589    1
79590    1
79591    1
79592    1
79593    1
79594    1
79595    1
79596    1
79597    1
79598    1
79599    1
79600    1
79601    1
79602    1
79603    0
79604    0
79605    0
79606    0
79607    1
79608    0
79609    0
79610    0
79611    0
Name: direction, Length: 75096, dtype: int64

## Modeling 

### Random Forest 

Just issue areas (for the sake of dimensionality, to see if there's any true value to keeping all those categories). 

In [17]:
#no_issue_data = working_data.drop(columns = ['issue'])
no_issue_data = working_data

In [18]:
Ginsburg_df = no_issue_data[no_issue_data['justiceName'] == 'RBGinsburg']

In [19]:
# if Ginsburg_df[Ginsburg_df['adminAction'].notnull()]:
#     Ginsburg_df['lcDisposition'] = 4
    
Ginsburg_df = Ginsburg_df.dropna(subset=['direction'])

In [20]:
case_ids = Ginsburg_df['caseId']
Ginsburg_df = Ginsburg_df.drop(columns = ['caseId', 'justiceName'])
# df = df[pd.notnull(df['EPS'])]

In [21]:
Ginsburg_target = Ginsburg_df['direction']
Ginsburg_df = Ginsburg_df.drop(columns = ['direction'])

In [22]:
data_train, data_test, target_train, target_test, case_train, case_test = train_test_split(Ginsburg_df, Ginsburg_target, case_ids)

In [23]:
case_test

62697    1995-014
72704    2008-010
75807    2012-018
69881    2004-012
71589    2006-035
68968    2002-074
75654    2012-001
73414    2009-006
62886    1995-035
65001    1997-085
66702    1999-082
71472    2006-022
72019    2007-008
62058    1994-035
61744    1993-096
68203    2001-074
65784    1998-068
75870    2012-025
72695    2008-009
76419    2013-007
73334    2008-080
68896    2002-065
63939    1996-062
63795    1996-046
72091    2007-016
66585    1999-069
74286    2010-010
72416    2007-052
77526    2014-060
73073    2008-051
           ...   
69692    2003-082
75636    2011-077
64083    1996-078
67735    2001-021
61797    1994-006
69058    2002-084
68770    2002-051
64650    1997-046
76473    2013-013
78762    2016-063
77112    2014-010
69496    2003-060
70600    2005-012
70743    2005-028
61546    1993-074
65073    1997-093
76140    2012-055
69076    2003-001
72884    2008-030
62238    1994-055
64524    1997-032
75519    2011-064
74161    2009-090
68428    2002-013
73783    2

In [24]:
forest = RandomForestClassifier(oob_score = True, n_estimators=200, max_depth = 30)
forest.fit(data_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [25]:
forest.oob_score_

0.6587926509186351

In [26]:
print("On Training, score was {}".format(forest.score(data_train, target_train)))
print("On Test, score was {}".format(forest.score(data_test, target_test)))

On Training, score was 0.9330708661417323
On Test, score was 0.6614173228346457


In [27]:
probs = forest.predict_proba(data_test)
roc_auc_score(target_test, probs[:,1])

0.6884991438767183

In [28]:
probs_series = pd.DataFrame(data = probs[:,1], index = case_test.index)

In [29]:
outcomes_series = pd.DataFrame(data = data_test['direction'].values, index = working_test_cases.values)

KeyError: 'direction'

In [None]:
probs_with_ids = pd.concat([probs_series, case_test], axis=1)

In [None]:
probs_with_ids.head()

In [None]:
probs_with_ids.rename(columns={0:'probability'}, inplace=True)

In [None]:
probs_with_ids.head()

In [None]:
justices = ['RBG', 'Alito', 'Souter']

In [None]:
master_probas = pd.DataFrame(columns = justices, index = case_test.values)

In [None]:
master_probas.head()

In [None]:
for ind, row in probs_with_ids.iterrows():
    case = row['caseId']
    probabil = row['probability']
    master_probas['RBG'].loc[case] = probabil 

In [None]:
master_probas.head()

In [None]:
master_probas = master_probas.fillna(2)

In [None]:
master_probas.head()

In [None]:
ps = dict.fromkeys(list(master_probas.index.values), 0)

In [None]:
ps

In [None]:
for ind, row in master_probas.iterrows():
    lista = []
    for c in master_probas.columns:
        if row[c] != 2:
            lista.append(row[c])
    lista = lista + [0.7, 0.5, 0.66, 0.44, 0.9, 0.83, 0.7, 0.71]
    ps[ind] = lista 

In [None]:
print(ps)

In [None]:
outcomes = {}
x = [5, 6, 7, 8, 9]

In [None]:
for k in ps.keys():
    if len(ps[k]) == 9:
        pb = PoiBin(ps[k])
        outcomes[k] = sum(pb.pmf(x))
    

In [None]:
outcomes

In [None]:
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(forest.predict(data_test), target_test)
print('Confusion Matrix:\n',cnf_matrix)

In [None]:
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(cnf_matrix,  cmap=plt.cm.Blues) #Create the basic matrix.

#Add title and Axis Labels
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

#Add appropriate Axis Scales
class_names = ['conservative', 'liberal'] #Get class labels to add to matrix
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

#Add Labels to Each Cell
thresh = cnf_matrix.max() / 2. #Used for text coloring below
#Here we iterate through the confusion matrix and append labels to our visualization.
for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, cnf_matrix[i, j],
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

#Add a Side Bar Legend Showing Colors
plt.colorbar()

In [None]:
forest = RandomForestClassifier(oob_score = True, n_estimators=1500, max_depth = 30)
forest.fit(data_train, target_train)

In [None]:
# Using GridSearchCV to do hyperparameter tuning, note that this takes a while to run and will tax your system
# a bit if you're running this on a laptop. 

# n_estimators = [300, 500, 700]
# max_depth = [10,12,14,16]

# param_grid_forest = dict(n_estimators=n_estimators, max_depth=max_depth)


# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# grid_search = GridSearchCV(forest, param_grid_forest, scoring="accuracy", n_jobs=-1, cv=kfold)
# grid_result = grid_search.fit(data_train, target_train)

# # Interpreting results 
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']

# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(fit_intercept = False, C = 1e12)
model_log = logreg.fit(data_train, target_train)

In [None]:
probs = model_log.predict_proba(data_test)
roc_auc_score(target_test, probs[:,1])

In [None]:
print("On Training, score was {}".format(model_log.score(data_train, target_train)))
print("On Test, score was {}".format(model_log.score(data_test, target_test)))

In [None]:
cnf_matrix = confusion_matrix(model_log.predict(data_test), target_test)
print('Confusion Matrix:\n',cnf_matrix)

In [None]:
plt.imshow(cnf_matrix,  cmap=plt.cm.Blues) #Create the basic matrix.

#Add title and Axis Labels
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

#Add appropriate Axis Scales
class_names = ['conservative', 'liberal'] #Get class labels to add to matrix
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

#Add Labels to Each Cell
thresh = cnf_matrix.max() / 2. #Used for text coloring below
#Here we iterate through the confusion matrix and append labels to our visualization.
for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, cnf_matrix[i, j],
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

#Add a Side Bar Legend Showing Colors
plt.colorbar()

In [None]:
# eval_set = [(data_test, target_test)]
# xgb_model = xgb.XGBClassifier()
# xgb_model.fit(data_train, target_train, early_stopping_rounds=30, eval_metric="error", eval_set=eval_set, verbose=True)

In [None]:
# print("On Training, score was {}".format(xgb_model.score(data_train, target_train)))
# print("On Test, score was {}".format(xgb_model.score(data_test, target_test)))

In [None]:
# Using GridSearchCV to do hyperparameter tuning, note that this takes a while to run and will tax your system
# a bit if you're running this on a laptop. 

# learning_rate = [0.005, 0.01, 0.02]
# n_estimators = [150]
# max_depth = [2,4,6]

# param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)


# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# grid_search = GridSearchCV(xgb_model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
# grid_result = grid_search.fit(data_train, target_train)

# # Interpreting results 
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']

# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
nb = GaussianNB()
nbmodel = nb.fit(data_train, target_train)
nbpreds = nbmodel.predict(data_test)
accuracy = accuracy_score(target_test, nbpreds)

In [None]:
accuracy

In [None]:
from sklearn import svm
sv = svm.SVC(kernel='rbf', probability = True, random_state = 124)
svmmodel = sv.fit(data_train, target_train)
svmpreds = svmmodel.predict(data_test)
accuracy_score(target_test, svmpreds)