In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore")

In [5]:
scotus_df = pd.read_csv("../Resources/SCOTUS_OHEncoded.csv", index_col=0)
df_115 = scotus_df[scotus_df["justice"] == 115]
df_115.head()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1,direction,justice
13085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,115
13094,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,115
13103,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,115
13112,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,115
13121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,115


In [3]:
target_115 = df_115["direction"]
target_names = ["conservative", "liberal"]

In [4]:
data_115 = df_115.drop(["direction", "justice"], axis=1)
data_115.head()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_1,lawType_2,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1
13085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
13094,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
13103,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
13112,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
13121,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [6]:
# term_df = pd.get_dummies(data_115["term"], prefix="term")
# precAlt_df = pd.get_dummies(data_115["precedentAlteration"], prefix="precedentAlteration")
# issueArea_df = pd.get_dummies(data_115["issueArea"], prefix="issueArea")
# lawType_df = pd.get_dummies(data_115["lawType"], prefix="lawType")
# caseOriginState_df = pd.get_dummies(data_115["caseOriginState"], prefix="caseOriginState")

In [7]:
# oneHot_features = pd.concat([term_df, precAlt_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, issueArea_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, lawType_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, caseOriginState_df], axis=1)
# oneHot_features.head()

In [8]:
X = data_115
y = target_115
feature_names = data_115.columns

### Evaluating different classifiers

In [9]:
clf_lr = LogisticRegression()
clf_rf = RandomForestClassifier()
clf_nb = GaussianNB()
clf_svm = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

5-fold cross validation:

F1: 0.701 [Logistic Regression]
F1: 0.744 [Random Forest]
F1: 0.262 [Naive Bayes]
F1: 0.731 [Linear SVM]


In [10]:
# using all but random forest for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'hard')

In [11]:
v_clf_soft = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'soft')

In [12]:
new_labels = ["Logistic Regression", "Naive Bayes", "Linear SVM", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf_lr, clf_rf, clf_svm, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.701 [Logistic Regression]
F1: 0.712 [Naive Bayes]
F1: 0.731 [Linear SVM]
F1: 0.714 [Voting_Classifier_Hard]
F1: 0.262 [Voting_Classifier_Soft]


### SVM Model Training

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [15]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.769
Test Acc: 0.588


In [16]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.56      1.00      0.72         9
     liberal       1.00      0.12      0.22         8

   micro avg       0.59      0.59      0.59        17
   macro avg       0.78      0.56      0.47        17
weighted avg       0.77      0.59      0.49        17



In [17]:
print(confusion_matrix(predictions, y_test))
predictions[:15], y_test[:15].ravel()

[[9 7]
 [0 1]]


(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1], dtype=int64))

### Save model and confirm load

In [18]:
import pickle

filename = '115_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [19]:
#Load Model from Disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.5882352941176471
