In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore")

In [2]:
scotus_df = pd.read_csv("../Resources/SCOTUS_OHEncoded.csv", index_col=0)
df_113 = scotus_df[scotus_df["justice"] == 113]
df_113.tail()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1,direction,justice
13806,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,113
13815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,113
13823,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,113
13832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,2,113
13841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,113


In [3]:
target_113 = df_113["direction"]
target_names = ["conservative", "liberal"]

In [4]:
data_113 = df_113.drop(["direction", "justice"], axis=1)
data_113.head()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_1,lawType_2,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1
8475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8482,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
8489,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
8495,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
8502,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [7]:
# term_df = pd.get_dummies(data_113["term"], prefix="term")
# precAlt_df = pd.get_dummies(data_113["precedentAlteration"], prefix="precedentAlteration")
# issueArea_df = pd.get_dummies(data_113["issueArea"], prefix="issueArea")
# lawType_df = pd.get_dummies(data_113["lawType"], prefix="lawType")
# caseOriginState_df = pd.get_dummies(data_113["caseOriginState"], prefix="caseOriginState")

In [5]:
# oneHot_features = pd.concat([term_df, precAlt_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, issueArea_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, lawType_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, caseOriginState_df], axis=1)
# oneHot_features.head()

In [6]:
X = data_113
y = target_113
feature_names = data_113.columns

### Evaluating different classifiers

In [7]:
clf_lr = LogisticRegression()
clf_rf = RandomForestClassifier()
clf_nb = GaussianNB()
clf_svm = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

5-fold cross validation:

F1: 0.253 [Logistic Regression]
F1: 0.313 [Random Forest]
F1: 0.525 [Naive Bayes]
F1: 0.184 [Linear SVM]


In [8]:
# using all but linear svm for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[1], clf_rf),
                                            (labels[2], clf_nb)],
                                            voting = 'hard')

In [9]:
v_clf_soft = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[1], clf_rf),
                                            (labels[2], clf_nb)],
                                            voting = 'soft')

In [10]:
new_labels = ["Logistic Regression", "Random Forest", "Naive Bayes", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf_lr, clf_rf, clf_nb, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.253 [Logistic Regression]
F1: 0.261 [Random Forest]
F1: 0.525 [Naive Bayes]
F1: 0.306 [Voting_Classifier_Hard]
F1: 0.470 [Voting_Classifier_Soft]


### SVM Model Training

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.661
Test Acc: 0.560


In [14]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.29      0.20      0.23        46
     liberal       0.64      0.75      0.69        88

   micro avg       0.56      0.56      0.56       134
   macro avg       0.47      0.47      0.46       134
weighted avg       0.52      0.56      0.53       134



In [15]:
print(confusion_matrix(predictions, y_test))
predictions[:15], y_test[:15].ravel()

[[ 9 22]
 [37 66]]


(array([2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1], dtype=int64),
 array([1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2], dtype=int64))

### Save model and confirm load

In [16]:
import pickle

filename = '113_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [17]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.5597014925373134
