In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore")

### Pre-processing

In [3]:
scotus_df = pd.read_csv("../Resources/SCOTUS_OHEncoded.csv", index_col=0)
df_108 = scotus_df[scotus_df["justice"] == 108]
df_108.tail()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1,direction,justice
13793,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,108
13802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,108
13811,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,108
13828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,2,108
13837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,108


In [4]:
target_108 = df_108["direction"]
target_names = ["conservative", "liberal"]

In [5]:
data_108 = df_108.drop(["direction", "justice"], axis=1)
data_108.head()

Unnamed: 0,term_1953,term_1960,term_1962,term_1965,term_1968,term_1972,term_1973,term_1977,term_1984,term_1987,...,lawType_1,lawType_2,lawType_3,lawType_4,lawType_5,lawType_6,lawType_8,lawType_9,precedentAlteration_0,precedentAlteration_1
597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
604,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
609,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
617,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
620,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [6]:
# # One-hot encoding
# term_df = pd.get_dummies(data_108["term"], prefix='term')
# precAlt_df = pd.get_dummies(data_108["precedentAlteration"], prefix='precedentAlteration')
# issueArea_df = pd.get_dummies(data_108["issueArea"], prefix='issueArea')
# lawType_df = pd.get_dummies(data_108["lawType"], prefix='lawType')
# caseOriginState_df = pd.get_dummies(data_108["caseOriginState"], prefix='caseOriginState')

# precAlt_df.head()

In [7]:
# # concatenate encoded features into mega-dataframe
# oneHot_features = pd.concat([term_df, precAlt_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, issueArea_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, lawType_df], axis=1)
# oneHot_features = pd.concat([oneHot_features, caseOriginState_df], axis=1)
# oneHot_features.head()

In [8]:
# assign as X, y, and feature_names variables (to simplify replication)
X = data_108
y = target_108
feature_names = data_108.columns

### Evaluating different classifiers

In [9]:
clf_lr = LogisticRegression()
clf_rf = RandomForestClassifier()
clf_nb = GaussianNB()
clf_svm = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))


5-fold cross validation:

F1: 0.729 [Logistic Regression]
F1: 0.733 [Random Forest]
F1: 0.133 [Naive Bayes]
F1: 0.736 [Linear SVM]


In [10]:
# using all but naive bayes (low score) for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[1], clf_rf),
                                            (labels[3], clf_svm)],
                                            voting = 'hard')

In [11]:
v_clf_soft = VotingClassifier(estimators = [(labels[0], clf_lr),
                                            (labels[1], clf_rf),
                                            (labels[3], clf_svm)],
                                            voting = 'soft')

In [12]:
new_labels = ["Logistic Regression", "Random Forest", "Linear SVM", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf_lr, clf_rf, clf_svm, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.729 [Logistic Regression]
F1: 0.712 [Random Forest]
F1: 0.736 [Linear SVM]
F1: 0.736 [Voting_Classifier_Hard]
F1: 0.739 [Voting_Classifier_Soft]


### Training the SVM model
While the Soft Voting Classifier had the best F1 score for justice 108, Linear SVM was 2nd highest by a very small margin and also scored well on other justice's models, thus, it was chosen as the standard.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [15]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.691
Test Acc: 0.664


In [16]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.68      0.94      0.79       378
     liberal       0.44      0.09      0.15       184

   micro avg       0.66      0.66      0.66       562
   macro avg       0.56      0.52      0.47       562
weighted avg       0.60      0.66      0.58       562



In [17]:
confusion_matrix(predictions, y_test)
predictions[:15], y_test[:15].ravel()

(array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))

### Save and confirm loading of the model

In [18]:
#Save Model
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
import pickle

filename = '108_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [19]:
#Load Model from Disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.6637010676156584
