In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


# Suppress every warning for the rest of the session so library notices do not
# clutter cell output.
# NOTE(review): this also hides deprecation and convergence warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Read the case records from 115justice.csv and preview the last few rows.
justice_csv_path = "../Resources/115justice.csv"
df_115 = pd.read_csv(justice_csv_path)
df_115.tail()

Unnamed: 0,justice,term,direction,precedentAlteration,issueArea,lawType,caseOriginState
77,115,2017,2,0,1,2,55
78,115,2017,1,0,9,9,0
79,115,2017,2,0,2,2,0
80,115,2017,2,0,2,6,0
81,115,2017,1,0,1,6,0


In [3]:
# Display names used later in the classification report.
# NOTE(review): presumably direction 1 = conservative and 2 = liberal — confirm
# against the dataset codebook.
target_names = ["conservative", "liberal"]
# Label to predict: the "direction" column (coded 1 / 2 in the preview above).
target_115 = df_115.loc[:, "direction"]

In [4]:
# Feature frame: every column except the target ("direction") and "justice".
data_115 = df_115.drop(columns=["direction", "justice"])
data_115.head()

Unnamed: 0,term,precedentAlteration,issueArea,lawType,caseOriginState
0,2016,0,9,6,0
1,2016,0,9,1,0
2,2016,0,9,3,0
3,2016,0,8,3,0
4,2016,0,8,6,0


In [5]:
# One-hot encode each categorical feature separately; every indicator column is
# prefixed with the name of the column it came from.
categorical_columns = ["term", "precedentAlteration", "issueArea", "lawType", "caseOriginState"]
term_df, precAlt_df, issueArea_df, lawType_df, caseOriginState_df = (
    pd.get_dummies(data_115[column_name], prefix=column_name)
    for column_name in categorical_columns
)

In [6]:
# Stitch the per-feature indicator frames together side by side, keeping the
# order term -> precedentAlteration -> issueArea -> lawType -> caseOriginState.
encoded_frames = [term_df, precAlt_df, issueArea_df, lawType_df, caseOriginState_df]
oneHot_features = pd.concat(encoded_frames, axis=1)
oneHot_features.head()

Unnamed: 0,term_2016,term_2017,precedentAlteration_0,precedentAlteration_1,issueArea_1,issueArea_2,issueArea_3,issueArea_4,issueArea_5,issueArea_6,...,caseOriginState_5,caseOriginState_6,caseOriginState_7,caseOriginState_10,caseOriginState_22,caseOriginState_26,caseOriginState_31,caseOriginState_49,caseOriginState_55,caseOriginState_56
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Feature matrix and label vector used by the models below.
X = oneHot_features
feature_names = oneHot_features.columns
y = target_115
# Show the column names as a plain NumPy array. Index.ravel() is avoided because
# its return type differs across pandas versions (ndarray in older releases, an
# Index in 2.x), which would change what this cell displays.
feature_names.to_numpy()

array(['term_2016', 'term_2017', 'precedentAlteration_0',
       'precedentAlteration_1', 'issueArea_1', 'issueArea_2',
       'issueArea_3', 'issueArea_4', 'issueArea_5', 'issueArea_6',
       'issueArea_7', 'issueArea_8', 'issueArea_9', 'issueArea_10',
       'issueArea_12', 'issueArea_14', 'lawType_1', 'lawType_2',
       'lawType_3', 'lawType_4', 'lawType_6', 'lawType_9',
       'caseOriginState_0', 'caseOriginState_5', 'caseOriginState_6',
       'caseOriginState_7', 'caseOriginState_10', 'caseOriginState_22',
       'caseOriginState_26', 'caseOriginState_31', 'caseOriginState_49',
       'caseOriginState_55', 'caseOriginState_56'], dtype=object)

### Evaluating different classifiers

In [8]:
# Candidate models. Each estimator that accepts a seed gets one so that a fresh
# "Restart & Run All" reproduces the same results.
clf_lr = LogisticRegression(random_state=1)
clf_rf = RandomForestClassifier(random_state=1, n_estimators=300)
clf_nb = GaussianNB()
# probability=True enables predict_proba (needed by the soft-voting ensemble);
# the probability calibration is randomized, so random_state is pinned here.
clf_svm = SVC(kernel='linear', probability=True, random_state=1)

# Single source of truth for the fold count, shared by the header and the scorer.
N_FOLDS = 5
print('%d-fold cross validation:\n' % N_FOLDS)

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

# NOTE(review): with labels coded 1 / 2, scoring='f1' reports F1 for class 1 as
# the positive class — confirm this is the class of interest.
for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=N_FOLDS,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

5-fold cross validation:

F1: 0.701 [Logistic Regression]
F1: 0.716 [Random Forest]
F1: 0.262 [Naive Bayes]
F1: 0.731 [Linear SVM]


In [9]:
# Hard-voting ensemble. Naive Bayes (labels[2]) is left out because its
# cross-validated score was low; the other three models are the members.
hard_vote_members = [
    (labels[0], clf_lr),
    (labels[1], clf_rf),
    (labels[3], clf_svm),
]
v_clf_hard = VotingClassifier(estimators=hard_vote_members, voting='hard')

In [10]:
# Soft-voting ensemble over logistic regression, random forest and the linear SVM
# (labels[2], Naive Bayes, is skipped).
soft_vote_members = [
    (labels[0], clf_lr),
    (labels[1], clf_rf),
    (labels[3], clf_svm),
]
v_clf_soft = VotingClassifier(estimators=soft_vote_members, voting='soft')

In [11]:
# Cross-validate the three base models again alongside both voting ensembles so
# all five F1 scores can be read side by side.
new_labels = [
    "Logistic Regression",
    "Random Forest",
    "Linear SVM",
    "Voting_Classifier_Hard",
    "Voting_Classifier_Soft",
]
candidate_models = [clf_lr, clf_rf, clf_svm, v_clf_hard, v_clf_soft]

for clf, label in zip(candidate_models, new_labels):
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1', n_jobs=-1)
    mean_f1 = scores.mean()
    print("F1: %0.3f [%s]" % (mean_f1, label))

F1: 0.701 [Logistic Regression]
F1: 0.716 [Random Forest]
F1: 0.731 [Linear SVM]
F1: 0.731 [Voting_Classifier_Hard]
F1: 0.734 [Voting_Classifier_Soft]


### SVM Model Training

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Fit a linear-kernel SVM on the training split; the fitted estimator's repr is
# the cell output.
model = SVC(kernel='linear')
model.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
# Accuracy of the fitted SVM on the training split and on the held-out split.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print('Train Acc: %.3f' % train_acc)
print('Test Acc: %.3f' % test_acc)

Train Acc: 0.769
Test Acc: 0.588


In [15]:
# Predict the held-out split and print per-class precision / recall / F1.
predictions = model.predict(X_test)
report_text = classification_report(y_test, predictions, target_names=target_names)
print(report_text)

              precision    recall  f1-score   support

conservative       0.56      1.00      0.72         9
     liberal       1.00      0.12      0.22         8

   micro avg       0.59      0.59      0.59        17
   macro avg       0.78      0.56      0.47        17
weighted avg       0.77      0.59      0.49        17



In [16]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes, columns
# the predicted ones. It is printed explicitly because only the last expression
# of a cell is displayed automatically.
print(confusion_matrix(y_test, predictions))
# Side-by-side look at the first predictions and the matching true labels.
# Series.to_numpy() replaces Series.ravel(), which is deprecated in recent pandas.
predictions[:20], y_test[:20].to_numpy()

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2], dtype=int64),
 array([2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2], dtype=int64))