In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


warnings.filterwarnings("ignore")

Using TensorFlow backend.


### Pre-processing

In [2]:
df_109 = pd.read_csv(("../Resources/109justice.csv"))
df_109.tail()

Unnamed: 0,justice,term,direction,precedentAlteration,issueArea,lawType,caseOriginState
2027,109,2017,2,0,1,2,55
2028,109,2017,1,0,9,9,0
2029,109,2017,2,0,2,2,0
2030,109,2017,2,0,2,6,0
2031,109,2017,1,0,1,6,0


In [3]:
target_109 = df_109["direction"]
target_names = ["conservative", "liberal"]

In [4]:
data_109 = df_109.drop(["direction", "justice"], axis=1)
# feature_names = data_109.columns
data_109.head()

Unnamed: 0,term,precedentAlteration,issueArea,lawType,caseOriginState
0,1993,0,2,4,0
1,1993,0,2,4,0
2,1993,0,2,3,0
3,1993,0,2,3,0
4,1993,0,9,4,0


In [20]:
# X_scaler = StandardScaler().fit(data_109)
# X_scaled = X_scaler.transform(data_109)

# X = X_scaled
# y = target_109

In [5]:
term_df = pd.get_dummies(data_109["term"], prefix="term")
precAlt_df = pd.get_dummies(data_109["precedentAlteration"], prefix="precedentAlteration")
issueArea_df = pd.get_dummies(data_109["issueArea"], prefix="issueArea")
lawType_df = pd.get_dummies(data_109["lawType"], prefix="lawType")
caseOriginState_df = pd.get_dummies(data_109["caseOriginState"], prefix="caseOriginState")

In [6]:
oneHot_features = pd.concat([term_df, precAlt_df], axis=1)
oneHot_features = pd.concat([oneHot_features, issueArea_df], axis=1)
oneHot_features = pd.concat([oneHot_features, lawType_df], axis=1)
oneHot_features = pd.concat([oneHot_features, caseOriginState_df], axis=1)
oneHot_features.head()

Unnamed: 0,term_1993,term_1994,term_1995,term_1996,term_1997,term_1998,term_1999,term_2000,term_2001,term_2002,...,caseOriginState_49,caseOriginState_50,caseOriginState_51,caseOriginState_52,caseOriginState_53,caseOriginState_55,caseOriginState_56,caseOriginState_57,caseOriginState_58,caseOriginState_59
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X = oneHot_features
feature_names = oneHot_features.columns
y = target_109
feature_names.ravel()

array(['term_1993', 'term_1994', 'term_1995', 'term_1996', 'term_1997',
       'term_1998', 'term_1999', 'term_2000', 'term_2001', 'term_2002',
       'term_2003', 'term_2004', 'term_2005', 'term_2006', 'term_2007',
       'term_2008', 'term_2009', 'term_2010', 'term_2011', 'term_2012',
       'term_2013', 'term_2014', 'term_2015', 'term_2016', 'term_2017',
       'precedentAlteration_0', 'precedentAlteration_1', 'issueArea_1',
       'issueArea_2', 'issueArea_3', 'issueArea_4', 'issueArea_5',
       'issueArea_6', 'issueArea_7', 'issueArea_8', 'issueArea_9',
       'issueArea_10', 'issueArea_12', 'issueArea_13', 'issueArea_14',
       'lawType_0', 'lawType_1', 'lawType_2', 'lawType_3', 'lawType_4',
       'lawType_5', 'lawType_6', 'lawType_8', 'lawType_9',
       'caseOriginState_0', 'caseOriginState_1', 'caseOriginState_2',
       'caseOriginState_4', 'caseOriginState_5', 'caseOriginState_6',
       'caseOriginState_7', 'caseOriginState_8', 'caseOriginState_10',
       'caseOriginSta

### Evaluating different classifiers

In [8]:
clf_lr = LogisticRegression(random_state=1)
clf_rf = RandomForestClassifier(random_state=1, n_estimators=300)
clf_nb = GaussianNB()
clf_svm = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

5-fold cross validation:

F1: 0.233 [Logistic Regression]
F1: 0.299 [Random Forest]
F1: 0.509 [Naive Bayes]
F1: 0.240 [Linear SVM]


In [9]:
# using all but logistic regression (since score was low) for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[1], clf_rf),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'hard')

In [10]:
v_clf_soft = VotingClassifier(estimators = [(labels[1], clf_rf),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'soft')

In [11]:
new_labels = ["Random Forest", "Naive Bayes", "Linear SVM", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf_rf, clf_nb, clf_svm, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.299 [Random Forest]
F1: 0.509 [Naive Bayes]
F1: 0.240 [Linear SVM]
F1: 0.320 [Voting_Classifier_Hard]
F1: 0.504 [Voting_Classifier_Soft]


### Training the Naive Bayes model

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.451
Test Acc: 0.447


In [15]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.42      0.95      0.58       205
     liberal       0.75      0.11      0.19       303

   micro avg       0.45      0.45      0.45       508
   macro avg       0.58      0.53      0.39       508
weighted avg       0.62      0.45      0.35       508



In [16]:
confusion_matrix(predictions, y_test)
predictions[:10], y_test[:10].ravel()

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([2, 1, 1, 2, 2, 2, 1, 2, 2, 2], dtype=int64))

### SVM Model Training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.623
Test Acc: 0.619


In [20]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.56      0.20      0.29       162
     liberal       0.63      0.90      0.74       245

   micro avg       0.62      0.62      0.62       407
   macro avg       0.59      0.55      0.52       407
weighted avg       0.60      0.62      0.56       407



In [21]:
confusion_matrix(predictions, y_test)
predictions[:20], y_test[:20].ravel()

(array([1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1],
       dtype=int64),
 array([2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1],
       dtype=int64))