In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings("ignore")

### Pre-processing

In [2]:
df_107 = pd.read_csv(("../Resources/107justice.csv"))
df_107.tail()

Unnamed: 0,justice,term,direction,precedentAlteration,issueArea,lawType,caseOriginState
1685,107,2008,2,0,1,2,0
1686,107,2008,2,0,8,3,0
1687,107,2008,2,0,2,4,0
1688,107,2008,1,0,10,6,0
1689,107,2008,2,0,2,3,0


In [3]:
target_107 = df_107["direction"]
target_names = ["conservative", "liberal"]

In [4]:
data_107 = df_107.drop(["direction", "justice"], axis=1)
data_107.head()

Unnamed: 0,term,precedentAlteration,issueArea,lawType,caseOriginState
0,1990,0,9,4,0
1,1990,0,10,0,0
2,1990,0,10,0,0
3,1990,0,1,2,22
4,1990,0,1,2,0


In [5]:
# dummy (One-Hot) encoding
term_df = pd.get_dummies(data_107["term"], prefix="term")
precAlt_df = pd.get_dummies(data_107["precedentAlteration"], prefix="precedentAlteration")
issueArea_df = pd.get_dummies(data_107["issueArea"], prefix="issueArea")
lawType_df = pd.get_dummies(data_107["lawType"], prefix="lawType")
caseOriginState_df = pd.get_dummies(data_107["caseOriginState"], prefix="caseOriginState")

In [6]:
# concatenate encoded features into mega-dataframe
oneHot_features = pd.concat([term_df, precAlt_df], axis=1)
oneHot_features = pd.concat([oneHot_features, issueArea_df], axis=1)
oneHot_features = pd.concat([oneHot_features, lawType_df], axis=1)
oneHot_features = pd.concat([oneHot_features, caseOriginState_df], axis=1)
oneHot_features.head()

Unnamed: 0,term_1990,term_1991,term_1992,term_1993,term_1994,term_1995,term_1996,term_1997,term_1998,term_1999,...,caseOriginState_48,caseOriginState_50,caseOriginState_51,caseOriginState_52,caseOriginState_53,caseOriginState_55,caseOriginState_56,caseOriginState_57,caseOriginState_58,caseOriginState_59
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# assign as X, y, and feature_names variables (to simplify replication)
X = oneHot_features
feature_names = oneHot_features.columns
y = target_107

### Evaluating different classifiers

In [8]:
clf_lr = LogisticRegression()
clf_rf = RandomForestClassifier()
clf_nb = GaussianNB()
clf_svm = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf_lr, clf_rf, clf_nb, clf_svm], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))


5-fold cross validation:

F1: 0.209 [Logistic Regression]
F1: 0.316 [Random Forest]
F1: 0.497 [Naive Bayes]
F1: 0.212 [Linear SVM]


In [9]:
# using all but logistic regression (lowest score) for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[1], clf_rf),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'hard')

In [10]:
v_clf_soft = VotingClassifier(estimators = [(labels[1], clf_rf),
                                            (labels[2], clf_nb),
                                            (labels[3], clf_svm)],
                                            voting = 'soft')

In [11]:
new_labels = ["Random Forest", "Naive Bayes", "Linear SVM", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf_rf, clf_nb, clf_svm, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.315 [Random Forest]
F1: 0.497 [Naive Bayes]
F1: 0.212 [Linear SVM]
F1: 0.322 [Voting_Classifier_Hard]
F1: 0.476 [Voting_Classifier_Soft]


### Evaluate Naive Bayes Model Further With Training

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [27]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.479
Test Acc: 0.396


In [28]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.37      0.93      0.53       125
     liberal       0.67      0.08      0.15       213

   micro avg       0.40      0.40      0.40       338
   macro avg       0.52      0.51      0.34       338
weighted avg       0.56      0.40      0.29       338



In [29]:
print(confusion_matrix(predictions, y_test))
predictions[:10], y_test[:10].ravel()

[[116 195]
 [  9  18]]


(array([1, 1, 1, 1, 1, 1, 1, 1, 2, 2], dtype=int64),
 array([1, 2, 2, 1, 2, 1, 2, 2, 1, 1], dtype=int64))

### Linear SVM Model Training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.612
Test Acc: 0.592


In [20]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.41      0.22      0.29       125
     liberal       0.64      0.81      0.71       213

   micro avg       0.59      0.59      0.59       338
   macro avg       0.52      0.52      0.50       338
weighted avg       0.55      0.59      0.56       338



In [24]:
print(confusion_matrix(predictions, y_test))
predictions[:15], y_test[:15].ravel()

[[ 28  41]
 [ 97 172]]


(array([2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2], dtype=int64),
 array([1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1], dtype=int64))

#### Linear SVM yielded better results than Naive Bayes

### Save and test loading of the model

In [22]:
import pickle

filename = '107_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [23]:
#Load Model from Disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.591715976331361
