In [1]:
# import dependencies
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

warnings.filterwarnings("ignore")

### Pre-processing

In [2]:
df_106 = pd.read_csv(("../Resources/106justice.csv"))
df_106.tail()

Unnamed: 0,justice,term,direction,precedentAlteration,issueArea,lawType,caseOriginState
2731,106,2017,2,0,1,2,55
2732,106,2017,1,0,9,9,0
2733,106,2017,2,0,2,2,0
2734,106,2017,2,0,2,6,0
2735,106,2017,1,0,1,6,0


In [3]:
# df_106["direction"] = np.where(df_106["direction"] == 1, 0, 1)
# df_106.tail(5)
# # df_106.count()

In [4]:
target_106 = df_106["direction"]
target_names = ["conservative", "liberal"]

In [5]:
data_106 = df_106.drop(["direction", "justice"], axis=1)
feature_names = data_106.columns
data_106.head()

Unnamed: 0,term,precedentAlteration,issueArea,lawType,caseOriginState
0,1987,0,9,4,0
1,1987,0,10,1,5
2,1987,0,9,3,0
3,1987,0,7,6,0
4,1987,0,4,2,42


In [6]:
X_scaler = StandardScaler().fit(data_106)
X_scaled = X_scaler.transform(data_106)

X = X_scaled
y = target_106

### Evaluating different classifiers

In [7]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1, n_estimators=300)
clf3 = GaussianNB()
clf4 = SVC(kernel='linear', probability=True)

print('5-fold cross validation:\n')

labels = ["Logistic Regression", "Random Forest", "Naive Bayes", "Linear SVM"]

for clf, label in zip([clf1, clf2, clf3, clf4], labels):
    scores = cross_val_score(clf, X, y, cv=5, 
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))


5-fold cross validation:

F1: 0.695 [Logistic Regression]
F1: 0.480 [Random Forest]
F1: 0.634 [Naive Bayes]
F1: 0.735 [Linear SVM]


In [8]:
# using all but random forest (since score was low) for voting classifier estimators
v_clf_hard = VotingClassifier(estimators = [(labels[0], clf1),
                                            (labels[2], clf3),
                                            (labels[3], clf4)],
                                            voting = 'hard')

In [9]:
v_clf_soft = VotingClassifier(estimators = [(labels[0], clf1),
                                            (labels[2], clf3),
                                            (labels[3], clf4)],
                                            voting = 'soft')

In [10]:
new_labels = ["Logistic Regression", "Naive Bayes", "Linear SVM", 
              "Voting_Classifier_Hard", "Voting_Classifier_Soft"]

for (clf, label) in zip([clf1, clf3, clf4, v_clf_hard, v_clf_soft], new_labels):
    scores = cross_val_score(clf, X, y,cv=5,
                             scoring='f1', n_jobs=-1)
    print("F1: %0.3f [%s]" % (scores.mean(), label))

F1: 0.695 [Logistic Regression]
F1: 0.634 [Naive Bayes]
F1: 0.735 [Linear SVM]
F1: 0.700 [Voting_Classifier_Hard]
F1: 0.685 [Voting_Classifier_Soft]


### Training the SVM model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train, y_train))
print('Test Acc: %.3f' % model.score(X_test, y_test))

Train Acc: 0.579
Test Acc: 0.585


In [14]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

conservative       0.58      1.00      0.74       400
     liberal       0.00      0.00      0.00       284

   micro avg       0.58      0.58      0.58       684
   macro avg       0.29      0.50      0.37       684
weighted avg       0.34      0.58      0.43       684



In [15]:
confusion_matrix(predictions, y_test)
predictions[:10], y_test[:10].ravel()

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([1, 1, 1, 2, 2, 2, 2, 1, 1, 1], dtype=int64))

### Previous code for reference

In [16]:
# RANDOM FOREST CLASSIFIER

# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=300)
# rf = rf.fit(data_106, target_106)
# rf.score(data_106, target_106)

In [17]:
# importances = rf.feature_importances_
# importances

In [18]:
# sorted(zip(rf.feature_importances_, feature_names), reverse=True)