In [1]:
# Use SVM with US Presidency dataset to determine if a linear separation is possible

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [3]:
us_data = pd.read_csv("datasets/USPresidency.csv")
us_data.head()

Unnamed: 0,Year,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Target
0,1864,0,0,0,0,1,0,0,1,1,0,0,0,1
1,1868,1,1,0,0,0,0,1,1,1,0,1,0,1
2,1872,1,1,0,0,1,0,1,0,0,0,1,0,1
3,1880,1,0,0,1,0,0,1,1,0,0,0,0,1
4,1888,0,0,0,0,1,0,0,0,0,0,0,0,1


In [4]:
classes = us_data.iloc[:, 13]
us_data = us_data.iloc[:, 1:13]
us_data.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12
0,0,0,0,0,1,0,0,1,1,0,0,0
1,1,1,0,0,0,0,1,1,1,0,1,0
2,1,1,0,0,1,0,1,0,0,0,1,0
3,1,0,0,1,0,0,1,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0


In [5]:
svm = SVC()
grid = GridSearchCV(estimator=svm, param_grid={"C": [0.1, 1, 10, 100], "kernel": ["rbf", "linear"], "gamma": [0.001, 0.01, 0.1, 1, 10, 100]})
grid.fit(us_data, classes)
print(grid.best_estimator_)

SVC(C=10, gamma=0.001, kernel='linear')


In [6]:
classifications = grid.predict(us_data)
correct = 0
print(classifications)
print(list(classes))
for classification, actual in zip(classifications, classes):
    if classification == actual:
        correct += 1
    
print(correct, "correct classifications out of", len(classes))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
31 correct classifications out of 31


In [7]:
# 100% classification rate, therefore a linear separation exists in a higher dimensional space

In [8]:
svm = SVC(C=10, kernel="linear", gamma=0.001)
svm.fit(us_data, classes)
# Retrieve coefficients of features to build a new feature
print(svm.coef_)

[[-0.40007606  0.26678712 -1.13328149 -1.33315633  0.13354116 -0.8661906
   0.60017033  1.26629517 -1.26562514 -0.53381404  0.         -0.86723862]]
