In [76]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

plt.style.use('seaborn-poster')
%matplotlib inline

In [7]:
# Load the election history CSV (relative to the notebook's working directory)
# into a pandas DataFrame.
senator_data = pd.read_csv(filepath_or_buffer='election_history.csv')
# Show the first five rows as a quick sanity check of the columns.
senator_data.head(5)

Unnamed: 0,Year,State,Incumbent Running,Currently held by democrat,President Democrat,Concurrent presidential race,Incumbent president running,Senate controlled by Democrats,House controlled by democrats,Democrat Winner,Democrat Margin
0,2016,Alabama,1,0,1,1,0,0,0,0,-28.4
1,2016,Alaska,1,0,1,1,0,0,0,0,-32.0
2,2016,Arizona,1,0,1,1,0,0,0,0,-12.0
3,2016,Arkansas,1,0,1,1,0,0,0,0,-23.4
4,2016,Colorado,1,1,1,1,0,0,0,1,5.7


In [110]:
# Fit a logistic regression on every row of the data set and list the weight
# it assigns to each predictor.

# Positional selection: columns 2-8 are the predictors and column 9 is the
# label (presumably 'Democrat Winner' -- confirm against the CSV header).
y = senator_data.iloc[:, 9]
X = senator_data.iloc[:, 2:9]

model = LogisticRegression()
prelim_fit = model.fit(X, y)

# Analyze feature importances: one row per predictor, pairing its column name
# with the fitted coefficient.
coefficients = pd.DataFrame(
    {
        'Attribute': X.columns,
        'Coefficient': model.coef_.ravel(),
    },
    columns=['Attribute', 'Coefficient'],
)

print(coefficients)

                        Attribute  Coefficient
0               Incumbent Running     0.402044
1      Currently held by democrat     3.179607
2              President Democrat    -1.224944
3    Concurrent presidential race     0.577810
4     Incumbent president running     1.291565
5  Senate controlled by Democrats    -0.555386
6   House controlled by democrats    -0.124793


In [139]:
# Hold out the 2016 rows as the test set and train on every other year.
# TODO: maybe replace this with a more sophisticated way of picking the split.
in_test = senator_data.Year == 2016
in_train = senator_data.Year != 2016
training_rows = senator_data[in_train]
held_out_rows = senator_data[in_test]

# Same positional layout as the full-data fit: columns 2-8 are the predictors,
# column 9 is the label.
train_X, train_Y = training_rows.iloc[:, 2:9], training_rows.iloc[:, 9]
test_X, test_Y = held_out_rows.iloc[:, 2:9], held_out_rows.iloc[:, 9]

# `model` is the LogisticRegression instance created in an earlier cell; it is
# refitted here on the training rows only.
fit = model.fit(train_X, train_Y)
print('score:', model.score(test_X, test_Y))

# Analyze feature importances for the refitted model.
coefficients = pd.DataFrame(
    {
        'Attribute': train_X.columns,
        'Coefficient': model.coef_.ravel(),
    },
    columns=['Attribute', 'Coefficient'],
)

print(coefficients)

score: 0.939393939394
                        Attribute  Coefficient
0               Incumbent Running     0.473870
1      Currently held by democrat     2.770507
2              President Democrat    -0.762677
3    Concurrent presidential race     0.657045
4     Incumbent president running     1.077220
5  Senate controlled by Democrats    -0.762677
6   House controlled by democrats    -0.241778


In [147]:
# Try to predict margins: fit a support-vector regressor on the same
# predictors used for the classifier (`train_X` / `test_X` come from the
# train/test split cell) with column 10 as the target (presumably
# 'Democrat Margin' -- confirm against the CSV header).
clf = svm.SVR()

train_Y_margin = senator_data[senator_data.Year != 2016].iloc[:, 10]
test_Y_margin  = senator_data[senator_data.Year == 2016].iloc[:, 10]

clf.fit(train_X, train_Y_margin)

predictions = clf.predict(test_X)

# Pair each 2016 state with its predicted margin.
# The table is built positionally (`.values`) instead of with
# pd.concat(axis=1), which aligns on index labels and would scatter NaNs if
# the 2016 rows were not labelled 0..n-1. The column names are set on this
# frame directly; the earlier version renamed the columns of `coefficients`
# by mistake, leaving the prediction column labelled 0.
margin_predictions = pd.DataFrame(
    {
        'State': senator_data.State[senator_data.Year == 2016].values,
        'Predictions': predictions,
    },
    columns=['State', 'Predictions'],
)
print(margin_predictions)

# R^2 of the regressor on the held-out 2016 rows.
print('score', clf.score(test_X, test_Y_margin))
# NOTE: the message below hard-codes the R^2 seen in the recorded run (~0.10).
print("Given that the R^2 is 10%, either I didn't set this up right or it's not a good model")


             State         0
0          Alabama -3.294981
1           Alaska -3.294981
2          Arizona -3.294981
3         Arkansas -3.294981
4         Colorado  3.510637
5      Connecticut  3.510637
6          Florida -3.294981
7          Georgia -3.294981
8           Hawaii  3.510637
9            Idaho -3.294981
10        Illinois -3.294981
11         Indiana -4.187232
12            Iowa -3.294981
13          Kansas -3.294981
14        Kentucky -3.294981
15       Louisiana -4.187232
16        Maryland  1.863272
17        Missouri -3.294981
18          Nevada  1.863272
19   New Hampshire -3.294981
20        New York  3.510637
21  North Carolina -3.294981
22    North Dakota -3.294981
23            Ohio -3.294981
24        Oklahoma -3.294981
25          Oregon  3.510637
26    Pennsylvania -3.294981
27  South Carolina -3.294981
28    South Dakota -3.294981
29            Utah -3.294981
30         Vermont  3.510637
31      Washington  3.510637
32       Wisconsin -3.294981
score 0.102082