In [1]:
import pandas as pd

In [28]:
data = pd.read_csv('gub_candidates.csv')
data

Unnamed: 0,candidate,year,state,party,percent_popular_vote,winner,percent_positive,percent_total_tweets
0,John Carney,2020,Delaware,Democratic,59.5,1,0.721689,0.577605
1,Julianne Murray,2020,Delaware,Republican,38.6,0,0.512195,0.409091
2,Kathy DeMatteis,2020,Delaware,Independent,1.2,0,0.333333,0.006652
3,John Machurek,2020,Delaware,Libertarian,0.7,0,0.333333,0.006652
4,Eric Holcomb,2020,Indiana,Republican,56.5,1,0.341532,0.537996
...,...,...,...,...,...,...,...,...
497,Chris Christie,2009,New Jersey,Republican,48.5,1,0.418873,0.333711
498,Jon Corzine,2009,New Jersey,Democratic,44.9,0,0.554976,0.521296
499,Chris Daggett,2009,New Jersey,Independent,5.8,0,0.451562,0.144993
500,Bob McDonnell,2009,Virginia,Republican,58.6,1,0.699478,0.437910


In [29]:
#clean party column
def clean_party(x):
    party_dict = {'D':'Democratic', 'DFL':'Democratic', 'R':'Republican', 'L':'Libertarian', 'I':'Independent', 'ndependent':'Independent', 'G':'Green','C':'Constitution'}
    if x in ['Republican', 'Democratic', 'Libertarian', 'Independent', 'Green', 'Constitution']:
        return x
    elif x in party_dict.keys():
        return party_dict[x]
    else:
        return 'Other'

In [30]:
data['party'] = list(map(clean_party, data['party']))

In [31]:
data['party'].value_counts()

Democratic      159
Republican      156
Libertarian      80
Independent      45
Green            26
Other            22
Constitution     14
Name: party, dtype: int64

In [32]:
data['winner'] = data['winner'].astype(object)

## X-y split

In [33]:
y1 = data['winner']
y2 = data['percent_popular_vote']
X = data.drop(['winner', 'percent_popular_vote'], axis=1, inplace=True)

## normalize features

In [34]:
#split numericals and categoricals
X_num = data[['percent_positive', 'percent_total_tweets']]
X_cat = data[['year','state', 'party']]

In [53]:
#OneHotEncode the categoricals
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first')
encoded = encoder.fit_transform(X_cat).toarray()
Xcat_encoded = pd.DataFrame(encoded)
Xcat_encoded.columns = encoder.get_feature_names()

In [54]:
X_normalized = pd.concat([X_num, Xcat_encoded], axis=1)

In [55]:
X_normalized

Unnamed: 0,percent_positive,percent_total_tweets,x0_2010,x0_2011,x0_2012,x0_2013,x0_2014,x0_2015,x0_2016,x0_2018,...,x1_West Virginia(special),x1_Wisconsin,x1_Wisconsin(recall),x1_Wyoming,x2_Democratic,x2_Green,x2_Independent,x2_Libertarian,x2_Other,x2_Republican
0,0.721689,0.577605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.512195,0.409091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.333333,0.006652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.333333,0.006652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.341532,0.537996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,0.418873,0.333711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
498,0.554976,0.521296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
499,0.451562,0.144993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
500,0.699478,0.437910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## making model for predicting percent popular vote

The train-test split needs to be done by election so that it is possible to compare %popular vote to predict the winner. Therefore, I am using the 2018 elections as the test set as it is 111 candidates, or 22% of the total data. 

In [38]:
data[data['year']==2018]

Unnamed: 0,candidate,year,state,party,percent_positive,percent_total_tweets
45,Kay Ivey,2018,Alabama,Republican,0.564838,0.667777
46,Walt Maddox,2018,Alabama,Democratic,0.543860,0.332223
47,Mike Dunleavy,2018,Alaska,Republican,0.626697,0.365591
48,Mark Begich,2018,Alaska,Democratic,0.590250,0.627792
49,William Toien,2018,Alaska,Libertarian,0.375000,0.006617
...,...,...,...,...,...,...
151,Michael White,2018,Wisconsin,Green,0.215317,0.229126
152,Mark Gordon,2018,Wyoming,Republican,0.619835,0.441606
153,Mary Throne,2018,Wyoming,Democratic,0.711111,0.492701
154,Rex Rammell,2018,Wyoming,Constitution,0.533333,0.054745


In [39]:
## train-test split
X_test = X_normalized[45:156]
X_train = pd.concat([X_normalized[:45], X_normalized[156:]], axis=0)
y_test= y2[45:156]
y_train = pd.concat([y2[:45], y2[156:]], axis=0)

In [40]:
len(y_train)

391

In [None]:
## make model pipelines

In [41]:
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [42]:
model1 = LinearRegression()
model2 = KNeighborsRegressor()
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor()

model_pipeline = [model1, model2, model3, model4]
model_names = ['Linear Regression', 'KNN', 'Decision Tree', 'Random Forest']
scores = {}
scores_rmse={}
scores_mae={}
val_scores= {}
predictions = {}

i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    model.fit(X_train, y_train)
    scores[model_names[i]] = mean_score
    prediction=model.predict(X_test)
    predictions[model_names[i]] = prediction
    scores_rmse[model_names[i]] = math.sqrt(mean_squared_error(y_test, prediction))
    scores_mae[model_names[i]] = mean_absolute_error(y_test, prediction)
    val_scores[model_names[i]] = model.score(X_test,y_test)
    
    i = i+1
print('scores: ',scores)
print('cross validation scores: ',val_scores)
print('rmse scores: ',scores_rmse)
print('mae scores: ' ,scores_mae)

scores:  {'Linear Regression': -3.548222332676749e+24, 'KNN': 0.7724961806105201, 'Decision Tree': 0.8052006662700922, 'Random Forest': 0.8521747894733283}
cross validation scores:  {'Linear Regression': -9.22412710865946e+23, 'KNN': 0.8773956886421276, 'Decision Tree': 0.8351825125266307, 'Random Forest': 0.9230110281404885}
rmse scores:  {'Linear Regression': 22415442102835.586, 'KNN': 8.17217278990772, 'Decision Tree': 9.475154803936375, 'Random Forest': 6.475882041283074}
mae scores:  {'Linear Regression': 22415442102835.59, 'KNN': 6.275765765765767, 'Decision Tree': 6.007207207207205, 'Random Forest': 4.848445045045048}


In [None]:
#Random Forest seems like the best model...

In [43]:
test_data = pd.DataFrame(y_test)

In [44]:
test_data['RF_pred'] = predictions['Random Forest']
test_data['DT_pred'] = predictions['Decision Tree']
test_data['LR_pred'] = predictions['Linear Regression']
test_data['KNN_pred'] = predictions['KNN']
test_data['models_mean'] = test_data[['RF_pred','DT_pred','KNN_pred']].mean(axis=1)


In [45]:
test_data['winner'] = [y1[i] for i in test_data.index]
test_data['candidate'] = [data['candidate'][i] for i in test_data.index]
test_data['state'] = [data['state'][i] for i in test_data.index]
test_data = test_data[['candidate', 'state', 'percent_popular_vote', 'RF_pred', 'DT_pred', 'KNN_pred','models_mean', 'winner' ]]

In [46]:
test_data.head(60)

Unnamed: 0,candidate,state,percent_popular_vote,RF_pred,DT_pred,KNN_pred,models_mean,winner
45,Kay Ivey,Alabama,59.6,52.5651,57.2,57.44,55.735033,1
46,Walt Maddox,Alabama,40.4,43.178,39.9,40.4,41.159333,0
47,Mike Dunleavy,Alaska,51.5,44.898,52.8,42.92,46.872667,1
48,Mark Begich,Alaska,44.5,44.094,44.9,33.7,40.898,0
49,William Toien,Alaska,1.9,2.173,1.1,2.32,1.864333,0
50,Doug Ducey,Arizona,56.0,51.4837,58.6,54.54,54.874567,1
51,David Garcia,Arizona,41.8,43.458,44.9,42.46,43.606,0
52,Angel Torres,Arizona,2.1,2.377,2.3,2.74,2.472333,0
53,Asa Hutchinson,Arkansas,65.3,44.3437,33.6,50.8,42.914567,1
54,Jared Henderson,Arkansas,31.8,33.435,32.2,46.84,37.491667,0


In [47]:
#feature importance
importance = model3.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.03585
Feature: 1, Score: 0.54113
Feature: 2, Score: 0.00795
Feature: 3, Score: 0.00979
Feature: 4, Score: 0.00324
Feature: 5, Score: 0.00003
Feature: 6, Score: 0.00203
Feature: 7, Score: 0.00029
Feature: 8, Score: 0.00263
Feature: 9, Score: 0.00000
Feature: 10, Score: 0.00014
Feature: 11, Score: 0.00196
Feature: 12, Score: 0.00029
Feature: 13, Score: 0.00000
Feature: 14, Score: 0.00076
Feature: 15, Score: 0.00043
Feature: 16, Score: 0.00430
Feature: 17, Score: 0.00035
Feature: 18, Score: 0.00091
Feature: 19, Score: 0.00029
Feature: 20, Score: 0.00000
Feature: 21, Score: 0.00179
Feature: 22, Score: 0.00001
Feature: 23, Score: 0.00003
Feature: 24, Score: 0.00001
Feature: 25, Score: 0.00014
Feature: 26, Score: 0.00024
Feature: 27, Score: 0.00015
Feature: 28, Score: 0.00000
Feature: 29, Score: 0.00104
Feature: 30, Score: 0.00076
Feature: 31, Score: 0.00000
Feature: 32, Score: 0.00013
Feature: 33, Score: 0.00068
Feature: 34, Score: 0.00075
Feature: 35, Score: 0.00001
Fe

The most important features are % total tweets (Score: 0.54113), Republican (Score: 0.14864), and % positive tweet Score: (0.03585).

The Linear Regression model was a horrible fit, with negative scores. The Random Forest model 
had the highest score of .92 and lowest RMSE of 6.46. But it only predicted the winner of elections correctly 23 of 36 times.

The Decision Tree model predicted the winner correctly 29 of 36 times.

The KNN model predicted 24 winners correctly. 