In [51]:
import pandas as pd
import numpy as np

In [89]:
# Load the training table and keep only the modelling columns: the feature
# columns used by the preprocessing step plus the `dem_margin` target.
MODEL_COLUMNS = [
    'year', 'state', 'previous_winner_age', 'incumbent', 'incumbency_count',
    'avg_dem_margin_4', 'avg_total_votes_4', 'party_flips_4', 'dem_margin',
]
# .copy() makes the column subset an independent frame, so later column
# assignments on `elections` do not write into a slice of the raw frame
# (the situation pandas reports as SettingWithCopyWarning).
elections = pd.read_csv("../DATA/training.csv")[MODEL_COLUMNS].copy()
elections

Unnamed: 0,year,state,previous_winner_age,incumbent,incumbency_count,avg_dem_margin_4,avg_total_votes_4,party_flips_4,dem_margin
0,1978,AL,51.0,Republican,1,-0.152094,156357.00,0,-0.080017
1,2010,AL,56.0,Democrat,0,-0.062914,180365.00,1,-0.021875
2,2012,AL,34.0,Republican,1,-0.030359,195903.25,2,-0.273189
3,2014,AL,36.0,Republican,2,-0.078652,240007.75,2,-0.348109
4,2016,AL,38.0,Republican,3,-0.159234,239233.50,1,-0.092305
...,...,...,...,...,...,...,...,...,...
5394,2012,WY,56.0,Republican,2,-0.182703,208296.75,0,-0.486013
5395,2014,WY,58.0,Republican,3,-0.269611,206279.00,0,-0.498770
5396,2016,WY,60.0,,0,-0.392941,197574.25,0,-0.348426
5397,2018,WY,50.0,Republican,1,-0.454327,195984.25,0,-0.362279


In [90]:
# Re-express `year` as an offset from 1976 (e.g. 1978 -> 2).
BASE_YEAR = 1976
# Guard: shift only while the column still holds calendar years, so running
# this cell a second time does not subtract the offset again.
if elections['year'].max() > BASE_YEAR:
    # assign() builds a new frame rather than writing into a column subset.
    elections = elections.assign(year=elections['year'] - BASE_YEAR)
elections

Unnamed: 0,year,state,previous_winner_age,incumbent,incumbency_count,avg_dem_margin_4,avg_total_votes_4,party_flips_4,dem_margin
0,2,AL,51.0,Republican,1,-0.152094,156357.00,0,-0.080017
1,34,AL,56.0,Democrat,0,-0.062914,180365.00,1,-0.021875
2,36,AL,34.0,Republican,1,-0.030359,195903.25,2,-0.273189
3,38,AL,36.0,Republican,2,-0.078652,240007.75,2,-0.348109
4,40,AL,38.0,Republican,3,-0.159234,239233.50,1,-0.092305
...,...,...,...,...,...,...,...,...,...
5394,36,WY,56.0,Republican,2,-0.182703,208296.75,0,-0.486013
5395,38,WY,58.0,Republican,3,-0.269611,206279.00,0,-0.498770
5396,40,WY,60.0,,0,-0.392941,197574.25,0,-0.348426
5397,42,WY,50.0,Republican,1,-0.454327,195984.25,0,-0.362279


In [121]:
# Column dtypes of the current `elections` frame.
# NOTE(review): the recorded output has no `dem_margin` row, which suggests this
# cell (In [121]) was executed after the target column had been dropped — confirm.
elections.dtypes

year                     int64
state                   object
previous_winner_age    float64
incumbent               object
incumbency_count         int64
avg_dem_margin_4       float64
avg_total_votes_4      float64
party_flips_4            int64
dtype: object

In [91]:
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix # optional
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# --- Separate the hold-out year and the regression target --------------------
HOLDOUT_YEAR = 46  # Separate out 2022 (years are stored as an offset from 1976)
TARGET = 'dem_margin'

is_holdout = elections['year'] == HOLDOUT_YEAR
is_earlier = elections['year'] < HOLDOUT_YEAR
holdout = elections[is_holdout]
elections = elections[is_earlier]

# Pull the target out of both frames, then drop it from the feature tables.
y, holdout_y = elections[TARGET], holdout[TARGET]
elections = elections.drop(columns=[TARGET])
holdout = holdout.drop(columns=[TARGET])

# --- Feature preprocessing ---------------------------------------------------
NUMERIC_FEATURES = [
    'year', 'previous_winner_age', 'incumbency_count',
    'avg_dem_margin_4', 'avg_total_votes_4', 'party_flips_4',
]
CATEGORICAL_FEATURES = ['state', 'incumbent']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored at transform time.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, NUMERIC_FEATURES),
    ('cat', categorical_transformer, CATEGORICAL_FEATURES),
])
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the earlier years only, then apply the same fitted transform to both sets.
X = pipeline.fit(elections).transform(elections)
holdout_X = pipeline.transform(holdout)
for feature_matrix in (X, holdout_X):
    print(feature_matrix.shape)

(5206, 58)
(193, 58)


In [93]:
# Set aside 10% of the rows as a test split. The shuffle is seeded so the
# split — and every score computed from it — is the same after a kernel
# restart (67 is the seed this notebook already uses for the tuned forest).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=67
)

In [94]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline comparison: 5-fold cross-validated RMSE for a linear model and a
# default random forest on the training split.
# cross_val_score works on clones of the estimator, so the explicit .fit()
# calls below do not influence the CV scores; they only leave fitted models
# available in the namespace.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_scores = cross_val_score(linear_model, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=5)
# The scorer returns negated MSE, so flip the sign before the square root.
linear_rmse_scores = np.sqrt(-linear_scores)

# Seed the forest: without random_state its bootstrap/feature sampling differs
# on every run, so the printed scores could not be reproduced.
forest_model = RandomForestRegressor(random_state=67)
forest_model.fit(X_train, y_train)
forest_scores = cross_val_score(forest_model, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)

print("Linear model scores:")
print(linear_rmse_scores)
print('\n')
print("Random forest model scores:")
print(forest_rmse_scores)

Linear model scores:
[0.17006613 0.16941134 0.16374835 0.17795529 0.17044318]


Random forest model scores:
[0.14960955 0.15901775 0.14866256 0.15742709 0.15330892]


In [38]:
from sklearn.model_selection import RandomizedSearchCV

# Random-forest hyper-parameter search space: 8 * 8 * 5 * 3 * 5 = 4800
# combinations in total.
# NOTE(review): max_features=100 exceeds the 58 encoded columns reported by the
# preprocessing cell; depending on the scikit-learn version such candidates
# either fail to fit or behave like "use all features" — confirm and prune.
param_grid = [{
    'n_estimators': [10, 25, 50, 100, 200, 300, 400, 500],
    'max_features': [1, 2, 4, 8, 16, 25, 50, 100],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8]
}]

rf = RandomForestRegressor(random_state=67)
# n_iter=4000 candidates with cv=5 means 20,000 fits, so this cell is expensive.
# random_state fixes which candidates are sampled; without it a re-run can
# evaluate a different subset and select different best_params_.
grid_search = RandomizedSearchCV(rf, param_grid, cv=5, n_iter=4000,
                                 scoring='neg_mean_squared_error',
                                 return_train_score=True, n_jobs=-1,
                                 random_state=67)
grid_search.fit(X_train, y_train)

In [39]:
# Parameter combination that achieved the best mean cross-validated score
# (scoring is negated MSE, so "best" means lowest MSE).
grid_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 16,
 'max_depth': 30}

In [95]:
from sklearn.metrics import mean_squared_error

# Take the refitted best estimator from the search and score it on the
# hold-out rows (`holdout_X` / `holdout_y`).
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(holdout_X)

# RMSE is the square root of the mean squared error.
final_mse = mean_squared_error(y_true=holdout_y, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

0.09647079429873397

In [96]:
# Predicted margins for the hold-out rows, in the row order of `holdout_X`.
final_predictions

array([-0.55594493, -0.18089776, -0.33323201, -0.35450044,  0.22610415,
       -0.06828888, -0.21842949,  0.49420996,  0.30390189,  0.179006  ,
        0.12237892,  0.40862844,  0.38734337,  0.10533348, -0.03021002,
        0.18806483,  0.07434684,  0.18675473,  0.11075443,  0.37544466,
        0.1950287 ,  0.34603494,  0.07686019,  0.07826626,  0.44218177,
        0.11380454,  0.09865849,  0.11450355,  0.19548497,  0.50714182,
       -0.19371465,  0.17521294,  0.22503494,  0.21306724,  0.22408498,
        0.1796244 ,  0.1015523 ,  0.20826245, -0.27484745,  0.17668264,
       -0.28369218, -0.28824025,  0.18053349, -0.20520147, -0.07467825,
        0.05577566,  0.35314834,  0.1875934 , -0.29119685, -0.25730066,
       -0.22033491,  0.52631485,  0.4188868 , -0.27699021,  0.46867658,
        0.43120368,  0.11487028,  0.41096317,  0.22850847, -0.27536921,
        0.10761159, -0.17262382, -0.32154074, -0.3065504 ,  0.12769577,
       -0.28877336, -0.36596697, -0.38129928, -0.35831457, -0.56

In [103]:
# Put the prediction array into a one-column frame; the column keeps the
# default integer label 0 and the index is a fresh 0..n-1 range.
pred_df = pd.Series(final_predictions).to_frame()
pred_df

Unnamed: 0,0
0,-0.555945
1,-0.180898
2,-0.333232
3,-0.354500
4,0.226104
...,...
188,0.464902
189,-0.188908
190,0.400111
191,-0.198952


In [104]:
# Discard the original row labels so the actual margins get a 0..n-1 index
# that lines up positionally with the predictions frame.
reindexed = holdout_y.reset_index(drop=True).to_frame()
reindexed

Unnamed: 0,dem_margin
0,-0.720991
1,-0.260102
2,-0.318882
3,-0.461098
4,0.539603
...,...
188,0.434942
189,-0.089752
190,0.450179
191,-0.237642


In [117]:
# Join actual and predicted margins on their shared positional index and give
# the prediction column (labelled 0 in `pred_df`) a readable name.
merged = (
    reindexed
    .merge(pred_df, left_index=True, right_index=True)
    .rename(columns={0: 'predictions'})
)
merged

Unnamed: 0,dem_margin,predictions
0,-0.720991,-0.555945
1,-0.260102,-0.180898
2,-0.318882,-0.333232
3,-0.461098,-0.354500
4,0.539603,0.226104
...,...,...
188,0.434942,0.464902
189,-0.089752,-0.188908
190,0.450179,0.400111
191,-0.237642,-0.198952


In [119]:
# Flag rows where the predicted margin lands on the other side of zero from
# the actual margin. A prediction of exactly 0 counts as a miss for any
# non-zero actual margin; an actual margin of exactly 0 is never flagged.
actual_margin = merged['dem_margin']
predicted_margin = merged["predictions"]

positive_called_nonpositive = (actual_margin > 0) & (predicted_margin <= 0)
negative_called_nonnegative = (actual_margin < 0) & (predicted_margin >= 0)

merged['mispredicted'] = positive_called_nonpositive | negative_called_nonnegative
merged

Unnamed: 0,dem_margin,predictions,mispredicted
0,-0.720991,-0.555945,False
1,-0.260102,-0.180898,False
2,-0.318882,-0.333232,False
3,-0.461098,-0.354500,False
4,0.539603,0.226104,False
...,...,...,...
188,0.434942,0.464902,False
189,-0.089752,-0.188908,False
190,0.450179,0.400111,False
191,-0.237642,-0.198952,False


In [120]:
# Show only the rows flagged as mispredicted.
merged.loc[merged['mispredicted']]

Unnamed: 0,dem_margin,predictions,mispredicted
5,0.122127,-0.068289,True
18,-0.064789,0.110754,True
23,-0.046907,0.078266,True
87,0.055236,-0.037924,True


In [122]:
import pickle
# Serialize the tuned estimator to disk.
# NOTE(review): only `final_model` is written; the fitted preprocessing
# `pipeline` is not part of this file, so whatever loads it must apply the same
# transform to its inputs — confirm the preprocessor is persisted elsewhere.
model_path = "../SRC/elections_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(final_model, model_file)