In [533]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [534]:
pd.set_option('max_columns', None)

In [535]:
combined_data_file = "../../data/Data-Jul16/combined_jul16.csv"
data_2020_file = "../../data/Data-Jul16/F20_jul05.csv"
data_historical = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")

In [536]:
data_2020 = pd.read_csv(data_2020_file, encoding = "ISO-8859-1")
data_2016 = data_historical.loc[data_historical['YEAR'] == 2016]
data_2012 = data_historical.loc[data_historical['YEAR'] == 2012]
data_2008 = data_historical.loc[data_historical['YEAR'] == 2008]

year_df_dict = {2020:data_2020, 2016:data_2016, 2012:data_2012, 2008:data_2008}

In [537]:
data_historical['COUNTY_TOTALVOTES'] = data_historical['COUNTY_TOTALVOTES'].astype(np.int64)

### PrepData###

In [538]:
data_historical['REP_VOTES%'] = data_historical['REP_VOTES'] / data_historical['COUNTY_TOTALVOTES']
data_historical['DEM_VOTES%'] = data_historical['DEM_VOTES'] / data_historical['COUNTY_TOTALVOTES']
data_2020['REP_VOTES%'] = data_2020['REP_VOTES'] / data_2020['COUNTY_TOTALVOTES']
data_2020['DEM_VOTES%'] = data_2020['DEM_VOTES']/ data_2020['COUNTY_TOTALVOTES']

In [539]:
data_historical.drop(['REP_VOTES', 'DEM_VOTES'], axis=1, inplace=True)
data_2020.drop(['REP_VOTES', 'DEM_VOTES'], axis=1, inplace=True)

In [540]:
data_historical.drop(['WINNING_CANDIDATE', 'WINNING_PARTY_BINARY', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'AVG_WAGE_SALARY', 'WINNING_PARTY', 'COUNTY', 'STATE', 'REP_VOTES%', 'DEM_VOTES%', 'MARGIN_VICTORY'], axis=1, inplace=True)

In [541]:
def train_test_split_by_year(X, y, year, cols=None):
    
    if year != None:
        year_filter =  X['YEAR'] < year
        X = X[year_filter]
        y = y[year_filter]
        
    X = X.drop('YEAR', axis=1)
    y = np.delete(y, 1, axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    if (cols == None):
        return X_train, X_test, y_train, y_test
    else:
        return X_train[cols], X_test[cols], y_train, y_test

# Step 1 #

### Build/Train Model#1: Linear regression to preidct total votes by county###


In [542]:
year = 2020
target_df = year_df_dict.get(year)

In [543]:
X = data_historical.drop(['COUNTY_TOTALVOTES'], axis=1)
y = data_historical[['COUNTY_TOTALVOTES', 'YEAR']].values

In [544]:
X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year)

In [545]:
linear_model = LinearRegression()

In [546]:
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [547]:
y_pred = linear_model.predict(X_test)

In [548]:
print("Accuracy score={:.2f}".format(linear_model.score(X_test, y_test) * 100))

Accuracy score=99.05


### Generate Total Votes Predictions by County###

In [549]:
target_df.drop(['COUNTY_TOTALVOTES'], inplace=True, axis=1, errors='ignore')

In [550]:
target_df.drop(['ID', 'YEAR', 'Unnamed: 0', 'Unnamed: 0.1', 'WINNING_CANDIDATE', 'WINNING_PARTY_BINARY', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'WINNING_PARTY', 'COUNTY', 'STATE', 'REP_VOTES%', 'DEM_VOTES%', 'REP_VOTES', 'DEM_VOTES', 'COUNTY_TOTALVOTES', 'MARGIN_VICTORY', 'AVG_WAGE_SALARY'], axis=1, inplace=True, errors='ignore')

In [551]:
y_pred_target = linear_model.predict(target_df)

In [552]:
#merge predictions with the target dataset. 
target_df['COUNTY_TOTALVOTES'] = y_pred_target

In [553]:
target_df['COUNTY_TOTALVOTES'] = target_df['COUNTY_TOTALVOTES'].astype(np.int64)

In [554]:
import datetime

x = datetime.datetime.now()
day = x.day
month = x.month

date_str = str(month) + str(day)

In [555]:
#save file (will be used by next step in the pipeline)
filename_step1 = '../../data/F20_step1_output_' + date_str + '.csv'
target_df.to_csv(filename_step1)

# Step 2 #

In [556]:
# start clean and reimport everything again. 
data = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")

In [557]:
data_2020 = pd.read_csv(data_2020_file, encoding = "ISO-8859-1")
data_2016 = data.loc[data['YEAR'] == 2016]
data_2012 = data.loc[data['YEAR'] == 2012]
data_2008 = data.loc[data['YEAR'] == 2008]

### Build/Train Model#2: Random Forest Classifier to predict County Winner###

In [558]:
data.drop(['REP_VOTES', 'DEM_VOTES', 'MARGIN_VICTORY', 'WINNING_CANDIDATE', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'WINNING_PARTY', 'COUNTY', 'STATE'], axis=1, inplace=True)
data.drop(['AA_FEMALE', 'AA_MALE', 'BA_FEMALE', 'BA_MALE', 'H_FEMALE', 'H_MALE', 'IA_FEMALE', 'IA_MALE', 'NA_FEMALE', 'NA_MALE' , 'TOT_FEMALE', 'TOT_MALE', 'TOT_POP', 'WA_FEMALE', 'WA_MALE', 'TOT_POP_LESS19', 'TOT_MALE_LESS19', 'TOT_FEMALE_LESS19', 'TOT_POP_20to39', 'TOT_MALE_20to39', 'TOT_FEMALE_20to39', 'TOT_POP_40to59', 'TOT_MALE_40to59', 'TOT_FEMALE_40to59', 'TOT_POP_Above60', 'TOT_MALE_Above60', 'TOT_FEMALE_Above60'] , axis=1, inplace=True)


In [559]:
significant_cols = [
 'STATE_FIPS',
 'COUNTY_FIPS',
 'COUNTY_TOTALVOTES',
 'HOUSE_WINNING_BINARY',
 'SENATE_WINNING_BINARY',
 'UNEMPLOYMENT_RATE',
 'BA_FEMALE%',
 'BA_MALE%',
 'H_FEMALE%',
 'IA_FEMALE%',
 'WA_FEMALE%',
 'WA_MALE%',
 'TOT_FEMALE%',
 'TOT_MALE%',
 'TOT_POP_LESS19%',
 'TOT_POP_40to59%',
 'TOT_POP_Above60%']

In [560]:
X = data.drop('WINNING_PARTY_BINARY', axis=1)
y = data[['WINNING_PARTY_BINARY', 'YEAR']].values

X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year, significant_cols)

In [561]:
rfc4 = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

rfc_pred4= rfc4.predict(X_test)

print("Experiment#rfc4: {:.3f}%".format(accuracy_score(y_test, rfc_pred4) * 100))

  """Entry point for launching an IPython kernel.


Experiment#rfc4: 89.364%


In [562]:
print(classification_report(y_test, rfc_pred4))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72       992
           1       0.91      0.96      0.93      3728

   micro avg       0.89      0.89      0.89      4720
   macro avg       0.86      0.80      0.83      4720
weighted avg       0.89      0.89      0.89      4720



### Predict County winner ###

In [563]:
target_year_data = pd.read_csv(filename_step1, encoding = "ISO-8859-1")

In [564]:
## use with significant model only..
## for now drop AVG_WAGE_SALARY till data is fixed. 
data_significant = target_year_data[[
 'STATE_FIPS',
 'COUNTY_FIPS',
 'COUNTY_TOTALVOTES',
 'HOUSE_WINNING_BINARY',
 'SENATE_WINNING_BINARY',
 'UNEMPLOYMENT_RATE',
 'BA_FEMALE%',
 'BA_MALE%',
 'H_FEMALE%',
 'IA_FEMALE%',
 'WA_FEMALE%',
 'WA_MALE%',
 'TOT_FEMALE%',
 'TOT_MALE%',
 'TOT_POP_LESS19%',
 'TOT_POP_40to59%',
 'TOT_POP_Above60%']]

In [565]:
y_pred = rfc4.predict(data_significant)

In [566]:
target_year_data['WINNING_PARTY_BINARY'] = y_pred

In [567]:
#save to csv
filename_step2 = '../../data/F20_step2_output_' + date_str + '.csv'
target_year_data.to_csv(filename_step2)

# Step 3

### Build/Train Model#3: Linear Regression to predict R/D Votes for every county###

In [568]:
votes_historcail_df = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")
votes_df = pd.read_csv(filename_step2)

In [569]:
votes_historcail_df_copy = votes_historcail_df[['YEAR','STATE_FIPS', 'COUNTY_FIPS', 'COUNTY_TOTALVOTES', 'WINNING_PARTY_BINARY', 'REP_VOTES']]

In [570]:
votes_historcail_df_copy['REP_VOTES%'] = votes_historcail_df_copy['REP_VOTES'] / votes_historcail_df_copy['COUNTY_TOTALVOTES']
votes_historcail_df_copy.drop('REP_VOTES', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [571]:
X = votes_historcail_df_copy.drop(['REP_VOTES%'], axis=1)
y = votes_historcail_df_copy[['REP_VOTES%', 'YEAR']].values

In [572]:
X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year)

In [573]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [574]:
y_pred = linear_model.predict(X_test)

In [575]:
print("Accuracy score={:.2f}".format(linear_model.score(X_test, y_test) * 100))

Accuracy score=54.45


In [576]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.07796162565633612
Mean Squared Error: 0.009688883718852832
Root Mean Squared Error: 0.0984321274729589


### Predict R and D Total votes per County###

In [577]:
votes_df_copy = votes_df[['STATE_FIPS', 'COUNTY_FIPS', 'COUNTY_TOTALVOTES', 'WINNING_PARTY_BINARY']]

In [578]:
y_pred = linear_model.predict(votes_df_copy)

In [579]:
target_year_data['REP_VOTES%'] = y_pred

In [580]:
#info D total votes. 
target_year_data['DEM_VOTES%'] = 1 - target_year_data['REP_VOTES%']

In [581]:
target_year_data['REP_VOTES'] = target_year_data['REP_VOTES%'] * target_year_data['COUNTY_TOTALVOTES']
target_year_data['DEM_VOTES'] = target_year_data['DEM_VOTES%'] * target_year_data['COUNTY_TOTALVOTES']

In [582]:
target_year_data['REP_VOTES'] = target_year_data['REP_VOTES'].astype(np.int64)
target_year_data['DEM_VOTES'] = target_year_data['DEM_VOTES'].astype(np.int64)

In [583]:
#save to csv
filename_step3 = '../../data/F20_step3_output_' + date_str + '.csv'
target_year_data.to_csv(filename_step3)

# Step 4

### Calculate Winner###

In [584]:
electoral_ref = pd.read_excel('../../data/Electoral College Votes.xlsx')
target_data = pd.read_csv(filename_step3)
data_historical = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")
states_aggr = pd.read_csv('../../data/Data-Jul16/state_aggregated_0723.csv')

In [585]:
states_aggr

Unnamed: 0,YEAR,STATE_FIPS,STATE,TOTAL_DEM_VOTES,TOTAL_REP_VOTES,TOTAL_VOTES,MARGIN_VICTORY,ABS_MARGIN_VICTORY,WINNING_PARTY,ELECTORAL_VOTES,TOP_TOPIC
0,2012,1,Alabama,795696,1255925,2051621,0.224325,0.224325,1,9,Economy
1,2012,2,Alaska,142458,192750,335208,0.150032,0.150032,1,3,Economy
2,2012,4,Arizona,1025232,1233654,2258886,0.092268,0.092268,1,11,Economy
3,2012,5,Arkansas,394409,647744,1042153,0.243088,0.243088,1,6,Economy
4,2012,6,California,7854285,4839958,12694243,-0.237456,0.237456,0,55,Economy
...,...,...,...,...,...,...,...,...,...,...,...
308,2020,51,Virginia,1809332,2021888,3831220,0.055480,0.055480,1,13,Economy
309,2020,53,Washington,1810315,1258348,3068663,-0.179872,0.179872,0,12,Economy
310,2020,54,West Virginia,330167,497356,827523,0.202035,0.202035,1,5,Economy
311,2020,55,Wisconsin,1257726,1404664,2662390,0.055190,0.055190,1,10,Immigration


In [586]:
### 1) create a new df with StateFips/Count
state_fips_map = {}
for index, row in data_historical.iterrows():
    state = row['STATE']
    if (state_fips_map.get(state) == None):
        state_fips_map[state] = row['STATE_FIPS']
        
state_fips_df = pd.DataFrame(list(state_fips_map.items()), columns=['STATE','STATE_FIPS'])

In [587]:
df = electoral_ref.merge(state_fips_df, left_on='STATE', right_on='STATE')

In [588]:
target_data.drop('Unnamed: 0', inplace=True, axis=1)

In [589]:
"""extract how counties prediction for every state"""
visited = {}
electoral_votes = {'D':0, 'R':0 }
for index, row in target_data.iterrows():
    state = int(row['STATE_FIPS'])
    
    rep_votes = int(row['REP_VOTES'])
    dem_votes = int(row['DEM_VOTES'])

    if (visited.get(state) == None):
        electoral_votes = {'D': dem_votes, 'R': rep_votes }
        visited[state] = electoral_votes
    else:
        #update existing map
        current_votes = visited.get(state)
        current_r = current_votes.get('R')
        current_d = current_votes.get('D')
        total_dem = current_d + dem_votes
        total_rep = current_r + rep_votes
        electoral_votes = {'D': total_dem, 'R': total_rep }
        visited[state] = electoral_votes

In [590]:
votes_pred_df = pd.DataFrame(list(visited.items()), columns=['STATE_FIPS','PRED_VOTES'])

In [591]:
for index, row in votes_pred_df.iterrows():
    key_val = row['PRED_VOTES']
    d_votes = key_val.get('D')
    r_votes = key_val.get('R')
    votes_pred_df.loc[index, 'PRED_DEM_VOTES'] = key_val.get('D')
    votes_pred_df.loc[index, 'PRED_REP_VOTES'] = key_val.get('R')

In [592]:
votes_pred_df.drop('PRED_VOTES', inplace=True, axis=1)
votes_pred_df['PRED_DEM_VOTES'] = votes_pred_df['PRED_DEM_VOTES'].astype(np.int64)
votes_pred_df['PRED_REP_VOTES'] = votes_pred_df['PRED_REP_VOTES'].astype(np.int64)
votes_pred_df

Unnamed: 0,STATE_FIPS,PRED_DEM_VOTES,PRED_REP_VOTES
0,1,906608,1225239
1,2,192269,295880
2,4,1794197,791870
3,5,512351,729964
4,6,10312429,3615159
5,8,1357621,1129896
6,9,1009586,611263
7,10,233845,212151
8,11,181221,101769
9,12,4732093,4133482


In [593]:
"""determine the winner of every state"""
for index, row in votes_pred_df.iterrows():
    
    total_r = row['PRED_REP_VOTES']
    total_d = row['PRED_DEM_VOTES']
    
    votes_pred_df.loc[index, 'PRED_WINNING_PARTY'] = '0' if total_d > total_r else '1'

In [594]:
votes_pred_df

Unnamed: 0,STATE_FIPS,PRED_DEM_VOTES,PRED_REP_VOTES,PRED_WINNING_PARTY
0,1,906608,1225239,1
1,2,192269,295880,1
2,4,1794197,791870,0
3,5,512351,729964,1
4,6,10312429,3615159,0
5,8,1357621,1129896,0
6,9,1009586,611263,0
7,10,233845,212151,0
8,11,181221,101769,0
9,12,4732093,4133482,0


In [595]:
votes_pred_df

Unnamed: 0,STATE_FIPS,PRED_DEM_VOTES,PRED_REP_VOTES,PRED_WINNING_PARTY
0,1,906608,1225239,1
1,2,192269,295880,1
2,4,1794197,791870,0
3,5,512351,729964,1
4,6,10312429,3615159,0
5,8,1357621,1129896,0
6,9,1009586,611263,0
7,10,233845,212151,0
8,11,181221,101769,0
9,12,4732093,4133482,0


In [596]:
data_aggr_2020 = states_aggr.loc[states_aggr['YEAR'] == 2020]
data_aggr_2016 = states_aggr.loc[states_aggr['YEAR'] == 2016]
data_aggr_2012 = states_aggr.loc[states_aggr['YEAR'] == 2012]


In [602]:
join_df = None
if year == 2012:
    join_df = data_aggr_2012
elif year == 2016:
    join_df = data_aggr_2016
else:
    join_df = data_2020

In [603]:
pred_actual_df = pd.merge(votes_pred_df, join_df, on='STATE_FIPS')

In [604]:
pred_actual_df

Unnamed: 0,STATE_FIPS,PRED_DEM_VOTES,PRED_REP_VOTES,PRED_WINNING_PARTY,YEAR_FIPS,YEAR,COUNTY_FIPS,STATE,COUNTY,REP_CANDIDATE,DEM_CANDIDATE,REP_VOTES,DEM_VOTES,COUNTY_TOTALVOTES,WINNING_CANDIDATE,WINNING_PARTY,WINNING_PARTY_BINARY,HOUSE_WINNING_BINARY,SENATE_WINNING_BINARY,UNEMPLOYMENT_RATE,AVG_WAGE_SALARY,AA_FEMALE,AA_MALE,BA_FEMALE,BA_MALE,H_FEMALE,H_MALE,IA_FEMALE,IA_MALE,NA_FEMALE,NA_MALE,TOT_FEMALE,TOT_MALE,TOT_POP,WA_FEMALE,WA_MALE,TOT_POP_LESS19,TOT_MALE_LESS19,TOT_FEMALE_LESS19,TOT_POP_20to39,TOT_MALE_20to39,TOT_FEMALE_20to39,TOT_POP_40to59,TOT_MALE_40to59,TOT_FEMALE_40to59,TOT_POP_Above60,TOT_MALE_Above60,TOT_FEMALE_Above60,AA_FEMALE%,AA_MALE%,BA_FEMALE%,BA_MALE%,H_FEMALE%,H_MALE%,IA_FEMALE%,IA_MALE%,NA_FEMALE%,NA_MALE%,WA_FEMALE%,WA_MALE%,TOT_FEMALE%,TOT_MALE%,TOT_POP_LESS19%,TOT_POP_20to39%,TOT_POP_40to59%,TOT_POP_Above60%,MARGIN_VICTORY
0,1,906608,1225239,1,20201001,2020,1,Alabama,Autauga County,,,,,,,,0,1,-1,5.1,,380,301,5762,5153,789,860,146,121,29,33,28606,26995,55601,21801,20859,14424,7397,7027,13967,6885,7082,15320,7430,7890,11890,5283,6607,0.006834,0.005414,0.103631,0.092678,0.014190,0.015467,0.002626,0.002176,0.000522,0.000594,0.392097,0.375155,0.514487,0.485513,0.259420,0.251201,0.275535,0.213845,
1,1,906608,1225239,1,20201003,2020,3,Alabama,Baldwin County,,,,,,,,0,1,-1,5.3,,1575,933,10092,9400,4829,5302,798,886,73,73,112365,105657,218022,97830,92471,51670,26101,25569,48792,24036,24756,57324,27532,29792,60236,27988,32248,0.007224,0.004279,0.046289,0.043115,0.022149,0.024319,0.003660,0.004064,0.000335,0.000335,0.448716,0.424136,0.515384,0.484616,0.236994,0.223794,0.262928,0.276284,
2,1,906608,1225239,1,20201005,2020,5,Alabama,Barbour County,,,,,,,,0,1,-1,8.3,,61,52,5672,6370,480,584,67,97,16,30,11748,13133,24881,5799,6410,5692,2907,2785,6469,3919,2550,6352,3469,2883,6368,2838,3530,0.002452,0.002090,0.227965,0.256019,0.019292,0.023472,0.002693,0.003899,0.000643,0.001206,0.233069,0.257626,0.472168,0.527832,0.228769,0.259998,0.255295,0.255938,
3,1,906608,1225239,1,20201007,2020,7,Alabama,Bibb County,,,,,,,,0,1,-1,6.4,,34,19,1816,2954,253,335,50,48,4,22,10479,11921,22400,8449,8762,5052,2660,2392,6142,3633,2509,6153,3330,2823,5053,2298,2755,0.001518,0.000848,0.081071,0.131875,0.011295,0.014955,0.002232,0.002143,0.000179,0.000982,0.377188,0.391161,0.467813,0.532188,0.225536,0.274196,0.274687,0.225580,
4,1,906608,1225239,1,20201009,2020,9,Alabama,Blount County,,,,,,,,0,1,-1,5.4,,103,82,443,507,2597,2939,183,195,26,44,29340,28500,57840,28149,27307,14665,7366,7299,13626,6933,6693,15318,7665,7653,14231,6536,7695,0.001781,0.001418,0.007659,0.008766,0.044900,0.050813,0.003164,0.003371,0.000450,0.000761,0.486670,0.472113,0.507261,0.492739,0.253544,0.235581,0.264834,0.246041,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3144,56,72747,128726,1,202056037,2020,37,Wyoming,Sweetwater County,,,,,,,,0,1,1,6.1,,253,182,264,328,3321,3603,343,322,34,34,20899,22152,43051,19563,20834,12430,6372,6058,11936,6151,5785,10595,5537,5058,8090,4092,3998,0.005877,0.004228,0.006132,0.007619,0.077141,0.083691,0.007967,0.007480,0.000790,0.000790,0.454415,0.483938,0.485447,0.514553,0.288727,0.277253,0.246103,0.187917,
3145,56,72747,128726,1,202056039,2020,39,Wyoming,Teton County,,,,,,,,0,1,1,3.5,,194,132,75,100,1651,1783,101,110,15,20,11172,11909,23081,10614,11357,4569,2334,2235,7060,3662,3398,6351,3326,3025,5101,2587,2514,0.008405,0.005719,0.003249,0.004333,0.071531,0.077250,0.004376,0.004766,0.000650,0.000867,0.459859,0.492050,0.484034,0.515966,0.197955,0.305879,0.275161,0.221004,
3146,56,72747,128726,1,202056041,2020,41,Wyoming,Uinta County,,,,,,,,0,1,1,5.8,,55,41,90,78,922,953,156,132,13,17,10014,10285,20299,9538,9818,6326,3283,3043,4855,2417,2438,4824,2444,2380,4294,2141,2153,0.002709,0.002020,0.004434,0.003843,0.045421,0.046948,0.007685,0.006503,0.000640,0.000837,0.469875,0.483669,0.493325,0.506675,0.311641,0.239174,0.237647,0.211538,
3147,56,72747,128726,1,202056043,2020,43,Wyoming,Washakie County,,,,,,,,0,1,1,4.8,,37,28,15,25,540,568,78,65,3,5,3897,3988,7885,3672,3778,1971,1017,954,1618,822,796,1985,1012,973,2311,1137,1174,0.004692,0.003551,0.001902,0.003171,0.068484,0.072036,0.009892,0.008244,0.000380,0.000634,0.465694,0.479138,0.494230,0.505770,0.249968,0.205200,0.251744,0.293088,


In [605]:
pred_actual_df.drop('YEAR', inplace=True, axis=1)

In [606]:
pred_actual_df.columns

Index(['STATE_FIPS', 'PRED_DEM_VOTES', 'PRED_REP_VOTES', 'PRED_WINNING_PARTY',
       'YEAR_FIPS', 'COUNTY_FIPS', 'STATE', 'COUNTY', 'REP_CANDIDATE',
       'DEM_CANDIDATE', 'REP_VOTES', 'DEM_VOTES', 'COUNTY_TOTALVOTES',
       'WINNING_CANDIDATE', 'WINNING_PARTY', 'WINNING_PARTY_BINARY',
       'HOUSE_WINNING_BINARY', 'SENATE_WINNING_BINARY', 'UNEMPLOYMENT_RATE',
       'AVG_WAGE_SALARY', 'AA_FEMALE', 'AA_MALE', 'BA_FEMALE', 'BA_MALE',
       'H_FEMALE', 'H_MALE', 'IA_FEMALE', 'IA_MALE', 'NA_FEMALE', 'NA_MALE',
       'TOT_FEMALE', 'TOT_MALE', 'TOT_POP', 'WA_FEMALE', 'WA_MALE',
       'TOT_POP_LESS19', 'TOT_MALE_LESS19', 'TOT_FEMALE_LESS19',
       'TOT_POP_20to39', 'TOT_MALE_20to39', 'TOT_FEMALE_20to39',
       'TOT_POP_40to59', 'TOT_MALE_40to59', 'TOT_FEMALE_40to59',
       'TOT_POP_Above60', 'TOT_MALE_Above60', 'TOT_FEMALE_Above60',
       'AA_FEMALE%', 'AA_MALE%', 'BA_FEMALE%', 'BA_MALE%', 'H_FEMALE%',
       'H_MALE%', 'IA_FEMALE%', 'IA_MALE%', 'NA_FEMALE%', 'NA_MALE%',
       'W

In [607]:
pred_actual_df.head()

Unnamed: 0,STATE_FIPS,PRED_DEM_VOTES,PRED_REP_VOTES,PRED_WINNING_PARTY,YEAR_FIPS,COUNTY_FIPS,STATE,COUNTY,REP_CANDIDATE,DEM_CANDIDATE,REP_VOTES,DEM_VOTES,COUNTY_TOTALVOTES,WINNING_CANDIDATE,WINNING_PARTY,WINNING_PARTY_BINARY,HOUSE_WINNING_BINARY,SENATE_WINNING_BINARY,UNEMPLOYMENT_RATE,AVG_WAGE_SALARY,AA_FEMALE,AA_MALE,BA_FEMALE,BA_MALE,H_FEMALE,H_MALE,IA_FEMALE,IA_MALE,NA_FEMALE,NA_MALE,TOT_FEMALE,TOT_MALE,TOT_POP,WA_FEMALE,WA_MALE,TOT_POP_LESS19,TOT_MALE_LESS19,TOT_FEMALE_LESS19,TOT_POP_20to39,TOT_MALE_20to39,TOT_FEMALE_20to39,TOT_POP_40to59,TOT_MALE_40to59,TOT_FEMALE_40to59,TOT_POP_Above60,TOT_MALE_Above60,TOT_FEMALE_Above60,AA_FEMALE%,AA_MALE%,BA_FEMALE%,BA_MALE%,H_FEMALE%,H_MALE%,IA_FEMALE%,IA_MALE%,NA_FEMALE%,NA_MALE%,WA_FEMALE%,WA_MALE%,TOT_FEMALE%,TOT_MALE%,TOT_POP_LESS19%,TOT_POP_20to39%,TOT_POP_40to59%,TOT_POP_Above60%,MARGIN_VICTORY
0,1,906608,1225239,1,20201001,1,Alabama,Autauga County,,,,,,,,0,1,-1,5.1,,380,301,5762,5153,789,860,146,121,29,33,28606,26995,55601,21801,20859,14424,7397,7027,13967,6885,7082,15320,7430,7890,11890,5283,6607,0.006834,0.005414,0.103631,0.092678,0.01419,0.015467,0.002626,0.002176,0.000522,0.000594,0.392097,0.375155,0.514487,0.485513,0.25942,0.251201,0.275535,0.213845,
1,1,906608,1225239,1,20201003,3,Alabama,Baldwin County,,,,,,,,0,1,-1,5.3,,1575,933,10092,9400,4829,5302,798,886,73,73,112365,105657,218022,97830,92471,51670,26101,25569,48792,24036,24756,57324,27532,29792,60236,27988,32248,0.007224,0.004279,0.046289,0.043115,0.022149,0.024319,0.00366,0.004064,0.000335,0.000335,0.448716,0.424136,0.515384,0.484616,0.236994,0.223794,0.262928,0.276284,
2,1,906608,1225239,1,20201005,5,Alabama,Barbour County,,,,,,,,0,1,-1,8.3,,61,52,5672,6370,480,584,67,97,16,30,11748,13133,24881,5799,6410,5692,2907,2785,6469,3919,2550,6352,3469,2883,6368,2838,3530,0.002452,0.00209,0.227965,0.256019,0.019292,0.023472,0.002693,0.003899,0.000643,0.001206,0.233069,0.257626,0.472168,0.527832,0.228769,0.259998,0.255295,0.255938,
3,1,906608,1225239,1,20201007,7,Alabama,Bibb County,,,,,,,,0,1,-1,6.4,,34,19,1816,2954,253,335,50,48,4,22,10479,11921,22400,8449,8762,5052,2660,2392,6142,3633,2509,6153,3330,2823,5053,2298,2755,0.001518,0.000848,0.081071,0.131875,0.011295,0.014955,0.002232,0.002143,0.000179,0.000982,0.377188,0.391161,0.467813,0.532188,0.225536,0.274196,0.274687,0.22558,
4,1,906608,1225239,1,20201009,9,Alabama,Blount County,,,,,,,,0,1,-1,5.4,,103,82,443,507,2597,2939,183,195,26,44,29340,28500,57840,28149,27307,14665,7366,7299,13626,6933,6693,15318,7665,7653,14231,6536,7695,0.001781,0.001418,0.007659,0.008766,0.0449,0.050813,0.003164,0.003371,0.00045,0.000761,0.48667,0.472113,0.507261,0.492739,0.253544,0.235581,0.264834,0.246041,


In [608]:
file_name = 'actuals_predictions_state_' + str(year) + '.csv'
pred_actual_df.to_csv(file_name)