In [276]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [277]:
pd.set_option('max_columns', None)

In [278]:
combined_data_file = "../../data/Data-Jul16/combined_jul16.csv"
data_2020_file = "../../data/Data-Jul16/F20_jul05.csv"
data_historical = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")

In [279]:
data_2020 = pd.read_csv(data_2020_file, encoding = "ISO-8859-1")
data_2016 = data_historical.loc[data_historical['YEAR'] == 2016]
data_2012 = data_historical.loc[data_historical['YEAR'] == 2012]
data_2008 = data_historical.loc[data_historical['YEAR'] == 2008]

year_df_dict = {2020:data_2020, 2016:data_2016, 2012:data_2012, 2008:data_2008}

In [280]:
data_historical['COUNTY_TOTALVOTES'] = data_historical['COUNTY_TOTALVOTES'].astype(np.int64)

### PrepData###

In [281]:
data_historical['REP_VOTES%'] = data_historical['REP_VOTES'] / data_historical['COUNTY_TOTALVOTES']
data_historical['DEM_VOTES%'] = data_historical['DEM_VOTES'] / data_historical['COUNTY_TOTALVOTES']
data_2020['REP_VOTES%'] = data_2020['REP_VOTES'] / data_2020['COUNTY_TOTALVOTES']
data_2020['DEM_VOTES%'] = data_2020['DEM_VOTES']/ data_2020['COUNTY_TOTALVOTES']

In [282]:
data_historical.drop(['REP_VOTES', 'DEM_VOTES'], axis=1, inplace=True)
data_2020.drop(['REP_VOTES', 'DEM_VOTES'], axis=1, inplace=True)

In [283]:
data_historical.drop(['WINNING_CANDIDATE', 'WINNING_PARTY_BINARY', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'WINNING_PARTY', 'COUNTY', 'STATE', 'REP_VOTES%', 'DEM_VOTES%', 'MARGIN_VICTORY'], axis=1, inplace=True)

In [284]:
def train_test_split_by_year(X, y, year, cols=None):
    
    if year != None:
        year_filter =  X['YEAR'] < year
        X = X[year_filter]
        y = y[year_filter]
        
    X = X.drop('YEAR', axis=1)
    y = np.delete(y, 1, axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    if (cols == None):
        return X_train, X_test, y_train, y_test
    else:
        return X_train[cols], X_test[cols], y_train, y_test

# Step 1 #

### Build/Train Model#1: Linear regression to preidct total votes by county###


In [285]:
year = 2020
target_df = year_df_dict.get(year)

In [286]:
X = data_historical.drop(['COUNTY_TOTALVOTES'], axis=1)
y = data_historical[['COUNTY_TOTALVOTES', 'YEAR']].values

In [287]:
X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year)

In [288]:
linear_model = LinearRegression()

In [289]:
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [290]:
y_pred = linear_model.predict(X_test)

In [291]:
print("Accuracy score={:.2f}".format(linear_model.score(X_test, y_test) * 100))

Accuracy score=98.93


### Generate Total Votes Predictions by County###

In [292]:
target_df.drop(['COUNTY_TOTALVOTES'], inplace=True, axis=1, errors='ignore')

In [293]:
target_df.drop(['ID', 'YEAR', 'Unnamed: 0', 'Unnamed: 0.1', 'WINNING_CANDIDATE', 'WINNING_PARTY_BINARY', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'WINNING_PARTY', 'COUNTY', 'STATE', 'REP_VOTES%', 'DEM_VOTES%', 'REP_VOTES', 'DEM_VOTES', 'COUNTY_TOTALVOTES', 'MARGIN_VICTORY'], axis=1, inplace=True, errors='ignore')

In [294]:
y_pred_target = linear_model.predict(target_df)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#merge predictions with the target dataset. 
target_df['COUNTY_TOTALVOTES'] = y_pred_target

In [None]:
target_df['COUNTY_TOTALVOTES'] = target_df['COUNTY_TOTALVOTES'].astype(np.int64)

In [None]:
import datetime

x = datetime.datetime.now()
day = x.day
month = x.month

date_str = str(month) + str(day)

In [None]:
#save file (will be used by next step in the pipeline)
filename_step1 = '../../data/F20_step1_output_' + date_str + '.csv'
target_df.to_csv(filename_step1)

# Step 2 #

In [None]:
# start clean and reimport everything again. 
data = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")


In [None]:
data_2020 = pd.read_csv(data_2020_file, encoding = "ISO-8859-1")
data_2016 = data.loc[data['YEAR'] == 2016]
data_2012 = data.loc[data['YEAR'] == 2012]
data_2008 = data.loc[data['YEAR'] == 2008]

### Build/Train Model#2: Random Forest Classifier to predict County Winner###

In [None]:
data.drop(['REP_VOTES', 'DEM_VOTES', 'MARGIN_VICTORY', 'WINNING_CANDIDATE', 'REP_CANDIDATE', 'DEM_CANDIDATE', 'WINNING_PARTY', 'COUNTY', 'STATE'], axis=1, inplace=True)
data.drop(['AA_FEMALE', 'AA_MALE', 'BA_FEMALE', 'BA_MALE', 'H_FEMALE', 'H_MALE', 'IA_FEMALE', 'IA_MALE', 'NA_FEMALE', 'NA_MALE' , 'TOT_FEMALE', 'TOT_MALE', 'TOT_POP', 'WA_FEMALE', 'WA_MALE', 'TOT_POP_LESS19', 'TOT_MALE_LESS19', 'TOT_FEMALE_LESS19', 'TOT_POP_20to39', 'TOT_MALE_20to39', 'TOT_FEMALE_20to39', 'TOT_POP_40to59', 'TOT_MALE_40to59', 'TOT_FEMALE_40to59', 'TOT_POP_Above60', 'TOT_MALE_Above60', 'TOT_FEMALE_Above60'] , axis=1, inplace=True)


In [None]:
significant_cols = [
 'STATE_FIPS',
 'COUNTY_FIPS',
 'COUNTY_TOTALVOTES',
 'HOUSE_WINNING_BINARY',
 'SENATE_WINNING_BINARY',
 'UNEMPLOYMENT_RATE',
 'AVG_WAGE_SALARY',
 'BA_FEMALE%',
 'BA_MALE%',
 'H_FEMALE%',
 'IA_FEMALE%',
 'WA_FEMALE%',
 'WA_MALE%',
 'TOT_FEMALE%',
 'TOT_MALE%',
 'TOT_POP_LESS19%',
 'TOT_POP_40to59%',
 'TOT_POP_Above60%']

In [None]:
X = data.drop('WINNING_PARTY_BINARY', axis=1)
y = data[['WINNING_PARTY_BINARY', 'YEAR']].values

X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year, significant_cols)

In [None]:
rfc4 = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

rfc_pred4= rfc4.predict(X_test)

print("Experiment#rfc4: {:.3f}%".format(accuracy_score(y_test, rfc_pred4) * 100))

In [None]:
print(classification_report(y_test, rfc_pred4))

### Predict County winner ###

In [None]:
target_year_data = pd.read_csv(filename_step1, encoding = "ISO-8859-1")

In [None]:
## use with significant model only..
## for now drop AVG_WAGE_SALARY till data is fixed. 
data_significant = target_year_data[[
 'STATE_FIPS',
 'COUNTY_FIPS',
 'COUNTY_TOTALVOTES',
 'HOUSE_WINNING_BINARY',
 'SENATE_WINNING_BINARY',
 'UNEMPLOYMENT_RATE',
 'AVG_WAGE_SALARY',
 'BA_FEMALE%',
 'BA_MALE%',
 'H_FEMALE%',
 'IA_FEMALE%',
 'WA_FEMALE%',
 'WA_MALE%',
 'TOT_FEMALE%',
 'TOT_MALE%',
 'TOT_POP_LESS19%',
 'TOT_POP_40to59%',
 'TOT_POP_Above60%']]

In [None]:
y_pred = rfc4.predict(data_significant)

In [None]:
target_year_data['WINNING_PARTY_BINARY'] = y_pred

In [None]:
#save to csv
filename_step2 = '../../data/F20_step2_output_' + date_str + '.csv'
target_year_data.to_csv(filename_step2)

# Step 3

### Build/Train Model#3: Linear Regression to predict R/D Votes for every county###

In [None]:
votes_historcail_df = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")
votes_df = pd.read_csv(filename_step2)

In [None]:
votes_historcail_df_copy = votes_historcail_df[['YEAR','STATE_FIPS', 'COUNTY_FIPS', 'COUNTY_TOTALVOTES', 'WINNING_PARTY_BINARY', 'REP_VOTES']]

In [None]:
votes_historcail_df_copy['REP_VOTES%'] = votes_historcail_df_copy['REP_VOTES'] / votes_historcail_df_copy['COUNTY_TOTALVOTES']
votes_historcail_df_copy.drop('REP_VOTES', axis=1, inplace=True)

In [None]:
X = votes_historcail_df_copy.drop(['REP_VOTES%'], axis=1)
y = votes_historcail_df_copy[['REP_VOTES%', 'YEAR']].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split_by_year(X, y, year)

In [None]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
y_pred = linear_model.predict(X_test)

In [None]:
print("Accuracy score={:.2f}".format(linear_model.score(X_test, y_test) * 100))

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

### Predict R and D Total votes per County###

In [None]:
votes_df_copy = votes_df[['STATE_FIPS', 'COUNTY_FIPS', 'COUNTY_TOTALVOTES', 'WINNING_PARTY_BINARY']]

In [None]:
y_pred = linear_model.predict(votes_df_copy)

In [None]:
target_year_data['REP_VOTES%'] = y_pred

In [None]:
#info D total votes. 
target_year_data['DEM_VOTES%'] = 1 - target_year_data['REP_VOTES%']

In [None]:
target_year_data['REP_VOTES'] = target_year_data['REP_VOTES%'] * target_year_data['COUNTY_TOTALVOTES']
target_year_data['DEM_VOTES'] = target_year_data['DEM_VOTES%'] * target_year_data['COUNTY_TOTALVOTES']

In [None]:
target_year_data['REP_VOTES'] = target_year_data['REP_VOTES'].astype(np.int64)
target_year_data['DEM_VOTES'] = target_year_data['DEM_VOTES'].astype(np.int64)

In [None]:
#save to csv
filename_step3 = '../../data/F20_step3_output_' + date_str + '.csv'
target_year_data.to_csv(filename_step3)

# Step 4

### Calculate Winner###

In [None]:
electoral_ref = pd.read_excel('../../data/Electoral College Votes.xlsx')
target_data = pd.read_csv(filename_step3)
data_historical = pd.read_csv(combined_data_file, encoding = "ISO-8859-1")
states_aggr = pd.read_csv('../../data/Data-Jul16/state_aggregated_0723.csv')

In [None]:
### 1) create a new df with StateFips/Count
state_fips_map = {}
for index, row in data_historical.iterrows():
    state = row['STATE']
    if (state_fips_map.get(state) == None):
        state_fips_map[state] = row['STATE_FIPS']
        
state_fips_df = pd.DataFrame(list(state_fips_map.items()), columns=['STATE','STATE_FIPS'])

In [None]:
df = electoral_ref.merge(state_fips_df, left_on='STATE', right_on='STATE')

In [None]:
target_data.drop('Unnamed: 0', inplace=True, axis=1)

In [None]:
"""extract how counties prediction for every state"""
visited = {}
electoral_votes = {'D':0, 'R':0 }
for index, row in target_data.iterrows():
    state = int(row['STATE_FIPS'])
    
    rep_votes = int(row['REP_VOTES'])
    dem_votes = int(row['DEM_VOTES'])

    if (visited.get(state) == None):
        electoral_votes = {'D': dem_votes, 'R': rep_votes }
        visited[state] = electoral_votes
    else:
        #update existing map
        current_votes = visited.get(state)
        current_r = current_votes.get('R')
        current_d = current_votes.get('D')
        total_dem = current_d + dem_votes
        total_rep = current_r + rep_votes
        electoral_votes = {'D': total_dem, 'R': total_rep }
        visited[state] = electoral_votes

In [None]:
votes_pred_df = pd.DataFrame(list(visited.items()), columns=['STATE_FIPS','PRED_VOTES'])

In [None]:
for index, row in votes_pred_df.iterrows():
    key_val = row['PRED_VOTES']
    d_votes = key_val.get('D')
    r_votes = key_val.get('R')
    votes_pred_df.loc[index, 'PRED_DEM_VOTES'] = key_val.get('D')
    votes_pred_df.loc[index, 'PRED_REP_VOTES'] = key_val.get('R')

In [None]:
votes_pred_df.drop('PRED_VOTES', inplace=True, axis=1)
votes_pred_df['PRED_DEM_VOTES'] = votes_pred_df['PRED_DEM_VOTES'].astype(np.int64)
votes_pred_df['PRED_REP_VOTES'] = votes_pred_df['PRED_REP_VOTES'].astype(np.int64)
votes_pred_df

In [None]:
"""determine the winner of every state"""
for index, row in votes_pred_df.iterrows():
    
    total_r = row['PRED_REP_VOTES']
    total_d = row['PRED_DEM_VOTES']
    
    votes_pred_df.loc[index, 'PRED_WINNING_PARTY'] = '0' if total_d > total_r else '1'

In [None]:
votes_pred_df

In [None]:
data_aggr_2016 = states_aggr.loc[states_aggr['YEAR'] == 2016]
data_aggr_2012 = states_aggr.loc[states_aggr['YEAR'] == 2012]


In [None]:
join_df = data_aggr_2012 if year == 2012 else data_aggr_2016

In [None]:
pred_actual_df = pd.merge(votes_pred_df, join_df, on='STATE_FIPS')

In [None]:
join_df

In [None]:
pred_actual_df.drop('YEAR', inplace=True, axis=1)

In [None]:
pred_actual_df.columns

In [None]:
pred_actual_df.head()

In [None]:
file_name = 'actuals_predictions_state_' + str(year) + '.csv'
pred_actual_df.to_csv(file_name)