<a href="https://colab.research.google.com/github/gaboojie/project_voting/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Plan:
1. Load CSV va_voting data - Gabe
2. Clean data into proper format for ML model - Gabe
3. Split into training and testing data - Naad
4. Choose an appropriate ML model - Naad
5. Train the ML model - Naad
6. Calculate metrics to assess ML model - Ben
7. Create a state map based on 2024 inputs to the model - Ben

In [None]:
# Load CSV va_voting data

import pandas as pd
import numpy as np

# Raw Virginia voting records, read directly from the project's GitHub repository.
# Kept untouched in csv_df so later cleaning cells can always start from the raw frame.
VOTING_CSV_URL = "https://raw.githubusercontent.com/gaboojie/project_voting/main/data/voting_VA.csv"
csv_df = pd.read_csv(VOTING_CSV_URL)

In [None]:
# Clean data into proper format for ML model
#
# Produces `df` with one row per (Year, county, Party) holding that party's
# total candidate votes and the county's total votes.
#
# Independent variables to use:
# - Year
# - County Name
# - Total votes
# - Party (Republican or Democrat)
#
# Dependent variables to use:
# - Candidate votes

# Rename columns. This always starts from the raw csv_df, so re-running the
# cell rebuilds df from scratch instead of cleaning an already-cleaned frame.
df = csv_df.rename(columns={
    'year': 'Year',
    'county_name': 'County Name',
    'party': 'Party',
    'totalvotes': 'Total Votes',
    'candidatevotes': 'Candidate Votes',
    'candidate': 'Candidate',
})

# Clean Party: only include Republican and Democrat parties, drop any other party rows
df = df[df['Party'].isin(['REPUBLICAN', 'DEMOCRAT'])]
print("Unique parties:", df['Party'].unique())

# Clean Year: Already looks good
print("Unique years:", df['Year'].unique())

# Clean County name: Already looks good
print('Number of NAs in counties:', df['County Name'].isna().sum())

# Clean total votes: drop rows whose total is less than 1 (there were two
# outliers for 2016 in Bedford with a vote count of 0). Written as a negated
# comparison so rows with a missing total are kept and show up in the NA count.
df = df[~(df['Total Votes'] <= 0)]
print("Number of NAs in Total Votes:", df['Total Votes'].isna().sum())

# Count missing candidate votes BEFORE aggregating: a grouped sum skips NaN
# and would silently hide them.
candidate_votes_na = df['Candidate Votes'].isna().sum()

# Merge per-mode rows (2020 reports absentee, election day, and provisional
# votes separately) into one total candidate vote per county and party.
# Years already reported as a single TOTAL row pass through unchanged.
# county_fips is part of the key so that two localities sharing a name are
# never summed together. Total Votes is repeated on every mode row of a
# county, so the first value is taken rather than summed.
df = (
    df.groupby(['Year', 'county_fips', 'County Name', 'Party'], as_index=False, sort=False)
      .agg({'Candidate': 'first', 'Candidate Votes': 'sum', 'Total Votes': 'first'})
)

# Clean candidate votes: drop rows where the merged candidate total is less than 1
df = df[~(df['Candidate Votes'] <= 0)]
print("Number of NAs in Candidate Votes:", candidate_votes_na)

df.head()

Unique parties: ['DEMOCRAT' 'REPUBLICAN']
Unique years: [2000 2004 2008 2012 2016 2020]
Number of NAs in counties: 0
Number of NAs in Total Votes: 0
Number of NAs in Candidate Votes: 0


Unnamed: 0.1,Unnamed: 0,Year,state,state_po,County Name,county_fips,office,Candidate,Party,Candidate Votes,Total Votes,version,mode
0,11161,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,AL GORE,DEMOCRAT,5092,11925,20220315,TOTAL
1,11162,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,11925,20220315,TOTAL
4,11165,2000,VIRGINIA,VA,ALBEMARLE,51003,US PRESIDENT,AL GORE,DEMOCRAT,16255,36846,20220315,TOTAL
5,11166,2000,VIRGINIA,VA,ALBEMARLE,51003,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,18291,36846,20220315,TOTAL
8,11169,2000,VIRGINIA,VA,ALLEGHANY,51005,US PRESIDENT,AL GORE,DEMOCRAT,2214,5123,20220315,TOTAL


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Temporal hold-out split: the 2020 election is the test set and every
# other election year is used for training.
is_holdout_year = df['Year'] == 2020
train_df = df[~is_holdout_year]
test_df = df[is_holdout_year]

# Report how many rows/columns ended up on each side of the split
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

# Preview the first rows of both partitions
train_df.head(), test_df.head()

Training data shape: (1338, 5)
Testing data shape: (763, 5)


(   Year County Name       Party  Total Votes  Candidate Votes
 0  2000    ACCOMACK    DEMOCRAT        11925             5092
 1  2000    ACCOMACK  REPUBLICAN        11925             6352
 4  2000   ALBEMARLE    DEMOCRAT        36846            16255
 5  2000   ALBEMARLE  REPUBLICAN        36846            18291
 8  2000   ALLEGHANY    DEMOCRAT         5123             2214,
       Year County Name       Party  Total Votes  Candidate Votes
 2144  2020    ACCOMACK    DEMOCRAT        16962             5495
 2145  2020    ACCOMACK    DEMOCRAT        16962             2072
 2146  2020    ACCOMACK    DEMOCRAT        16962               11
 2153  2020    ACCOMACK  REPUBLICAN        16962             3084
 2154  2020    ACCOMACK  REPUBLICAN        16962             6078)

In [None]:
# Fit a linear regression on the training elections.
# Design matrix: one-hot indicators for Year, County Name and Party, plus the
# raw Total Votes column.
categorical_columns = ['Year', 'County Name', 'Party']
train_feature_parts = [
    pd.get_dummies(train_df[column], dtype='int') for column in categorical_columns
]
train_feature_parts.append(train_df[['Total Votes']])
X_train = pd.concat(train_feature_parts, axis=1)

# Year dummies are labelled with integers; sklearn requires all-string feature names
X_train.columns = X_train.columns.astype(str)
y_train = train_df['Candidate Votes']

# Linear model without an intercept term, built with sklearn
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(X_train, y_train)

# Pair each feature name with its fitted coefficient for inspection
results = pd.DataFrame({'Variables':reg.feature_names_in_, 'Coefficient':reg.coef_})
results

Unnamed: 0,Variables,Coefficient
0,2000,-52.135344
1,2004,216.580412
2,2008,231.421297
3,2012,166.439077
4,2016,-469.903070
...,...,...
132,WYTHE,-5.583097
133,YORK,6.281549
134,DEMOCRAT,130.792367
135,REPUBLICAN,-38.389995


In [None]:
# Build the test design matrix the same way as the training one:
# one-hot indicators for Year, County Name and Party, plus Total Votes.
X_test = pd.concat([
    pd.get_dummies(test_df['Year'], dtype='int'),
    pd.get_dummies(test_df['County Name'], dtype='int'),
    pd.get_dummies(test_df['Party'], dtype='int'),
    test_df[['Total Votes']]
], axis=1)
X_test.columns = X_test.columns.astype(str)

# Align the test columns with the features the model was fitted on.
# Encoding the test set on its own yields different dummy columns (e.g. a
# "2020" column and county names absent from training), which makes
# reg.score raise "feature names should match those that were passed during fit".
# reindex puts the columns in fit order, adds training-only columns as all-zero,
# and drops columns the model never saw.
# NOTE(review): for the 2020 rows every year indicator is therefore 0, as are
# indicators for county names that only appear in 2020, so the prediction
# relies on the remaining features - confirm this is acceptable for the analysis.
X_test = X_test.reindex(columns=reg.feature_names_in_, fill_value=0)
y_test = test_df['Candidate Votes']

# Calculate R^2
r2 = reg.score(X_test, y_test)
print('R_squared: ', r2)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 2020
- ALEXANDRIA CITY
- BRISTOL CITY
- BUENA VISTA CITY
- CHARLOTTESVILLE CITY
- ...
Feature names seen at fit time, yet now missing:
- 2000
- 2004
- 2008
- 2012
- 2016
- ...
