<a href="https://colab.research.google.com/github/gaboojie/project_voting/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Plan:
1. Load CSV va_voting data - Gabe
2. Clean data into proper format for ML model - Gabe
3. Split into training and testing data - Naad
4. Choose correct ML model - Naad
5. Perform ML model - Naad
6. Calculate metrics to assess ML model - Ben
7. Create a state map based on 2024 inputs to the model - Ben

In [2]:
# Load CSV va_voting data

import pandas as pd
import numpy as np

csv_df = pd.read_csv("https://raw.githubusercontent.com/gaboojie/project_voting/main/data/voting_VA.csv")

In [21]:
# Clean data into proper format for ML model

# Independent variables to use:
# - Year
# - County Name
# - Total votes
# - Party (Republican or Democrat)

# Dependent variables to use:
# - Candidate votes

# Select the columns from the df that we will use
#df = pd.concat([csv_df['year'], csv_df['county_name'], csv_df['party'], csv_df['totalvotes'], csv_df['candidatevotes']], axis=1)

# Rename columns
df = csv_df.rename(columns={'year': 'Year', 'county_name': 'County Name', 'party': 'Party', 'totalvotes': 'Total Votes', 'candidatevotes': 'Candidate Votes', 'candidate': 'Candidate'})

# Clean Party: only include Republican and Democrat parties, drop any other party rows
df_index = df[ (df['Party'] != 'REPUBLICAN') & (df['Party'] != 'DEMOCRAT') ].index
df = df.drop(df_index)
print("Unique parties:", df['Party'].unique())

# Clean Year: Already looks good
print("Unique years:", df['Year'].unique())

# Clean County name: Already looks good
#print("Unique counties:", df['County Name'].unique())
print('Number of NAs in counties:', df['County Name'].isna().sum())

# Clean total votes: Drop any total where votes are less than 1 (there were two outliers for 2016 in Bedford with a vote count of 0)
df_index = df[df['Total Votes'] <= 0].index
df = df.drop(df_index)
print("Number of NAs in Total Votes:", df['Total Votes'].isna().sum())
#print(df['Total Votes'].unique())

# Merge 2020 absentee, election day, and provisional votes to one candidate vote
county_total_votes = df[['County Name', 'Candidate', 'Year', 'Total Votes']].drop_duplicates()

biden_entries = df[(df['Candidate'] == 'JOSEPH R BIDEN JR') & (df['Year'] == 2020)].groupby(['County Name', 'Candidate', 'Year']).agg({'Candidate Votes': 'sum'}).reset_index()
biden_entries = pd.merge(biden_entries, county_total_votes, on=['County Name', 'Candidate', 'Year'], how='left')
df = df[df['Candidate'] != 'JOSEPH R BIDEN JR']
df = pd.concat([df, biden_entries], ignore_index=True)
print(biden_entries)

trump_entries = df[(df['Candidate'] == 'DONALD J TRUMP') & (df['Year'] == 2020)].groupby(['County Name', 'Candidate', 'Year']).agg({'Candidate Votes': 'sum'}).reset_index()
trump_entries = pd.merge(trump_entries, county_total_votes, on=['County Name', 'Candidate', 'Year'], how='left')
df = df[df['Candidate'] != 'DONALD J TRUMP']
df = pd.concat([df, trump_entries], ignore_index=True)
print(trump_entries)

# Clean candidate votes: Drop any total where votes are less than 1 (there were two outliers for 2016 in Bedford with a vote count of 0)
df_index = df[df['Candidate Votes'] <= 0].index
df = df.drop(df_index)
print("Number of NAs in Candidate Votes:", df['Total Votes'].isna().sum())
#print(df['Candidate Votes'].unique())

display(df)

Unique parties: ['DEMOCRAT' 'REPUBLICAN']
Unique years: [2000 2004 2008 2012 2016 2020]
Number of NAs in counties: 0
Number of NAs in Total Votes: 0
           County Name          Candidate  Year  Candidate Votes  Total Votes
0             ACCOMACK  JOSEPH R BIDEN JR  2020             7578        16962
1            ALBEMARLE  JOSEPH R BIDEN JR  2020            42466        64657
2      ALEXANDRIA CITY  JOSEPH R BIDEN JR  2020            66240        82521
3            ALLEGHANY  JOSEPH R BIDEN JR  2020             2243         8203
4               AMELIA  JOSEPH R BIDEN JR  2020             2411         7894
..                 ...                ...   ...              ...          ...
128  WILLIAMSBURG CITY  JOSEPH R BIDEN JR  2020             4790         6890
129    WINCHESTER CITY  JOSEPH R BIDEN JR  2020             6610        12113
130               WISE  JOSEPH R BIDEN JR  2020             3110        16615
131              WYTHE  JOSEPH R BIDEN JR  2020             3143       

Unnamed: 0.1,Unnamed: 0,Year,state,state_po,County Name,county_fips,office,Candidate,Party,Candidate Votes,Total Votes,version,mode
0,11161.0,2000,VIRGINIA,VA,ACCOMACK,51001.0,US PRESIDENT,AL GORE,DEMOCRAT,5092,11925,20220315.0,TOTAL
1,11162.0,2000,VIRGINIA,VA,ACCOMACK,51001.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,11925,20220315.0,TOTAL
2,11165.0,2000,VIRGINIA,VA,ALBEMARLE,51003.0,US PRESIDENT,AL GORE,DEMOCRAT,16255,36846,20220315.0,TOTAL
3,11166.0,2000,VIRGINIA,VA,ALBEMARLE,51003.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,18291,36846,20220315.0,TOTAL
4,11169.0,2000,VIRGINIA,VA,ALLEGHANY,51005.0,US PRESIDENT,AL GORE,DEMOCRAT,2214,5123,20220315.0,TOTAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599,,2020,,,WILLIAMSBURG CITY,,,DONALD J TRUMP,,1963,6890,,
1600,,2020,,,WINCHESTER CITY,,,DONALD J TRUMP,,5221,12113,,
1601,,2020,,,WISE,,,DONALD J TRUMP,,13366,16615,,
1602,,2020,,,WYTHE,,,DONALD J TRUMP,,11733,15073,,


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Separate the data into training and testing sets
# Testing data is from the year 2020
train_df = df[df['Year'] != 2020]
test_df = df[df['Year'] == 2020]

# Check the distribution of training and testing data
print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

train_df.head(), test_df.head()

Training data shape: (1338, 13)
Testing data shape: (266, 13)


(   Unnamed: 0  Year     state state_po County Name  county_fips        office  \
 0     11161.0  2000  VIRGINIA       VA    ACCOMACK      51001.0  US PRESIDENT   
 1     11162.0  2000  VIRGINIA       VA    ACCOMACK      51001.0  US PRESIDENT   
 2     11165.0  2000  VIRGINIA       VA   ALBEMARLE      51003.0  US PRESIDENT   
 3     11166.0  2000  VIRGINIA       VA   ALBEMARLE      51003.0  US PRESIDENT   
 4     11169.0  2000  VIRGINIA       VA   ALLEGHANY      51005.0  US PRESIDENT   
 
         Candidate       Party  Candidate Votes  Total Votes     version   mode  
 0         AL GORE    DEMOCRAT             5092        11925  20220315.0  TOTAL  
 1  GEORGE W. BUSH  REPUBLICAN             6352        11925  20220315.0  TOTAL  
 2         AL GORE    DEMOCRAT            16255        36846  20220315.0  TOTAL  
 3  GEORGE W. BUSH  REPUBLICAN            18291        36846  20220315.0  TOTAL  
 4         AL GORE    DEMOCRAT             2214         5123  20220315.0  TOTAL  ,
       Unname

In [24]:
# Create a linear regression model using the train data
X_train = pd.concat([
    pd.get_dummies(train_df['Candidate'], dtype='int'),
    pd.get_dummies(train_df['County Name'], dtype='int'),
    pd.get_dummies(train_df['Party'], dtype='int'),
    train_df[['Total Votes']]
], axis=1)
X_train.columns = X_train.columns.astype(str)
y_train = train_df['Candidate Votes']

# Construct a linear model with no intercept using sklearn
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False).fit(X_train, y_train)

results = pd.DataFrame({'Variables':reg.feature_names_in_, 'Coefficient':reg.coef_})
results

Unnamed: 0,Variables,Coefficient
0,AL GORE,4.100957e+10
1,BARACK OBAMA,4.100958e+10
2,DONALD TRUMP,6.270478e+07
3,GEORGE W. BUSH,6.270703e+07
4,HILLARY CLINTON,4.100957e+10
...,...,...
135,WYTHE,-3.135289e+08
136,YORK,-3.135289e+08
137,DEMOCRAT,-4.069605e+10
138,REPUBLICAN,2.508229e+08


In [25]:
X_test = pd.concat([
    pd.get_dummies(test_df['Year'], dtype='int'),
    pd.get_dummies(test_df['County Name'], dtype='int'),
    pd.get_dummies(test_df['Party'], dtype='int'),
    test_df[['Total Votes']]
], axis=1)
X_test.columns = X_test.columns.astype(str)
y_test = test_df['Candidate Votes']

# Calculate R^2
r2 = reg.score(X_test, y_test)
print('R_squared: ', r2)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- 2020
- ALEXANDRIA CITY
- BRISTOL CITY
- BUENA VISTA CITY
- CHARLOTTESVILLE CITY
- ...
Feature names seen at fit time, yet now missing:
- AL GORE
- ALEXANDRIA
- BARACK OBAMA
- BRISTOL
- BUENA VISTA
- ...


In [None]:
# Calculate total votes growth rates

In [None]:
# Use growth rates and 2020 total votes to derive 2024 total votes

In [None]:
# Use our model to predict democrat and republic candidate votes for each county

In [None]:
# Calcute net votes for each county using predicted values

In [None]:
# Graph net vote values on VA map