In [46]:
#reading data of population per county in the United States in 2020 from CDC

import pandas as pd

col_list = ["countyFIPS", "population"]
county_population = pd.read_csv("covid_county_population_usafacts.csv", usecols=col_list)

print(county_population)



      countyFIPS  population
0           1001       55869
1           1003      223234
2           1005       24686
3           1007       22394
4           1009       57826
...          ...         ...
3139       56037       42343
3140       56039       23464
3141       56041       20226
3142       56043        7805
3143       56045        6927

[3144 rows x 2 columns]


In [47]:
#Reading data of cumulative confirmed cases up to the election date, 11/03/2020. Source CDC

col_list2 = ["countyFIPS", "cumul_confirmed"]
county_confirmed_cases = pd.read_csv("covid_confirmed_usafacts.csv", usecols = col_list2)

county_confirmed_cases["cumul_confirmed"] = county_confirmed_cases["cumul_confirmed"]/county_population["population"]

print(county_confirmed_cases)

      countyFIPS  cumul_confirmed
0           1001         0.039915
1           1003         0.031792
2           1005         0.043749
3           1007         0.040502
4           1009         0.038426
...          ...              ...
3138       56037         0.066931
3139       56039         0.021184
3140       56041         0.022545
3141       56043         0.008850
3142       56045         0.026009

[3143 rows x 2 columns]


In [48]:
# reading data of cumulative death per county due to COVID-19 up to the election date 11/03/2020. Source CDC

col_list3 = ["countyFIPS", "cumul_death"]
county_death_cases = pd.read_csv("covid_deaths_usafacts.csv", usecols = col_list3)

county_death_cases["cumul_death"] = county_death_cases["cumul_death"]/county_population["population"]
print(county_death_cases)

      countyFIPS  cumul_death
0           1001     0.000537
1           1003     0.000345
2           1005     0.000365
3           1007     0.000670
4           1009     0.000432
...          ...          ...
3138       56037     0.000407
3139       56039     0.000047
3140       56041     0.000128
3141       56043     0.000346
3142       56045     0.000000

[3143 rows x 2 columns]


In [49]:
# Concatanating three databases using countyFIPS. 

result = pd.concat([county_confirmed_cases["countyFIPS"], county_confirmed_cases["cumul_confirmed"], county_death_cases["cumul_death"]], axis=1)

# normalize data using population column

#result["cumul_confirmed"] = result["cumul_confirmed"]/result["population"]
#result["cumul_death"] = result["cumul_death"]/result["population"]

print(result)

      countyFIPS  cumul_confirmed  cumul_death
0           1001         0.039915     0.000537
1           1003         0.031792     0.000345
2           1005         0.043749     0.000365
3           1007         0.040502     0.000670
4           1009         0.038426     0.000432
...          ...              ...          ...
3138       56037         0.066931     0.000407
3139       56039         0.021184     0.000047
3140       56041         0.022545     0.000128
3141       56043         0.008850     0.000346
3142       56045         0.026009     0.000000

[3143 rows x 3 columns]


In [50]:
#Google's County Level Mobility Changes After COVID-19 Pandemic Hit

import numpy as np
from numpy import nan


# 1 element of workplace change from baseline due to COVID restrictions are picked for this study

col_list4 = ["countyFIPS","workplaces_percent_change_from_baseline"]
county_mobility = pd.read_csv("2020_US_Region_Mobility_Report.csv", usecols = col_list4)
county_mobility = county_mobility.dropna()
county_mobility = county_mobility.reset_index(drop=True)
# in order to calculate the mean, we eliminate cells with no data to get a more accurate mean 

#county_mobility["retail_and_recreation_percent_change_from_baseline"] = county_mobility["retail_and_recreation_percent_change_from_baseline"].replace(nan,0)

# this dataset includes all these indexes daily. We need to aggregate them by the countyFIPS to get the mean index per county up to the election date

county_mobility = county_mobility.groupby('countyFIPS').agg({'workplaces_percent_change_from_baseline':['mean']})

print(county_mobility)

           workplaces_percent_change_from_baseline
                                              mean
countyFIPS                                        
1001.0                                  -22.346715
1003.0                                  -22.039286
1005.0                                  -17.631373
1007.0                                  -19.058824
1009.0                                  -22.967153
...                                            ...
56037.0                                 -18.930502
56039.0                                 -32.200000
56041.0                                 -16.019608
56043.0                                 -19.651934
56045.0                                 -22.675978

[2798 rows x 1 columns]


In [51]:
# we use left join to merge the mobility data and the COVID data together. 
# we use left join because there are some counties with no mobility data yet the COVID data should be merged.

result_with_mobility = pd.merge(left = result, right = county_mobility, how='inner', left_on='countyFIPS', right_on='countyFIPS')


#print(result_with_mobility.isnull().values.any())
print(result_with_mobility)


      countyFIPS  cumul_confirmed  cumul_death  \
0           1001         0.039915     0.000537   
1           1003         0.031792     0.000345   
2           1005         0.043749     0.000365   
3           1007         0.040502     0.000670   
4           1009         0.038426     0.000432   
...          ...              ...          ...   
2793       56037         0.066931     0.000407   
2794       56039         0.021184     0.000047   
2795       56041         0.022545     0.000128   
2796       56043         0.008850     0.000346   
2797       56045         0.026009     0.000000   

      (workplaces_percent_change_from_baseline, mean)  
0                                          -22.346715  
1                                          -22.039286  
2                                          -17.631373  
3                                          -19.058824  
4                                          -22.967153  
...                                               ...  
2793   



In [53]:
# reading voting data in 2016 per county per DEM and GOP

col_list5 = ["combined_fips", "votes_dem", "votes_gop"]
vote2016 = pd.read_csv("2016.csv", usecols = col_list5)

binary_2016 = vote2016["votes_gop"]/vote2016["votes_dem"]

vote2016.insert(0, "binary_2016", value = binary_2016)

vote2016["binary_2016"][vote2016["binary_2016"] >= 1] = 1
vote2016["binary_2016"][vote2016["binary_2016"] < 1] = 0

#vote2016 = vote2016.dropna()
#print(vote2016.isnull().values.any())
print(vote2016)

      binary_2016  votes_dem  votes_gop  combined_fips
0             1.0       5908      18110           1001
1             1.0      18409      72780           1003
2             1.0       4848       5431           1005
3             1.0       1874       6733           1007
4             1.0       2150      22808           1009
...           ...        ...        ...            ...
3107          1.0       3233      12153          56037
3108          0.0       7313       3920          56039
3109          1.0       1202       6154          56041
3110          1.0        532       2911          56043
3111          1.0        294       2898          56045

[3112 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [54]:
# reading voting data in 2020 per county per DEM and GOP

col_list6 = ["county_fips", "votes_dem", "votes_gop","state_name"]
vote2020 = pd.read_csv("2020.csv", usecols = col_list6)

binary_2020 = vote2020["votes_gop"]/vote2020["votes_dem"]

vote2020.insert(0, "binary_2020", value = binary_2020)

vote2020["binary_2020"][vote2020["binary_2020"] >= 1] = 1
vote2020["binary_2020"][vote2020["binary_2020"] < 1] = 0

vote2020 = vote2020.dropna()

#print(vote2020.isnull().values.any())
print(vote2020)

      binary_2020 state_name  county_fips  votes_gop  votes_dem
0             1.0    Alabama         1001      19764       7450
1             1.0    Alabama         1003      83055      24344
2             1.0    Alabama         1005       5605       4772
3             1.0    Alabama         1007       7508       1982
4             1.0    Alabama         1009      24595       2627
...           ...        ...          ...        ...        ...
3114          1.0    Wyoming        56037      12197       3822
3115          0.0    Wyoming        56039       4341       9848
3116          1.0    Wyoming        56041       7494       1591
3117          1.0    Wyoming        56043       3245        651
3118          1.0    Wyoming        56045       3107        360

[3110 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [43]:
# The number counties with Different voting patterns between 2016 and 2020 is around 200 counties
count = 0 

for x in range(0, len(vote2020)):
    if (vote2020.iloc[x]["binary_2020"] != vote2016.iloc[x]["binary_2016"]):
        count += 1
    
print(count)

614


In [56]:
# combining all X and Y data to prepare it for using in prediction models.

vote_covid = pd.merge(left = vote2016 , right = result_with_mobility, how='inner', left_on='combined_fips', right_on='countyFIPS')

vote_covid = pd.merge(left = vote_covid , right = vote2020, how = 'inner', left_on = 'countyFIPS', right_on = 'county_fips')


state_fips = vote_covid["county_fips"].astype(str).str[:-3].astype(np.int64)

vote_covid.insert(1, "state_fips", value = state_fips)

#print(vote_covid.isnull().values.any())
print(vote_covid)


count1 = 0 

for x in range(0, len(vote_covid)):
    if (vote_covid.iloc[x]["binary_2020"] != vote_covid.iloc[x]["binary_2016"]):
        count1 += 1
    
print(count1)

      binary_2016  state_fips  votes_dem_x  votes_gop_x  combined_fips  \
0             1.0           1         5908        18110           1001   
1             1.0           1        18409        72780           1003   
2             1.0           1         4848         5431           1005   
3             1.0           1         1874         6733           1007   
4             1.0           1         2150        22808           1009   
...           ...         ...          ...          ...            ...   
2779          1.0          56         3233        12153          56037   
2780          0.0          56         7313         3920          56039   
2781          1.0          56         1202         6154          56041   
2782          1.0          56          532         2911          56043   
2783          1.0          56          294         2898          56045   

      countyFIPS  cumul_confirmed  cumul_death  \
0           1001         0.039915     0.000537   
1          

In [57]:
# Training a model on 2016 using logistic regression including the mobility changes from the base to the COVID-19 data
# 2020 voting counts has not been finished yet. Therefore, there are about 145 counties not included in the analysis

import numpy as np
from numpy import nan
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

X = np.array(vote_covid.iloc[:, [0, 1, 6, 7, 8]])
y = np.array(vote_covid.iloc[:, 9])
#xt = np.sum(y)
#array_has_nan=np.isnan(xt)
#print(array_has_nan)

#y2020 = np.array(vote_covid.iloc[:, 10])

kf = KFold(n_splits=5, shuffle = False)
kf.get_n_splits(X)

score = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    log = LogisticRegression(random_state=0).fit(X_train, y_train)
    score.append(log.score(X_test, y_test))
    prediction = log.predict(X_test)
    print("Confusion Matrix: ", confusion_matrix(y_test, prediction))
    print(" MSE: ", mean_squared_error(y_test, prediction))

print("average accuracy: ", sum(score)/5)

Confusion Matrix:  [[127   9]
 [  2 419]]
 MSE:  0.019748653500897665
Confusion Matrix:  [[ 60  10]
 [  1 486]]
 MSE:  0.019748653500897665
Confusion Matrix:  [[ 75  12]
 [  4 466]]
 MSE:  0.02872531418312388
Confusion Matrix:  [[ 92  13]
 [  6 446]]
 MSE:  0.03411131059245961
Confusion Matrix:  [[ 96  14]
 [ 10 436]]
 MSE:  0.04316546762589928
average accuracy:  0.9709001201193445




In [59]:
# Training the model on 2016 voting data and test them on 2020 data using RANDOM FOREST classifier

from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


X = np.array(vote_covid.iloc[:, [0, 1, 6, 7, 8]])
y = np.array(vote_covid.iloc[:, 9])

y_2020 = np.array(vote_covid.iloc[:, 9])

kf = KFold(n_splits = 5, shuffle = False)
kf.get_n_splits(X)

score2 = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y_2020[test_index]
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train) 
    y_pred2 = classifier.predict(X_test)
    score2.append(classifier.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred2))
    print(classification_report(y_test, y_pred2))
    print(accuracy_score(y_test, y_pred2))
    incorrect = y_test != y_pred2
    index  = np.where(incorrect == True)
    print(index)
    for i in index:
        print(vote_covid.iloc[i, 10])

    
print("average accuracy: ", sum(score2)/5)

[[128   8]
 [  2 419]]
              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96       136
         1.0       0.98      1.00      0.99       421

    accuracy                           0.98       557
   macro avg       0.98      0.97      0.98       557
weighted avg       0.98      0.98      0.98       557

0.9820466786355476
(array([158, 168, 213, 219, 229, 273, 290, 327, 332, 358]),)
158    California
168    California
213      Colorado
219      Colorado
229      Colorado
273      Delaware
290       Florida
327       Florida
332       Florida
358       Georgia
Name: state_name, dtype: object
[[ 60  10]
 [  0 487]]
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92        70
         1.0       0.98      1.00      0.99       487

    accuracy                           0.98       557
   macro avg       0.99      0.93      0.96       557
weighted avg       0.98      0.98      0.98       557

0.982046