In [2]:
#reading data of population per county in the United States in 2020 from CDC

import pandas as pd

col_list = ["countyFIPS", "population"]
county_population = pd.read_csv("covid_county_population_usafacts.csv", usecols=col_list)

print(county_population)



      countyFIPS  population
0           1001       55869
1           1003      223234
2           1005       24686
3           1007       22394
4           1009       57826
...          ...         ...
3139       56037       42343
3140       56039       23464
3141       56041       20226
3142       56043        7805
3143       56045        6927

[3144 rows x 2 columns]


In [3]:
#Reading data of cumulative confirmed cases up to the election date, 11/03/2020. Source CDC

col_list2 = ["countyFIPS", "cumul_confirmed"]
county_confirmed_cases = pd.read_csv("covid_confirmed_usafacts.csv", usecols = col_list2)

county_confirmed_cases["cumul_confirmed"] = county_confirmed_cases["cumul_confirmed"]/county_population["population"]

print(county_confirmed_cases)

      countyFIPS  cumul_confirmed
0           1001         0.039915
1           1003         0.031792
2           1005         0.043749
3           1007         0.040502
4           1009         0.038426
...          ...              ...
3139       56037         0.015540
3140       56039         0.038229
3141       56041         0.026154
3142       56043         0.022934
3143       56045         0.029306

[3144 rows x 2 columns]


In [4]:
# reading data of cumulative death per county due to COVID-19 up to the election date 11/03/2020. Source CDC

col_list3 = ["countyFIPS", "cumul_death"]
county_death_cases = pd.read_csv("covid_deaths_usafacts.csv", usecols = col_list3)

county_death_cases["cumul_death"] = county_death_cases["cumul_death"]/county_population["population"]
print(county_death_cases)

      countyFIPS  cumul_death
0           1001     0.000537
1           1003     0.000345
2           1005     0.000365
3           1007     0.000670
4           1009     0.000432
...          ...          ...
3139       56037     0.000094
3140       56039     0.000085
3141       56041     0.000148
3142       56043     0.000897
3143       56045     0.000000

[3144 rows x 2 columns]


In [5]:
# Concatanating three databases using countyFIPS. 

result = pd.concat([county_confirmed_cases["countyFIPS"], county_confirmed_cases["cumul_confirmed"], county_death_cases["cumul_death"]], axis=1)

# normalize data using population column

#result["cumul_confirmed"] = result["cumul_confirmed"]/result["population"]
#result["cumul_death"] = result["cumul_death"]/result["population"]

print(result)

      countyFIPS  cumul_confirmed  cumul_death
0           1001         0.039915     0.000537
1           1003         0.031792     0.000345
2           1005         0.043749     0.000365
3           1007         0.040502     0.000670
4           1009         0.038426     0.000432
...          ...              ...          ...
3139       56037         0.015540     0.000094
3140       56039         0.038229     0.000085
3141       56041         0.026154     0.000148
3142       56043         0.022934     0.000897
3143       56045         0.029306     0.000000

[3144 rows x 3 columns]


In [16]:
#Google's County Level Mobility Changes After COVID-19 Pandemic Hit

import numpy as np
from numpy import nan


# 2 elements of workplace change and retail and recreation change from baseline due to COVID restrictions are picked for this study

col_list4 = ["countyFIPS","workplaces_percent_change_from_baseline"]
county_mobility = pd.read_csv("2020_US_Region_Mobility_Report.csv", usecols = col_list4)
county_mobility = county_mobility.dropna()
county_mobility = county_mobility.reset_index(drop=True)
# in order to calculate the mean, we eliminate cells with no data to get a more accurate mean 

#county_mobility["retail_and_recreation_percent_change_from_baseline"] = county_mobility["retail_and_recreation_percent_change_from_baseline"].replace(nan,0)

# this dataset includes all these indexes daily. We need to aggregate them by the countyFIPS to get the mean index per county up to the election date

county_mobility = county_mobility.groupby('countyFIPS').agg({'workplaces_percent_change_from_baseline':['mean']})

print(county_mobility)

           workplaces_percent_change_from_baseline
                                              mean
countyFIPS                                        
1001.0                                  -22.346715
1003.0                                  -22.039286
1005.0                                  -17.631373
1007.0                                  -19.058824
1009.0                                  -22.967153
...                                            ...
56037.0                                 -18.930502
56039.0                                 -32.200000
56041.0                                 -16.019608
56043.0                                 -19.651934
56045.0                                 -22.675978

[2798 rows x 1 columns]


In [17]:
# we use left join to merge the mobility data and the COVID data together. 
# we use left join because there are some counties with no mobility data yet the COVID data should be merged.

result_with_mobility = pd.merge(left = result, right = county_mobility, how='inner', left_on='countyFIPS', right_on='countyFIPS')

print(result_with_mobility)

      countyFIPS  cumul_confirmed  cumul_death  \
0           1001         0.039915     0.000537   
1           1003         0.031792     0.000345   
2           1005         0.043749     0.000365   
3           1007         0.040502     0.000670   
4           1009         0.038426     0.000432   
...          ...              ...          ...   
2793       56037         0.015540     0.000094   
2794       56039         0.038229     0.000085   
2795       56041         0.026154     0.000148   
2796       56043         0.022934     0.000897   
2797       56045         0.029306     0.000000   

      (workplaces_percent_change_from_baseline, mean)  
0                                          -22.346715  
1                                          -22.039286  
2                                          -17.631373  
3                                          -19.058824  
4                                          -22.967153  
...                                               ...  
2793   

In [38]:
# reading voting data in 2016 per county per DEM and GOP

col_list5 = ["combined_fips", "votes_dem", "votes_gop","total_votes" ]
vote2016 = pd.read_csv("2016.csv", usecols = col_list5)

binary_2016 = vote2016["votes_gop"]/vote2016["votes_dem"]

vote2016.insert(0, "binary_2016", value = binary_2016)

vote2016["binary_2016"][vote2016["binary_2016"] >= 1] = 1
vote2016["binary_2016"][vote2016["binary_2016"] < 1] = 0

print(vote2016)

      binary_2016  votes_dem  votes_gop  total_votes  combined_fips
0             1.0       5908      18110        24661           1001
1             1.0      18409      72780        94090           1003
2             1.0       4848       5431        10390           1005
3             1.0       1874       6733         8748           1007
4             1.0       2150      22808        25384           1009
...           ...        ...        ...          ...            ...
3136          1.0       3233      12153        16661          56037
3137          0.0       7313       3920        12176          56039
3138          1.0       1202       6154         8053          56041
3139          1.0        532       2911         3715          56043
3140          1.0        294       2898         3334          56045

[3141 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [52]:
# reading voting data in 2020 per county per DEM and GOP

col_list6 = ["county_fips", "votes_dem", "votes_gop","total_votes" ]
vote2020 = pd.read_csv("2020.csv", usecols = col_list6)

binary_2020 = vote2020["votes_gop"]/vote2020["votes_dem"]

vote2020.insert(0, "binary_2020", value = binary_2020)

vote2020["binary_2020"][vote2020["binary_2020"] >= 1] = 1
vote2020["binary_2020"][vote2020["binary_2020"] < 1] = 0

vote2020 = vote2020.dropna()
print(vote2020)

      binary_2020  county_fips  votes_gop  votes_dem  total_votes
0             1.0         1001      19764       7450        27639
1             1.0         1003      83055      24344       108945
2             1.0         1005       5605       4772        10457
3             1.0         1007       7508       1982         9573
4             1.0         1009      24595       2627        27459
...           ...          ...        ...        ...          ...
3136          1.0        56037      12197       3822        16489
3137          0.0        56039       4341       9848        14677
3138          1.0        56041       7494       1591         9400
3139          1.0        56043       3245        651         4012
3140          1.0        56045       3107        360         3542

[3141 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [53]:
# combining all X and Y data to prepare it for using in prediction models.

vote_covid = pd.merge(left = vote2016 , right = result_with_mobility, how='inner', left_on='combined_fips', right_on='countyFIPS')

vote_covid = pd.merge(left = vote_covid , right = vote2020, how = 'inner', left_on = 'countyFIPS', right_on = 'county_fips')
print(vote_covid)


      binary_2016  votes_dem_x  votes_gop_x  total_votes_x  combined_fips  \
0             1.0         5908        18110          24661           1001   
1             1.0        18409        72780          94090           1003   
2             1.0         4848         5431          10390           1005   
3             1.0         1874         6733           8748           1007   
4             1.0         2150        22808          25384           1009   
...           ...          ...          ...            ...            ...   
2792          1.0         3233        12153          16661          56037   
2793          0.0         7313         3920          12176          56039   
2794          1.0         1202         6154           8053          56041   
2795          1.0          532         2911           3715          56043   
2796          1.0          294         2898           3334          56045   

      countyFIPS  cumul_confirmed  cumul_death  \
0           1001         

In [61]:
# creating a model using logistic regression including the mobility changes from the base to the COVID-19 data
# 2020 voting counts has not been finished yet. Therefore, there are about 145 counties not included in the analysis

import numpy as np
from numpy import nan
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

X = np.array(vote_covid.iloc[:, [0, 6, 7,8]])
y = np.array(vote_covid.iloc[:, 9])


kf = KFold(n_splits=5)
kf.get_n_splits(X)

score = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    log = LogisticRegression(random_state=0).fit(X_train, y_train)
    score.append(log.score(X_test, y_test))
    prediction = log.predict(X_test)
    print("Confusion Matrix: ", confusion_matrix(y_test, prediction))
    print(" MSE: ", mean_squared_error(y_test, prediction))

print("average accuracy: ", sum(score)/5)

Confusion Matrix:  [[128  12]
 [  2 418]]
 MSE:  0.025
Confusion Matrix:  [[ 53   9]
 [  1 497]]
 MSE:  0.017857142857142856
Confusion Matrix:  [[ 97  17]
 [  4 441]]
 MSE:  0.03756708407871199
Confusion Matrix:  [[ 76   8]
 [  6 469]]
 MSE:  0.025044722719141325
Confusion Matrix:  [[ 96  14]
 [  8 441]]
 MSE:  0.03935599284436494
average accuracy:  0.9710350115001278




In [63]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


X = np.array(vote_covid.iloc[:, [6, 7,8]])
y = np.array(vote_covid.iloc[:, 9])


kf = KFold(n_splits=5)
kf.get_n_splits(X)

score2 = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train) 
    y_pred2 = classifier.predict(X_test)
    score2.append(classifier.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred2))
    print(classification_report(y_test, y_pred2))
    print(accuracy_score(y_test, y_pred2))
    

    
print("average accuracy: ", sum(score2)/5)

[[ 52  88]
 [ 24 396]]
              precision    recall  f1-score   support

         0.0       0.68      0.37      0.48       140
         1.0       0.82      0.94      0.88       420

    accuracy                           0.80       560
   macro avg       0.75      0.66      0.68       560
weighted avg       0.78      0.80      0.78       560

0.8
[[ 29  33]
 [ 36 462]]
              precision    recall  f1-score   support

         0.0       0.45      0.47      0.46        62
         1.0       0.93      0.93      0.93       498

    accuracy                           0.88       560
   macro avg       0.69      0.70      0.69       560
weighted avg       0.88      0.88      0.88       560

0.8767857142857143
[[ 50  64]
 [ 32 413]]
              precision    recall  f1-score   support

         0.0       0.61      0.44      0.51       114
         1.0       0.87      0.93      0.90       445

    accuracy                           0.83       559
   macro avg       0.74      0.68   