In [3]:
#reading data of population per county in the United States in 2020 from CDC

import pandas as pd

col_list = ["countyFIPS", "population"]
county_population = pd.read_csv("covid_county_population_usafacts.csv", usecols=col_list)

print(county_population.shape)

county_population.sample(10)

(3144, 2)


Unnamed: 0,countyFIPS,population
2284,42077,369318
1823,35049,150358
1971,37159,142088
1952,37121,14964
704,18011,67843
852,19123,22095
2231,41043,129749
2420,46115,6376
482,13189,21312
1354,27077,3740


In [5]:
#Reading data of cumulative confirmed cases up to the election date, 11/03/2020. Source CDC

col_list2 = ["countyFIPS", "cumul_confirmed"]
county_confirmed_cases = pd.read_csv("covid_confirmed_usafacts.csv", usecols = col_list2)

county_confirmed_cases["cumul_confirmed"] = county_confirmed_cases["cumul_confirmed"]/county_population["population"]

print(county_confirmed_cases.shape)

county_confirmed_cases.sample(10)

(3143, 2)


Unnamed: 0,countyFIPS,cumul_confirmed
921,20065,0.025038
2877,51115,0.010482
2394,46065,0.756549
833,19087,0.088547
2135,40007,0.007923
439,13105,0.012318
2845,51049,0.002452
1238,26013,0.011288
660,17129,0.014813
1002,21017,0.002643


In [6]:
# reading data of cumulative death per county due to COVID-19 up to the election date 11/03/2020. Source CDC

col_list3 = ["countyFIPS", "cumul_death"]
county_death_cases = pd.read_csv("covid_deaths_usafacts.csv", usecols = col_list3)

county_death_cases["cumul_death"] = county_death_cases["cumul_death"]/county_population["population"]

print(county_death_cases.shape)

county_death_cases.sample(10)

(3143, 2)


Unnamed: 0,countyFIPS,cumul_death
2783,49011,0.046316
891,20005,0.000764
1,1003,0.000345
2652,48257,0.003974
2391,46059,0.000162
1903,37025,0.00105
1897,37013,0.00262
786,18177,0.001427
1559,29151,0.000285
2216,41015,8.2e-05


In [7]:
# Concatanating three databases using countyFIPS. 

result = pd.concat([county_confirmed_cases["countyFIPS"], county_confirmed_cases["cumul_confirmed"], county_death_cases["cumul_death"]], axis=1)

# normalize data using population column

#result["cumul_confirmed"] = result["cumul_confirmed"]/result["population"]
#result["cumul_death"] = result["cumul_death"]/result["population"]

print(result.shape)

result.sample(10)

(3143, 3)


Unnamed: 0,countyFIPS,cumul_confirmed,cumul_death
1161,22095,0.08011,0.005119
597,17003,0.002445,3.1e-05
2796,49037,0.000735,2.6e-05
801,19023,0.025229,0.000153
315,9013,0.007247,0.000256
2656,48265,0.964567,0.019685
1278,26093,0.027059,0.000356
1473,28143,0.02853,0.000877
109,4025,0.067401,0.001979
2722,48397,0.122877,0.001991


In [35]:
#Google's County Level Mobility Changes After COVID-19 Pandemic Hit

import numpy as np
from numpy import nan


# 1 element of workplace change from baseline due to COVID restrictions are picked for this study

col_list4 = ["countyFIPS","workplaces_percent_change_from_baseline"]
county_mobility = pd.read_csv("2020_US_Region_Mobility_Report.csv", usecols = col_list4)
county_mobility = county_mobility.dropna()
county_mobility = county_mobility.reset_index(drop=True)

county_mobility = county_mobility.rename(columns={"workplaces_percent_change_from_baseline": "workplace change %"})
# in order to calculate the mean, we eliminate cells with no data to get a more accurate mean 

#county_mobility["retail_and_recreation_percent_change_from_baseline"] = county_mobility["retail_and_recreation_percent_change_from_baseline"].replace(nan,0)

# this dataset includes all these indexes daily. We need to aggregate them by the countyFIPS to get the mean index per county up to the election date

county_mobility = county_mobility.groupby('countyFIPS').agg({'workplace change %':['mean']})



print(county_mobility.shape)

county_mobility.sample(10)

(2798, 1)


Unnamed: 0_level_0,workplace change %
Unnamed: 0_level_1,mean
countyFIPS,Unnamed: 1_level_2
54045.0,-20.694118
39075.0,-13.772549
48427.0,-28.886861
6037.0,-33.667857
55073.0,-22.470803
6019.0,-28.067857
31123.0,-24.647059
13053.0,-31.651934
27013.0,-27.956204
40151.0,-25.497238


In [36]:
# we use left join to merge the mobility data and the COVID data together. 
# we use left join because there are some counties with no mobility data yet the COVID data should be merged.

result_with_mobility = pd.merge(left = result, right = county_mobility, how='left', left_on='countyFIPS', right_on='countyFIPS')

result_with_mobility.iloc[:, 3].fillna(0, inplace=True)

#print(result_with_mobility.isnull().values.any())
print(result_with_mobility.shape)

result_with_mobility.sample(10)


(3143, 4)




Unnamed: 0,countyFIPS,cumul_confirmed,cumul_death,"(workplace change %, mean)"
2076,39065,0.006268,0.000185,-20.411765
2966,53025,1.53573,0.013034,-20.748175
319,10005,0.01493,0.00043,-16.642857
1377,27125,0.000189,4e-06,-30.4375
2095,39103,0.037623,0.000707,-26.404332
2081,39075,0.036867,0.000743,-13.772549
2739,48431,0.002456,0.0,0.0
2547,48047,0.23674,0.016171,-24.706215
1285,26107,0.024876,0.000137,-24.273438
400,13025,0.037365,0.00101,-19.423529


In [37]:
# reading voting data in 2016 per county per DEM and GOP

col_list5 = ["county_fips", "votes_dem", "votes_gop"]
vote2016 = pd.read_csv("2016.csv", usecols = col_list5)

binary_2016 = vote2016["votes_gop"]/vote2016["votes_dem"]

vote2016.insert(0, "binary_2016", value = binary_2016)

vote2016["binary_2016"][vote2016["binary_2016"] >= 1] = 1
vote2016["binary_2016"][vote2016["binary_2016"] < 1] = 0

#vote2016 = vote2016.dropna()
#print(vote2016.isnull().values.any())
print(vote2016.shape)

vote2016.sample(10)

(3112, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,binary_2016,votes_dem,votes_gop,county_fips
2428,1.0,998,3626,47061
988,1.0,752,5861,21051
312,1.0,1715,5320,12045
3085,1.0,1348,4461,54103
356,1.0,6861,25695,12131
1030,1.0,785,4363,21135
701,1.0,2496,9728,17075
446,0.0,9531,6125,13179
3004,1.0,5886,13255,55083
844,1.0,27207,30711,18157


In [13]:
# reading voting data in 2020 per county per DEM and GOP

col_list6 = ["county_fips", "votes_dem", "votes_gop","state_name"]
vote2020 = pd.read_csv("2020.csv", usecols = col_list6)

binary_2020 = vote2020["votes_gop"]/vote2020["votes_dem"]

vote2020.insert(0, "binary_2020", value = binary_2020)

vote2020["binary_2020"][vote2020["binary_2020"] >= 1] = 1
vote2020["binary_2020"][vote2020["binary_2020"] < 1] = 0

vote2020 = vote2020.dropna()

#print(vote2020.isnull().values.any())
print(vote2020.shape)

vote2020.sample(10)

(3112, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,binary_2020,state_name,county_fips,votes_gop,votes_dem
1256,1.0,Michigan,26111,27706,20513
244,1.0,Colorado,8057,680,173
1111,1.0,Louisiana,22057,36024,8672
2160,1.0,Oklahoma,40119,17813,10904
1147,1.0,Maine,23001,29259,27568
858,1.0,Kansas,20001,5486,1836
2412,1.0,Tennessee,47029,12182,2524
2899,0.0,Virginia,51650,18201,45481
182,0.0,California,6051,2433,3884
1596,1.0,Montana,30057,4186,1770


In [43]:
# The number counties with Different voting patterns between 2016 and 2020 is around 200 counties
count = 0 

for x in range(0, len(vote2020)):
    if (vote2020.iloc[x]["binary_2020"] != vote2016.iloc[x]["binary_2016"]):
        count += 1
    
print(count)

614


In [38]:
# combining all X and Y data to prepare it for using in prediction models.
cols_to_use = vote2016.columns.difference(vote2020.columns)

vote = pd.merge(left = vote2016[cols_to_use], right = vote2020, how='inner', left_on=vote2016['county_fips'], right_on='county_fips')
vote_covid = pd.merge(left = vote , right = result_with_mobility, how='left', left_on='county_fips', right_on='countyFIPS')

#vote_covid = pd.merge(left = vote_covid , right = vote2020, how = 'inner', left_on = 'countyFIPS', right_on = 'county_fips')


state_fips = vote_covid["county_fips"].astype(str).str[:-3].astype(np.int64)

vote_covid.insert(1, "state_fips", value = state_fips)

#print(vote_covid.isnull().values.any())
print(vote_covid.shape)

vote_covid.sample(20)




(3111, 11)


Unnamed: 0,binary_2016,state_fips,binary_2020,state_name,county_fips,votes_gop,votes_dem,countyFIPS,cumul_confirmed,cumul_death,"(workplace change %, mean)"
1827,1.0,31,1.0,Nebraska,31101,3544,763,31101,0.020939,0.0,-15.027624
2428,1.0,47,1.0,Tennessee,47063,18789,5497,47063,0.197215,0.003798,-17.062044
2804,1.0,51,1.0,Virginia,51045,2536,587,51045,0.004378,0.0,-30.787402
674,1.0,17,1.0,Illinois,17021,11505,4286,17021,0.005007,0.000148,-22.188235
2084,1.0,39,1.0,Ohio,39143,18487,10391,39143,0.011804,0.0003,-19.70438
2679,1.0,48,1.0,Texas,48375,22732,9867,48375,0.175628,0.001986,-21.614286
2123,1.0,40,1.0,Oklahoma,40045,1688,162,40045,0.006747,0.0,0.0
11,1.0,1,1.0,Alabama,1023,4294,3126,1023,0.031694,0.000953,-16.751269
1322,1.0,27,1.0,Minnesota,27077,1704,671,27077,0.005357,9.4e-05,0.0
2216,1.0,42,1.0,Pennsylvania,42005,27112,8352,42005,0.00072,1.8e-05,-21.277372


In [24]:
count1 = 0 

for x in range(0, len(vote_covid)):
    if (vote_covid.iloc[x]["binary_2020"] != vote_covid.iloc[x]["binary_2016"]):
        count1 += 1
    
print(count1)

78


In [39]:
# Training a model on 2016 using logistic regression including the mobility changes from the base to the COVID-19 data
# 2020 voting counts has not been finished yet. Therefore, there are about 145 counties not included in the analysis

import numpy as np
from numpy import nan
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

X = np.array(vote_covid.iloc[:, [0, 1, 6, 7, 8]])
y = np.array(vote_covid.iloc[:, 9])
#xt = np.sum(y)
#array_has_nan=np.isnan(xt)
#print(array_has_nan)

#y2020 = np.array(vote_covid.iloc[:, 10])

kf = KFold(n_splits=5, shuffle = False)
kf.get_n_splits(X)

score = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    log = LogisticRegression(random_state=0).fit(X_train, y_train)
    score.append(log.score(X_test, y_test))
    prediction = log.predict(X_test)
    print("Confusion Matrix: ", confusion_matrix(y_test, prediction))
    print(" MSE: ", mean_squared_error(y_test, prediction))

print("average accuracy: ", sum(score)/5)



ValueError: Unknown label type: 'continuous'

In [59]:
# Training the model on 2016 voting data and test them on 2020 data using RANDOM FOREST classifier

from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


X = np.array(vote_covid.iloc[:, [0, 1, 6, 7, 8]])
y = np.array(vote_covid.iloc[:, 9])

y_2020 = np.array(vote_covid.iloc[:, 9])

kf = KFold(n_splits = 5, shuffle = False)
kf.get_n_splits(X)

score2 = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y_2020[test_index]
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train) 
    y_pred2 = classifier.predict(X_test)
    score2.append(classifier.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred2))
    print(classification_report(y_test, y_pred2))
    print(accuracy_score(y_test, y_pred2))
    incorrect = y_test != y_pred2
    index  = np.where(incorrect == True)
    print(index)
    for i in index:
        print(vote_covid.iloc[i, 10])

    
print("average accuracy: ", sum(score2)/5)

[[128   8]
 [  2 419]]
              precision    recall  f1-score   support

         0.0       0.98      0.94      0.96       136
         1.0       0.98      1.00      0.99       421

    accuracy                           0.98       557
   macro avg       0.98      0.97      0.98       557
weighted avg       0.98      0.98      0.98       557

0.9820466786355476
(array([158, 168, 213, 219, 229, 273, 290, 327, 332, 358]),)
158    California
168    California
213      Colorado
219      Colorado
229      Colorado
273      Delaware
290       Florida
327       Florida
332       Florida
358       Georgia
Name: state_name, dtype: object
[[ 60  10]
 [  0 487]]
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92        70
         1.0       0.98      1.00      0.99       487

    accuracy                           0.98       557
   macro avg       0.99      0.93      0.96       557
weighted avg       0.98      0.98      0.98       557

0.982046