# Importing Packages

In [78]:
import requests
import json
from io import StringIO
from bs4 import BeautifulSoup as bs
from datetime import date, timedelta
import covidcast

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import pickle

from sklearn.pipeline import Pipeline 
from sklearn.impute import KNNImputer, SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier



# Data Collection

- Dataset was originally constructed (via web scraping/API calls) and is relevant to classification. At least 1000 rows and at least 10 features.

# Exploratory Data Analysis

- Student created at least 2 well-constructed visualizations, discussed descriptive statistics, and is able to interpret and compare or link them together to give a cohesive introduction to the data


## Geopandas File for Mapping

In [33]:
# Load county records from the ArcGIS USCounties_cases_V1 feature service as a GeoDataFrame.
# The query asks only for the fields listed in outFields (county name, FIPS, state
# name/abbreviation/ID, OBJECTID, shape area/length) and for coordinates in EPSG:4326.
data = gpd.read_file('https://services9.arcgis.com/6Hv9AANartyT7fJW/arcgis/rest/services/USCounties_cases_V1/FeatureServer/0/query?where=1%3D1&outFields=Countyname,FIPS,Shape__Area,Shape__Length,ST_Name,ST_Abbr,ST_ID,OBJECTID&outSR=4326&f=json')

## NYT Mask Wearing Survey Data

- This data comes from a large number of interviews conducted online by the global data and survey firm Dynata at the request of The New York Times. 
- The firm asked a question about mask use to obtain 250,000 survey responses between July 2 and July 14, enough data to provide estimates more detailed than the state level. (Several states have imposed new mask requirements since the completion of these interviews.)

Specifically, each participant was asked: 

*How often do you wear a mask in public when you expect to be within six feet of another person?*

This survey was conducted a single time, and at this point we have no plans to update the data or conduct the survey again.

The fields have the following definitions:

- **COUNTYFP**: The county FIPS code.
- **NEVER**: The estimated share of people in this county who would say never in response to the question: “How often do you wear a mask in public when you expect to be within six feet of another person?”
- **RARELY**: The estimated share of people in this county who would say rarely
- **SOMETIMES**: The estimated share of people in this county who would say sometimes
- **FREQUENTLY**: The estimated share of people in this county who would say frequently
- **ALWAYS**: The estimated share of people in this county who would say always


In [35]:
# Download the NYT county-level mask-use survey estimates.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
s = response.text
# Read the county code as text so leading zeros are not lost to integer parsing,
# then pad to the 5-character FIPS width (same zfill(5) convention used for the
# election data's county_fips).
nymask = pd.read_csv(StringIO(s), dtype={'COUNTYFP': str})
nymask['COUNTYFP'] = nymask['COUNTYFP'].str.zfill(5)
# Rename by name rather than by position so a reordered file cannot mislabel columns;
# the five answer-share columns already carry their final names.
nymask = nymask.rename(columns={'COUNTYFP': 'FIPS'})
nymask.head()

Unnamed: 0,FIPS,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [None]:
# NOTE(review): disabled plotting cell. It refers to `merged`, which is only built in the
# 2020 election section further down, and to a 'trump_pct' column that is not created
# anywhere in the visible notebook -- confirm both before re-enabling.
# fig, ax = plt.subplots(1, figsize=(20,20))
# merged.plot(column='trump_pct', cmap='RdBu_r', linewidth=0.8, ax=ax, edgecolor='black')
# # ax.axis('off')
# # ax.set_title('Percentage By County for 2016 Election', fontsize=18)
# # plt.savefig('images/election2016.png')

<img src='images/fb_mask_data.png'>

## Carnegie Mellon Mask Wearing Facebook Survey Data

Between the new survey’s deployment on September 8, 2020, and October 7th, the study collected 1,220,000 valid responses from respondents across the United States. 
Each response includes questions about symptoms, mask wearing, testing, and the other important topics described above, along with demographic details about the respondent. These demographics include age, gender, race, occupation, and education, allowing us to understand how different groups have been affected and which groups are currently most vulnerable to COVID-19. 

- **smoothed_wearing_mask**
  - Estimated percentage of people who wore a mask for most or all of the time while in public in the past 5 days; those not in public in the past 5 days are not counted.	

- **smoothed_others_masked**
  - Estimated percentage of respondents who say that most or all other people wear masks, when they are in public and social distancing is not possible


In [6]:
# Date two days before today; passed as the end date of the survey queries below.
# NOTE(review): this depends on the day the notebook is run, so re-running it
# fetches a different date window than the outputs shown here.
two_days_ago = date.today() - timedelta(days=2)

In [7]:
# Fetch the two Facebook-survey mask signals at county resolution.
# Both queries end at `two_days_ago`; note that the start dates differ per signal.
mask_ind = covidcast.signal(
    'fb-survey', 'smoothed_wearing_mask',
    date(2020, 10, 1), two_days_ago,
    'county',
)
mask_oth = covidcast.signal(
    'fb-survey', 'smoothed_others_masked',
    date(2020, 11, 24), two_days_ago,
    'county',
)

In [8]:
# Keep only county-level rows: discard every record whose geo_value ends in '000'.
# The filtered individual-mask frame gets a new name (mask_fip); mask_oth is filtered in place.
mask_fip = mask_ind[~mask_ind['geo_value'].str.endswith('000')]
mask_oth = mask_oth[~mask_oth['geo_value'].str.endswith('000')]

In [38]:
mask_fip.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
1,1003,smoothed_wearing_mask,2020-10-01,2020-12-09,69,73.866307,2.968477,219.0684,county,fb-survey
2,1073,smoothed_wearing_mask,2020-10-01,2020-12-09,69,88.893937,1.822515,297.2281,county,fb-survey
3,1089,smoothed_wearing_mask,2020-10-01,2020-12-09,69,90.361381,1.421118,431.258,county,fb-survey
4,1097,smoothed_wearing_mask,2020-10-01,2020-12-09,69,80.32587,2.119979,351.6316,county,fb-survey
5,1101,smoothed_wearing_mask,2020-10-01,2020-12-09,69,92.842875,2.164874,141.7822,county,fb-survey


In [13]:
mask_fip.describe()

Unnamed: 0,lag,value,stderr,sample_size
count,50264.0,50264.0,50264.0,50264.0
mean,25.046654,90.907979,1.861972,329.766106
std,22.055374,5.500603,0.864895,320.34401
min,1.0,51.314471,0.252205,100.0
25%,5.0,88.127758,1.167788,143.9171
50%,16.0,92.236727,1.739585,219.88805
75%,45.0,94.97218,2.459793,399.25745
max,69.0,99.677919,4.965824,4677.9669


In [48]:
# One-column frame listing each distinct geo_value present in the filtered survey data.
df = pd.DataFrame(mask_fip['geo_value'].unique())

In [54]:
# Sum of sample_size over all rows of each geo_value.
df2 = mask_fip.groupby('geo_value')['sample_size'].sum()

In [55]:
# Mean of the survey value over all rows of each geo_value.
df3 = mask_fip.groupby('geo_value')['value'].mean()

In [12]:
mask_oth.head(10)

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
2,4013,smoothed_others_masked,2020-11-24,2020-12-09,15,82.923867,2.05172,336.382,county,fb-survey
3,4019,smoothed_others_masked,2020-11-24,2020-12-09,15,84.678313,2.96486,147.5946,county,fb-survey
6,6037,smoothed_others_masked,2020-11-24,2020-12-09,15,87.485556,1.684146,386.0008,county,fb-survey
7,6059,smoothed_others_masked,2020-11-24,2020-12-09,15,81.656743,3.294606,137.9944,county,fb-survey
8,6065,smoothed_others_masked,2020-11-24,2020-12-09,15,87.018507,2.681319,157.1228,county,fb-survey
9,6071,smoothed_others_masked,2020-11-24,2020-12-09,15,89.023513,3.062219,104.2067,county,fb-survey
10,6073,smoothed_others_masked,2020-11-24,2020-12-09,15,86.891234,2.64351,162.9956,county,fb-survey
11,6085,smoothed_others_masked,2020-11-24,2020-12-09,15,93.406706,2.336948,112.7671,county,fb-survey
13,8041,smoothed_others_masked,2020-11-24,2020-12-09,15,83.90914,3.441874,113.9721,county,fb-survey
15,9001,smoothed_others_masked,2020-11-24,2020-12-09,15,94.998454,2.088127,108.97,county,fb-survey


In [14]:
mask_oth.describe()

Unnamed: 0,lag,value,stderr,sample_size
count,22410.0,22410.0,22410.0,22410.0
mean,5.438108,82.779259,2.363568,334.298024
std,2.10807,11.656619,1.046377,330.008157
min,1.0,24.554959,0.418443,100.0
25%,5.0,77.526024,1.50999,144.444525
50%,5.0,86.25687,2.209437,218.30405
75%,5.0,91.439472,3.178061,400.507175
max,15.0,98.75,4.992825,4374.0278


In [9]:
# Number of distinct geo_value codes in each frame: the raw individual-mask pull,
# the same pull after the '000' filter, and the filtered others-masked pull.
for frame in (mask_ind, mask_fip):
    print(frame['geo_value'].value_counts().shape)
mask_oth['geo_value'].value_counts().shape

(726,)
(677,)


(655,)

In [10]:
# Reduce each survey to one row per county: the mean of `value` over all of that
# county's rows, with the county code stored as an integer FIPS column.
# NOTE(review): the int cast drops leading zeros, unlike the zero-padded string FIPS
# built for the NYT data -- confirm the key format before merging these frames.
mask_ind_means = (
    mask_fip.groupby('geo_value')['value']
    .mean()
    .reset_index()
    .rename(columns={'geo_value': 'FIPS', 'value': 'ind_mask'})
    .astype({'FIPS': int})
)
# Same reduction for the others-masked signal
mask_oth_means = (
    mask_oth.groupby('geo_value')['value']
    .mean()
    .reset_index()
    .rename(columns={'geo_value': 'FIPS', 'value': 'oth_mask'})
    .astype({'FIPS': int})
)

<img src='images/delphi_dec.png'>

In [44]:
mask_ind_means.head()

Unnamed: 0,FIPS,ind_mask
0,1003,79.671626
1,1051,88.29197
2,1069,85.591414
3,1073,91.454122
4,1081,89.816314


## Johns Hopkins COVID Data Geopandas

This feature layer contains the most up-to-date COVID-19 cases for the US. Data is pulled from the Coronavirus COVID-19 Global Cases by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University, the Red Cross, the Census American Community Survey, and the Bureau of Labor and Statistics, and aggregated at the US county level. 

This web map is created and maintained by the Centers for Civic Impact at the Johns Hopkins University, and is supported by the Esri Living Atlas team and JHU Data Services. It is used in the COVID-19 United States Cases by County dashboard. 

 For more information on Johns Hopkins University’s response to COVID-19, visit the Johns Hopkins Coronavirus Resource Center where our experts help to advance understanding of the virus, inform the public, and brief policymakers in order to guide a response, improve care, and save lives.


In [74]:
# Load the Johns Hopkins county-level COVID-19 feature layer (GeoJSON) as a GeoDataFrame,
# then print its (rows, columns) shape and preview the first rows.
data2 = gpd.read_file('https://opendata.arcgis.com/datasets/4cb598ae041348fb92270f102a6783cb_0.geojson')
print(data2.shape)
data2.head()

(3331, 88)


Unnamed: 0,OBJECTID,Countyname,ST_Name,ST_Abbr,ST_ID,FIPS,FatalityRa,Confirmedb,DeathsbyPo,PCTPOVALL_,Unemployme,Med_HH_Inc,State_Fata,DateChecke,EM_type,EM_date,EM_notes,url,Thumbnail,Confirmed,Deaths,Age_85,Age_80_84,Age_75_79,Age_70_74,Age_65_69,Beds_Licen,Beds_Staff,Beds_ICU,Ventilator,POP_ESTIMA,POVALL_201,Unemployed,Median_Hou,Recovered,Active,State_Conf,State_Deat,State_Reco,State_Test,AgedPop,NewCases,NewDeaths,TotalPop,NonHispWhP,BlackPop,AmIndop,AsianPop,PacIslPop,OtherPop,TwoMorPop,HispPop,Wh_Alone,Bk_Alone,AI_Alone,As_Alone,NH_Alone,SO_Alone,Two_More,Not_Hisp,Age_Less15,Age_15_24,Age_25_34,Age_Over75,Agetotal,NonHisp,Age_35_64,Age_65_74,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Day_10,Day_11,Day_12,Day_13,Day_14,NewCasebyP,Inpat_Occ,ICU_Occ,Shape__Area,Shape__Length,geometry
0,1,Autauga,Alabama,AL,1,1001,1.076426,8354.17,89.92644,13.8,3.6,118.9591,1.299898,01/07/2021 11:30:49,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,AL Governor issued a Shelter In Place order.,https://bao.arcgis.com/covid-19/jhu/county/010...,https://coronavirus.jhu.edu/static/media/dashb...,4645,50,815,1026,1498,2440,2271,85,55,6,2,55601,7587,942,59338,0,0,384184,4994,0,1997125,8050,99,0,55200,41412,10475,159,568,5,41,1012,1528,42437,10565,159,568,32,409,1030,53672,10842,7192,7064,3339,55200,1528,22052,4711,99.0,210.0,31.0,37.0,29.0,49.0,26.0,59.0,40.0,36.0,30.0,9.0,48.0,53.0,178.054352,96.320346,100.0,2209382000.0,246839.865479,"POLYGON ((-86.41312 32.70739, -86.41219 32.526..."
1,2,Baldwin,Alabama,AL,1,1003,1.166758,6722.26,78.432452,9.8,3.6,115.4508,1.299898,01/07/2021 11:30:49,Govt Ordered Community Quarantine,4/3/2020 10:41:19 PM,AL Governor issued a Shelter In Place order.,https://bao.arcgis.com/covid-19/jhu/county/010...,https://coronavirus.jhu.edu/static/media/dashb...,14656,171,3949,4792,7373,11410,13141,386,362,51,8,218022,21069,3393,57588,0,0,384184,4994,0,1997125,40665,216,2,208107,172768,19529,1398,1668,9,410,2972,9353,179526,19764,1522,1680,9,2034,3572,198754,37621,23497,23326,16114,208107,9353,82998,24551,216.0,253.0,123.0,109.0,132.0,222.0,209.0,220.0,210.0,137.0,117.0,42.0,145.0,200.0,99.072571,74.029401,120.089286,5770469000.0,728445.072448,"MULTIPOLYGON (((-87.78878 31.29877, -87.78849 ..."
2,3,Barbour,Alabama,AL,1,1005,2.191609,6418.55,140.669587,30.9,5.2,68.928,1.299898,01/07/2021 11:30:49,Govt Ordered Community Quarantine,4/3/2020 10:43:21 PM,AL Governor issued a Shelter In Place order.,https://bao.arcgis.com/covid-19/jhu/county/010...,https://coronavirus.jhu.edu/static/media/dashb...,1597,35,422,551,841,1305,1515,74,30,5,2,24881,6788,433,34382,0,0,384184,4994,0,1997125,4634,22,2,25782,11898,12199,63,85,1,86,344,1106,12216,12266,72,96,1,778,353,24676,4517,3092,3675,1814,25782,1106,9864,2820,22.0,42.0,3.0,2.0,11.0,3.0,22.0,30.0,45.0,11.0,8.0,2.0,6.0,7.0,88.420883,56.711409,88.571429,3258643000.0,307285.15451,"POLYGON ((-85.25609 32.13767, -85.25569 32.135..."
3,4,Bibb,Alabama,AL,1,1007,2.469136,8678.57,214.285714,21.8,4.0,92.3478,1.299898,01/07/2021 11:30:49,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,AL Governor issued a Shelter In Place order.,https://bao.arcgis.com/covid-19/jhu/county/010...,https://coronavirus.jhu.edu/static/media/dashb...,1944,48,427,488,624,842,1280,35,25,4,1,22400,4400,344,46064,0,0,384184,4994,0,1997125,3661,21,2,22527,16801,4974,8,37,0,0,160,547,17268,5018,8,37,0,9,187,21980,3742,3005,3075,1539,22527,547,9044,2122,21.0,38.0,3.0,19.0,9.0,20.0,17.0,25.0,30.0,16.0,7.0,14.0,14.0,28.0,93.75,19.047619,,2310715000.0,227886.96384,"POLYGON ((-87.02685 33.24646, -87.02572 33.209..."
4,5,Blount,Alabama,AL,1,1009,1.367905,8468.19,115.836791,13.2,3.5,101.0645,1.299898,01/07/2021 11:30:49,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,AL Governor issued a Shelter In Place order.,https://bao.arcgis.com/covid-19/jhu/county/010...,https://coronavirus.jhu.edu/static/media/dashb...,4898,67,866,1459,1776,2802,3330,25,25,6,2,57840,7527,878,50412,0,0,384184,4994,0,1997125,10233,49,4,57645,50232,820,124,198,18,174,818,5261,55054,862,141,198,18,437,935,52384,11112,6906,6786,4101,57645,5261,22608,6132,49.0,78.0,25.0,17.0,36.0,52.0,57.0,49.0,52.0,18.0,19.0,5.0,36.0,38.0,84.716459,76.190476,92.857143,2456058000.0,286306.840721,"POLYGON ((-86.44507 34.24954, -86.40902 34.205..."


- FatalityRa = County Fatality Rate
- Confirmedb = County Confirmed divided by County Population * 100,000
- DeathsbyPo = County Deaths divided by County Population * 100,000
- Unemployme = Unemployment Rate
- EM_type, EM_date, EM_notes - could be turned into a column about Emergency Declaration
- Confirmed = County Level Confirmed Cases
- Deaths = County Level Deaths
- Beds_Licen = Number of Licensed Beds 
- Beds_Staff = Number of Staffed Beds 
- Beds_ICU = Number of ICU Beds 
- Ventilator = Average Ventilation Used Per Hospital
- POP_ESTIMA = Total Population
- POVALL_201 = Estimated number of people in poverty (a count, not a rate; PCTPOVALL_ appears to hold the percentage)
- Median_Hou = Median Household Income
- Recovered = Number of Recovered Cases
- Active = Number of Active Cases
- AgedPop = Total Population Aged 65 Plus
- NewCases = New cases since yesterday
- NewDeaths = New deaths since yesterday
- Wh_Alone, Bk_Alone, AI_Alone, As_Alone, NH_Alone, SO_Alone, Two_More, Not_Hisp = Population numbers of each group
- Age_? = Population numbers of age groups
- NewCasebyP = New cases per 100,000 population (NewCases divided by POP_ESTIMA * 100,000; e.g. Autauga: 99 / 55,601 * 100,000 ≈ 178.05)
- ICU_Occ, Inpat_Occ = Percent Occupied of ICUs and Inpatient Beds
- Shape__Area, Shape__Length = for mapping



In [76]:
# Row clean-up: discard placeholder records ("Out of <State>", "Unassigned") and Puerto Rico.
for placeholder in ('Out of', 'Unassigned'):
    data2 = data2[~data2['Countyname'].str.contains(placeholder)]
data2 = data2[~data2['ST_Name'].str.contains('Puerto Rico')]
# Also discard the last seven rows that remain after the filters above.
# NOTE(review): presumably trailing non-county records -- confirm against the source layer.
data2 = data2.drop(index=data2.tail(7).index)

# Column clean-up: collect every unwanted column, grouped by the reason it is removed,
# and drop them all in one call.
unwanted_columns = (
    # Day_1 .. Day_14: unclear what the days represent
    [f'Day_{n}' for n in range(1, 15)]
    # state-level figures, identifiers and repeated population data
    + ['OBJECTID', 'ST_ID', 'PCTPOVALL_', 'Med_HH_Inc', 'State_Fata', 'DateChecke',
       'url', 'Thumbnail', 'State_Conf', 'State_Deat', 'State_Reco', 'State_Test',
       'TotalPop', 'NonHispWhP', 'BlackPop', 'AmIndop', 'PacIslPop', 'OtherPop',
       'TwoMorPop', 'HispPop', 'NonHisp', 'AsianPop']
    + ['Wh_Alone', 'Bk_Alone', 'AI_Alone', 'As_Alone', 'NH_Alone', 'SO_Alone',
       'Two_More', 'Not_Hisp', 'Age_Over75']
    # raw counts; the per-100,000 columns (Confirmedb, DeathsbyPo) are kept instead
    + ['Confirmed', 'Deaths']
    # all age-bracket columns: no statistical difference between the various age groups above 65
    + ['Age_85', 'Age_80_84', 'Age_75_79', 'Age_70_74', 'Age_65_69', 'Agetotal',
       'Age_Less15', 'Age_15_24', 'Age_25_34', 'Age_35_64', 'Age_65_74']
    # mostly empty values and difficult to categorize
    + ['Recovered', 'Active', 'EM_notes']
)
data2 = data2.drop(columns=unwanted_columns)

In [77]:
data2.head()

Unnamed: 0,Countyname,ST_Name,ST_Abbr,FIPS,FatalityRa,Confirmedb,DeathsbyPo,Unemployme,EM_type,EM_date,Beds_Licen,Beds_Staff,Beds_ICU,Ventilator,POP_ESTIMA,POVALL_201,Unemployed,Median_Hou,AgedPop,NewCases,NewDeaths,NewCasebyP,Inpat_Occ,ICU_Occ,Shape__Area,Shape__Length,geometry
0,Autauga,Alabama,AL,1001,1.076426,8354.17,89.92644,3.6,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,85,55,6,2,55601,7587,942,59338,8050,99,0,178.054352,96.320346,100.0,2209382000.0,246839.865479,"POLYGON ((-86.41312 32.70739, -86.41219 32.526..."
1,Baldwin,Alabama,AL,1003,1.166758,6722.26,78.432452,3.6,Govt Ordered Community Quarantine,4/3/2020 10:41:19 PM,386,362,51,8,218022,21069,3393,57588,40665,216,2,99.072571,74.029401,120.089286,5770469000.0,728445.072448,"MULTIPOLYGON (((-87.78878 31.29877, -87.78849 ..."
2,Barbour,Alabama,AL,1005,2.191609,6418.55,140.669587,5.2,Govt Ordered Community Quarantine,4/3/2020 10:43:21 PM,74,30,5,2,24881,6788,433,34382,4634,22,2,88.420883,56.711409,88.571429,3258643000.0,307285.15451,"POLYGON ((-85.25609 32.13767, -85.25569 32.135..."
3,Bibb,Alabama,AL,1007,2.469136,8678.57,214.285714,4.0,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,35,25,4,1,22400,4400,344,46064,3661,21,2,93.75,19.047619,,2310715000.0,227886.96384,"POLYGON ((-87.02685 33.24646, -87.02572 33.209..."
4,Blount,Alabama,AL,1009,1.367905,8468.19,115.836791,3.5,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,25,25,6,2,57840,7527,878,50412,10233,49,4,84.716459,76.190476,92.857143,2456058000.0,286306.840721,"POLYGON ((-86.44507 34.24954, -86.40902 34.205..."


In [79]:
data2.columns

Index(['Countyname', 'ST_Name', 'ST_Abbr', 'FIPS', 'FatalityRa', 'Confirmedb',
       'DeathsbyPo', 'Unemployme', 'EM_type', 'EM_date', 'Beds_Licen',
       'Beds_Staff', 'Beds_ICU', 'Ventilator', 'POP_ESTIMA', 'POVALL_201',
       'Unemployed', 'Median_Hou', 'AgedPop', 'NewCases', 'NewDeaths',
       'NewCasebyP', 'Inpat_Occ', 'ICU_Occ', 'Shape__Area', 'Shape__Length',
       'geometry'],
      dtype='object')

In [81]:
data2.head()

Unnamed: 0,Countyname,ST_Name,ST_Abbr,FIPS,FatalityRa,Confirmedb,DeathsbyPo,Unemployme,EM_type,EM_date,Beds_Licen,Beds_Staff,Beds_ICU,Ventilator,POP_ESTIMA,POVALL_201,Unemployed,Median_Hou,AgedPop,NewCases,NewDeaths,NewCasebyP,Inpat_Occ,ICU_Occ,Shape__Area,Shape__Length,geometry
0,Autauga,Alabama,AL,1001,1.076426,8354.17,89.92644,3.6,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,85,55,6,2,55601,7587,942,59338,8050,99,0,178.054352,96.320346,100.0,2209382000.0,246839.865479,"POLYGON ((-86.41312 32.70739, -86.41219 32.526..."
1,Baldwin,Alabama,AL,1003,1.166758,6722.26,78.432452,3.6,Govt Ordered Community Quarantine,4/3/2020 10:41:19 PM,386,362,51,8,218022,21069,3393,57588,40665,216,2,99.072571,74.029401,120.089286,5770469000.0,728445.072448,"MULTIPOLYGON (((-87.78878 31.29877, -87.78849 ..."
2,Barbour,Alabama,AL,1005,2.191609,6418.55,140.669587,5.2,Govt Ordered Community Quarantine,4/3/2020 10:43:21 PM,74,30,5,2,24881,6788,433,34382,4634,22,2,88.420883,56.711409,88.571429,3258643000.0,307285.15451,"POLYGON ((-85.25609 32.13767, -85.25569 32.135..."
3,Bibb,Alabama,AL,1007,2.469136,8678.57,214.285714,4.0,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,35,25,4,1,22400,4400,344,46064,3661,21,2,93.75,19.047619,,2310715000.0,227886.96384,"POLYGON ((-87.02685 33.24646, -87.02572 33.209..."
4,Blount,Alabama,AL,1009,1.367905,8468.19,115.836791,3.5,Govt Ordered Community Quarantine,4/3/2020 10:40:51 PM,25,25,6,2,57840,7527,878,50412,10233,49,4,84.716459,76.190476,92.857143,2456058000.0,286306.840721,"POLYGON ((-86.44507 34.24954, -86.40902 34.205..."


In [83]:
data2.describe()

Unnamed: 0,FatalityRa,Confirmedb,DeathsbyPo,Unemployme,Beds_Licen,Beds_Staff,Beds_ICU,Ventilator,POP_ESTIMA,POVALL_201,Unemployed,Median_Hou,AgedPop,NewCases,NewDeaths,NewCasebyP,Inpat_Occ,ICU_Occ,Shape__Area,Shape__Length
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,2423.0,1552.0,3142.0,3142.0
mean,1.750076,6948.591687,122.715199,4.132018,298.811903,254.153405,29.178867,7.380013,104127.1,13320.28,2003.960853,52777.610439,15093.36,80.888288,1.239338,81.933501,53.976173,65.56021,6808623000.0,371488.6
std,1.112969,2837.637191,89.957044,1.503802,985.479124,862.311711,102.185808,23.534068,333486.3,45596.31,6969.042922,13909.834889,44993.9,316.806914,5.901344,82.248343,34.142614,29.709178,55569680000.0,739740.8
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-41.0,-5.0,-1416.234888,0.0,0.0,8763938.0,15636.78
25%,1.010014,5149.4325,60.155273,3.1,16.0,15.0,2.0,0.0,10926.5,1594.0,200.25,43679.5,880.75,6.0,0.0,38.213398,34.330776,50.33986,1860524000.0,205800.0
50%,1.537125,6825.79,105.698029,3.9,42.0,32.0,5.0,2.0,25758.5,3766.0,482.5,50566.5,4080.0,19.0,0.0,68.450955,54.614412,72.340426,2680092000.0,249354.3
75%,2.288272,8529.0,161.625494,4.8,171.0,134.0,14.0,4.0,67820.5,9391.25,1275.75,58845.5,11618.75,55.0,1.0,107.267763,72.14168,88.000071,3944540000.0,336767.0
max,13.043478,28439.74,842.266462,19.9,23579.0,22419.0,2352.0,523.0,10105520.0,1409155.0,239829.0,140382.0,1299277.0,11554.0,257.0,1564.455569,1218.285714,200.0,2228678000000.0,17964230.0


<img src='images/county_conf_by_pop.png'>

<img src='images/county_confirmed.png'>

## 2020 Election Data

In [71]:
# Load county-level 2020 election results and normalise the county code to a
# zero-padded 5-character FIPS string so it can serve as a join key.
df2020 = pd.read_csv('data/2020_Results.csv')
df2020['county_fips'] = df2020['county_fips'].astype(str).str.zfill(5)
# Load the shapefile of USA counties
shapefile = 'data/USA_Counties.shp'
gdf = gpd.read_file(shapefile)
# Exclude Puerto Rico, Alaska and Hawaii. The .copy() makes `geo` an independent
# frame, so the FIPS assignment below does not write into a slice of `gdf`
# (which is what triggers pandas' SettingWithCopyWarning).
geo = gdf[~gdf['STATE_NAME'].isin(['Puerto Rico', 'Alaska', 'Hawaii'])].copy()
# Pad exactly like county_fips so both join keys share one format; this is a no-op
# when the shapefile already stores 5-character codes.
geo['FIPS'] = geo['FIPS'].astype(str).str.zfill(5)
# Left join: every county geometry is kept, even where no election row matches.
merged = geo.merge(df2020, left_on='FIPS', right_on='county_fips', how='left')

In [73]:
# Plotting code kept for reference (commented out). When enabled it draws the
# 'per_gop' column of `merged` and writes images/election2020.png, the image embedded below.
# fig, ax = plt.subplots(1, figsize=(20,20))
# merged.plot(column='per_gop', cmap='RdBu_r', linewidth=0.8, ax=ax, edgecolor='black')
# ax.axis('off')
# ax.set_title('Percentage By County for 2020 Election', fontsize=18)
# plt.savefig('images/election2020.png')

<img src='images/election2020.png'>

In [None]:
# Derived ratio features.
# The denominators contain zeros (describe() above shows a minimum of 0 for Beds_Licen,
# Beds_ICU and POP_ESTIMA). Dividing by zero would produce inf, which scikit-learn's
# imputers and scalers reject, so zero denominators are mapped to NaN and the affected
# ratios come out as ordinary missing values instead.
beds_licensed = data2['Beds_Licen'].replace(0, np.nan)
population = data2['POP_ESTIMA'].replace(0, np.nan)
icu_beds = data2['Beds_ICU'].replace(0, np.nan)

data2['Staf_Bed_Perc'] = data2['Beds_Staff'] / beds_licensed   # staffed share of licensed beds
data2['Aged_Perc'] = data2['AgedPop'] / population             # share of population aged 65+
data2['Pov_Perc'] = data2['POVALL_201'] / population           # share of population in poverty
data2['Aged_Per_ICU'] = data2['AgedPop'] / icu_beds            # residents aged 65+ per ICU bed
# The raw inputs are no longer needed once the ratios exist.
data2 = data2.drop(columns=['Unemployed', 'Beds_Licen', 'Beds_Staff', 'Ventilator'])

# Keep only the calendar date (M/D/YYYY) from the timestamp string.
# The pattern accepts one- or two-digit month and day (e.g. '4/3/2020'); the previous
# pattern required a two-digit day and so never matched such values. It also has a
# single capture group, so str.extract(expand=False) returns a Series that fits into
# one column (the previous four-group pattern produced a four-column DataFrame).
data2['EM_date'] = data2['EM_date'].str.extract(r'(\d{1,2}/\d{1,2}/\d{4})', expand=False)
# Shorten the two emergency-measure labels to codes; other values are left unchanged.
data2['EM_type'] = data2['EM_type'].replace({
    'Govt Ordered Community Quarantine': 'CQ',
    'Govt Directed Social Distancing': 'SD',
})