In [1]:
import pandas as pd 
import csv  
import requests
import numpy as np
import os
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from datetime import datetime, timedelta
from pytz import timezone
import pytz

In [2]:
# Names of the 50 US states plus the District of Columbia, in alphabetical
# order. Used to filter the forecast data and to drive the per-state scrape.
us_states_list = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware', 'District of Columbia',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Import Kaggle CSV With Cases & Deaths By State #

In [3]:
# Import Kaggle csv file with total cases and deaths by US state and county.
# Each row carries a Date column alongside the Confirmed and Deaths counts.
# Data was downloaded from https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
file = "Raw_Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(file)
# Preview the first rows to confirm the file loaded with the expected columns
kaggle_df.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [4]:
# Keep only the rows whose iso2 code is "US"; the territories (American Samoa,
# Guam, Northern Mariana Islands, Puerto Rico, Virgin Islands) carry their own
# iso2 codes and are filtered out here.
is_us_row = kaggle_df["iso2"] == "US"
kaggle_us = kaggle_df.loc[is_us_row, :]

In [5]:
# Select the columns used downstream; UID, iso2, iso3, code3 and
# Country_Region are left behind.
kept_columns = [
    "FIPS",
    "Admin2",
    "Province_State",
    "Lat",
    "Long_",
    "Combined_Key",
    "Date",
    "Confirmed",
    "Deaths",
]
kaggle_rem_cols = kaggle_us[kept_columns]

In [6]:
# Give the columns friendlier names: Admin2 -> County, Province_State -> State,
# Long_ -> Lng. All other columns keep their names.
column_renames = {
    "Admin2": "County",
    "Province_State": "State",
    "Long_": "Lng",
}
kaggle_renamed = kaggle_rem_cols.rename(columns=column_renames)
# Random spot check of ten rows
kaggle_renamed.sample(10)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
141418,28095.0,Monroe,Mississippi,33.892368,-88.480844,"Monroe, Mississippi, US",3/5/20,0,0
109273,36051.0,Livingston,New York,42.725963,-77.779662,"Livingston, New York, US",2/24/20,0,0
42535,6059.0,Orange,California,33.701475,-117.7646,"Orange, California, US",2/4/20,1,0
53504,28037.0,Franklin,Mississippi,31.477866,-90.89673,"Franklin, Mississippi, US",2/7/20,0,0
6938,13075.0,Cook,Georgia,31.160685,-83.429472,"Cook, Georgia, US",1/24/20,0,0
61340,48445.0,Terry,Texas,33.174028,-102.335452,"Terry, Texas, US",2/9/20,0,0
200643,39081.0,Jefferson,Ohio,40.386141,-80.762595,"Jefferson, Ohio, US",3/23/20,0,0
237888,8047.0,Gilpin,Colorado,39.856465,-105.52526,"Gilpin, Colorado, US",4/4/20,0,0
16201,80036.0,Out of NY,New York,0.0,0.0,"Out of NY, New York, US",1/26/20,0,0
141901,37083.0,Halifax,North Carolina,36.256693,-77.655611,"Halifax, North Carolina, US",3/5/20,0,0


In [7]:
# Check for missing values: count() reports the number of non-null entries per
# column, so a column with a lower count than the others has gaps
kaggle_renamed.count()

FIPS            285648
County          285824
State           286000
Lat             286000
Lng             286000
Combined_Key    286000
Date            286000
Confirmed       286000
Deaths          286000
dtype: int64

In [8]:
# Drop rows with missing data (any row with a null in at least one column)
kaggle_drop = kaggle_renamed.dropna()
# Every column should now report the same non-null count
kaggle_drop.count()

FIPS            285472
County          285472
State           285472
Lat             285472
Lng             285472
Combined_Key    285472
Date            285472
Confirmed       285472
Deaths          285472
dtype: int64

In [9]:
# Check whether the Date column holds strings (dtype "object") or datetime values
kaggle_drop.dtypes

FIPS            float64
County           object
State            object
Lat             float64
Lng             float64
Combined_Key     object
Date             object
Confirmed         int64
Deaths            int64
dtype: object

In [10]:
# Change dates from strings (e.g. "1/22/20") to datetime objects.
# assign() builds a new frame instead of writing a column into the frame
# returned by dropna(), which avoids pandas' SettingWithCopyWarning and keeps
# the Date column in its original position.
kaggle_drop = kaggle_drop.assign(
    Date=pd.to_datetime(kaggle_drop['Date'], format='%m/%d/%y')
)

In [11]:
# Check that the Date column type changed from object to datetime64[ns]
kaggle_drop.dtypes

FIPS                   float64
County                  object
State                   object
Lat                    float64
Lng                    float64
Combined_Key            object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
dtype: object

In [12]:
# Remove rows that do not refer to a real county: first the "Unassigned"
# placeholder, then any county whose name contains "Out of" (e.g. "Out of NY").
has_assigned_county = kaggle_drop["County"] != "Unassigned"
kaggle_final = kaggle_drop.loc[has_assigned_county, :]

is_out_of_state_bucket = kaggle_final["County"].str.contains("Out of")
kaggle_final2 = kaggle_final[~is_out_of_state_bucket]
kaggle_final2.tail(10)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
286322,56027.0,Niobrara,Wyoming,43.056077,-104.47589,"Niobrara, Wyoming, US",2020-04-18,1,0
286323,56029.0,Park,Wyoming,44.521575,-109.585282,"Park, Wyoming, US",2020-04-18,1,0
286324,56031.0,Platte,Wyoming,42.132991,-104.966331,"Platte, Wyoming, US",2020-04-18,0,0
286325,56033.0,Sheridan,Wyoming,44.790489,-106.886239,"Sheridan, Wyoming, US",2020-04-18,12,0
286326,56035.0,Sublette,Wyoming,42.765583,-109.913092,"Sublette, Wyoming, US",2020-04-18,1,0
286327,56037.0,Sweetwater,Wyoming,41.659439,-108.882788,"Sweetwater, Wyoming, US",2020-04-18,10,0
286328,56039.0,Teton,Wyoming,43.935225,-110.58908,"Teton, Wyoming, US",2020-04-18,62,0
286329,56041.0,Uinta,Wyoming,41.287818,-110.547578,"Uinta, Wyoming, US",2020-04-18,6,0
286330,56043.0,Washakie,Wyoming,43.904516,-107.680187,"Washakie, Wyoming, US",2020-04-18,5,0
286331,56045.0,Weston,Wyoming,43.839612,-104.567488,"Weston, Wyoming, US",2020-04-18,0,0


In [13]:
# Reset index to use as primary key for county table.
# drop=True discards the old row labels (which have gaps after filtering)
# instead of keeping them as an extra column.
kaggle_final3 = kaggle_final2.reset_index(drop=True)
kaggle_final3.tail()

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
276491,56037.0,Sweetwater,Wyoming,41.659439,-108.882788,"Sweetwater, Wyoming, US",2020-04-18,10,0
276492,56039.0,Teton,Wyoming,43.935225,-110.58908,"Teton, Wyoming, US",2020-04-18,62,0
276493,56041.0,Uinta,Wyoming,41.287818,-110.547578,"Uinta, Wyoming, US",2020-04-18,6,0
276494,56043.0,Washakie,Wyoming,43.904516,-107.680187,"Washakie, Wyoming, US",2020-04-18,5,0
276495,56045.0,Weston,Wyoming,43.839612,-104.567488,"Weston, Wyoming, US",2020-04-18,0,0


In [14]:
# Export kaggle df as a csv so it can be imported to postgres.
# index=True writes the freshly reset index as the first column.
kaggle_final3.to_csv("Clean_CSVs/county_data.csv", encoding="utf-8", index=True)

# Import CDC CSV With COVID Forecasts By State #

In [15]:
# Import csv file with forecast of potential deaths by state.
# Each row holds one model's forecast for one location and target week.
# Data was downloaded from https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
file2 = "Raw_Data/forecast_data_0420.csv"
forecast_raw = pd.read_csv(file2)
forecast_raw.tail()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1640,YYG,4/20/2020,4 wk ahead cum death,5/16/2020,West Virginia,75,54.0,110.0
1641,YYG,4/20/2020,1 wk ahead cum death,4/25/2020,Wyoming,4,4.0,4.0
1642,YYG,4/20/2020,2 wk ahead cum death,5/2/2020,Wyoming,8,7.0,9.0
1643,YYG,4/20/2020,3 wk ahead cum death,5/9/2020,Wyoming,11,10.0,14.0
1644,YYG,4/20/2020,4 wk ahead cum death,5/16/2020,Wyoming,15,13.0,19.0


In [16]:
# Keep only rows whose location is one of the names in us_states_list; this
# drops the whole-US rows and the US territories.
is_state_row = forecast_raw["location_name"].isin(us_states_list)
forecast_states = forecast_raw.loc[is_state_row, :]
forecast_states.head()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alabama,184,157.0,226.0
2,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alaska,15,15.0,18.0
3,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arizona,265,222.0,325.0
4,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arkansas,50,42.0,62.0
5,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,California,2012,1792.0,2520.0


In [17]:
# Drop rows with missing data (any row with a null in at least one column)
forecast_states = forecast_states.dropna()
# Every column should now report the same non-null count
forecast_states.count()

model                   1592
forecast_date           1592
target                  1592
target_week_end_date    1592
location_name           1592
point                   1592
quantile_0.025          1592
quantile_0.975          1592
dtype: int64

In [18]:
# Shorten the column names: target_week_end_date -> target_end_date,
# location_name -> state, point -> actual.
# NOTE(review): "point" looks like the model's point forecast rather than an
# observed value, so "actual" may be a misleading name - confirm before relying on it.
forecast_column_renames = {
    "target_week_end_date": "target_end_date",
    "location_name": "state",
    "point": "actual",
}
forecast_renamed = forecast_states.rename(columns=forecast_column_renames)
forecast_renamed.head(20)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alabama,184,157.0,226.0
2,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alaska,15,15.0,18.0
3,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arizona,265,222.0,325.0
4,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arkansas,50,42.0,62.0
5,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,California,2012,1792.0,2520.0
6,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Colorado,585,525.0,650.0
7,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Connecticut,2153,1748.0,2483.0
8,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Delaware,232,175.0,306.0
9,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Florida,1419,1162.0,1715.0
10,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Georgia,927,843.0,1026.0


In [19]:
# Check whether the two date columns are datetime or string (dtype "object")
forecast_renamed.dtypes

model               object
forecast_date       object
target              object
target_end_date     object
state               object
actual               int64
quantile_0.025     float64
quantile_0.975     float64
dtype: object

In [20]:
# Parse both date columns from "m/d/YYYY" strings into datetime objects
for date_column in ('forecast_date', 'target_end_date'):
    forecast_renamed[date_column] = pd.to_datetime(
        forecast_renamed[date_column], format='%m/%d/%Y'
    )
forecast_renamed.tail()

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1640,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,West Virginia,75,54.0,110.0
1641,YYG,2020-04-20,1 wk ahead cum death,2020-04-25,Wyoming,4,4.0,4.0
1642,YYG,2020-04-20,2 wk ahead cum death,2020-05-02,Wyoming,8,7.0,9.0
1643,YYG,2020-04-20,3 wk ahead cum death,2020-05-09,Wyoming,11,10.0,14.0
1644,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,Wyoming,15,13.0,19.0


In [21]:
# Verify forecast_date and target_end_date changed to datetime64[ns]
forecast_renamed.dtypes

model                      object
forecast_date      datetime64[ns]
target                     object
target_end_date    datetime64[ns]
state                      object
actual                      int64
quantile_0.025            float64
quantile_0.975            float64
dtype: object

In [22]:
# Reset index to use as primary key for the forecast table.
# drop=True discards the old row labels (which have gaps after filtering)
# instead of keeping them as an extra column.
forecast_final = forecast_renamed.reset_index(drop=True)
forecast_final.tail()

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1587,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,West Virginia,75,54.0,110.0
1588,YYG,2020-04-20,1 wk ahead cum death,2020-04-25,Wyoming,4,4.0,4.0
1589,YYG,2020-04-20,2 wk ahead cum death,2020-05-02,Wyoming,8,7.0,9.0
1590,YYG,2020-04-20,3 wk ahead cum death,2020-05-09,Wyoming,11,10.0,14.0
1591,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,Wyoming,15,13.0,19.0


In [23]:
# Export forecast df as a csv so it can be imported to postgres.
# index=True writes the freshly reset index as the first column.
forecast_final.to_csv("Clean_CSVs/forecast_cdc.csv", encoding="utf-8", index=True)

# Scraping Policy Dates And Hospital Resource Data #

In [24]:
#Define empty lists to store data for each state.
#covid19_healthdata_scrape() appends one dictionary per state to each list:
#policy dates go to the first, hospital resource figures to the second.
covid19_healthdate_dates_bystate = []
covid19_healthdate_resources_bystate = []

In [25]:
def _healthdata_count(text, unit):
    """Turn a figure such as '5,744 beds' into the integer 5744."""
    return int(text.replace(unit, '').replace(',', ''))


#function to scrape covid19.healthdata.org for each state
def covid19_healthdata_scrape(state):
    """Scrape policy dates and hospital resource figures for one state.

    Opens the state's page on covid19.healthdata.org in a Chrome browser
    driven by selenium, reads two fixed locations on the page, and appends
    one dictionary to each of the module-level lists
    ``covid19_healthdate_dates_bystate`` and
    ``covid19_healthdate_resources_bystate``.

    Parameters
    ----------
    state : str
        State name as it appears in ``us_states_list``; it is lower-cased and
        spaces are replaced by hyphens to build the page URL.

    Returns
    -------
    None
        Results are delivered through the two module-level lists.

    Raises
    ------
    IndexError
        If either block is not found on the page, or the policy-date block
        has fewer lines than expected.
    ValueError
        If a resource figure cannot be converted to an integer.
    """
    #formating the state to match url
    formatedstate = state.lower().replace(' ', '-')
    url = f"https://covid19.healthdata.org/united-states-of-america/{formatedstate}"
    #access the webpage using selenium
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        #after accessing the page, wait 5 seconds so the page can fully load before running next line of code
        time.sleep(5)
        #direct path to policy dates information since info is located at same place for every state
        #find_elements("xpath", ...) is used because the find_elements_by_xpath
        #shortcut no longer exists in current selenium releases
        dates_div = driver.find_elements("xpath", "/html/body/div/div/main/div[3]/div[1]/div[2]")
        dates_divs_text = [x.text for x in dates_div]
        #location of resources information
        resources_div = driver.find_elements("xpath", "/html/body/div/div/main/div[3]/div[3]/div[2]/div/div[2]/div[2]")
        resources_div_text = [x.text for x in resources_div]
    finally:
        #always shut the browser down, even when the page could not be read,
        #so a failed state does not leave a Chrome process behind
        driver.quit()

    if not dates_divs_text:
        raise IndexError(f"policy dates block not found for {state} ({url})")
    if not resources_div_text:
        raise IndexError(f"hospital resources block not found for {state} ({url})")

    #splitting up the data as they are all stored in the same div
    dates_text_split = dates_divs_text[0].split('\n')
    resources_div_split = resources_div_text[0].split('\n')

    #the values sit on the odd-numbered lines of the policy-date block
    #(the 'initual' spelling is kept because it becomes a column name in the exported csv)
    dates_data = {
        'state': state,
        'mass_gathering_restriction': dates_text_split[1],
        'initual_business_closure': dates_text_split[3],
        'educational_facilities_closure': dates_text_split[5],
        'non-essential_services_closure': dates_text_split[7],
        'stay_at_home_order': dates_text_split[9],
        'travel_severely_limited': dates_text_split[11]}

    #splitting the resources data, removing unnecessary words, converting them into integers, and storing them in a dictionary
    try:
        resources_data = {
            'state': state,
            'hospital_beds_needed': _healthdata_count(resources_div_split[1], 'beds'),
            'hospital_beds_available': _healthdata_count(resources_div_split[3], 'beds'),
            'hospital_beds_shortage': _healthdata_count(resources_div_split[5], 'beds'),
            'icu_beds_needed': _healthdata_count(resources_div_split[7], 'beds'),
            'icu_beds_available': _healthdata_count(resources_div_split[9], 'beds'),
            'icu_beds_shortage': _healthdata_count(resources_div_split[11], 'beds'),
            'ventilators_needed': _healthdata_count(resources_div_split[13], 'ventilators')}
    except IndexError:
        #shorter block: only the two "available" figures are read (lines 1 and 3),
        #the remaining fields are left as empty strings
        resources_data = {
            'state': state,
            'hospital_beds_needed': '',
            'hospital_beds_available': _healthdata_count(resources_div_split[1], 'beds'),
            'hospital_beds_shortage': '',
            'icu_beds_needed': '',
            'icu_beds_available': _healthdata_count(resources_div_split[3], 'beds'),
            'icu_beds_shortage': '',
            'ventilators_needed': ''}

    #push both dictionaries into their respective list
    covid19_healthdate_dates_bystate.append(dates_data)
    covid19_healthdate_resources_bystate.append(resources_data)

In [26]:
#Run the code to scrape covid19.healthdata site to grab data for the
#policy dates declaration and hospital resource information for each state
#and save them into list of dictionaries.
#The lists are emptied first so that re-running this cell does not append a
#second copy of every state.
covid19_healthdate_dates_bystate.clear()
covid19_healthdate_resources_bystate.clear()
for state in us_states_list:
    covid19_healthdata_scrape(state)

In [27]:
#Converting the policydates from a list of dictionaries to a pandas dataframe
#(one row per scraped state, one column per dictionary key)
policydates_df = pd.DataFrame(covid19_healthdate_dates_bystate)
#Converting the hospital resources from a list of dictionaries to a pandas dataframe
resource_df = pd.DataFrame(covid19_healthdate_resources_bystate)

In [28]:
#Setting the index for both dataframes to the state name.
#NOTE: this cell consumes the 'state' column, so running it a second time
#without rebuilding the dataframes raises a KeyError.
policydates_df = policydates_df.set_index('state')
resource_df = resource_df.set_index('state')

In [29]:
#Preview the hospital resource figures for the first few states
resource_df.head()

Unnamed: 0_level_0,hospital_beds_needed,hospital_beds_available,hospital_beds_shortage,icu_beds_needed,icu_beds_available,icu_beds_shortage,ventilators_needed
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,274,5744,0,80,475,0,74
Alaska,22,683,0,6,54,0,6
Arizona,496,6018,0,141,508,0,128
Arkansas,101,5006,0,26,395,0,23
California,2731,26654,0,722,1994,0,658


In [30]:
#Grabbing the name of each columns from the policydates dataframe
columns = policydates_df.columns


def _policy_text_to_date(value):
    """Convert one scraped policy-date string into a date object.

    Strings in the form 'March 19, 2020' become ``datetime.date`` values.
    Strings that do not match that form are returned with the text
    'Not implemented' removed, which leaves an empty string for policies a
    state has not implemented.
    """
    try:
        return pd.to_datetime(value, format='%B %d, %Y').date()
    except ValueError:
        return value.replace('Not implemented', '')


#Convert every column in one pass. Writing back a whole column replaces the
#old cell-by-cell `policydates_df[x][y] = ...` chained assignment, which
#pandas does not guarantee to write through to the dataframe and which relied
#on positional integer lookups on a state-name index.
for x in columns:
    policydates_df[x] = policydates_df[x].map(_policy_text_to_date)

In [31]:
#Preview the converted policy dates; blank cells are policies not implemented
policydates_df.head()

Unnamed: 0_level_0,mass_gathering_restriction,initual_business_closure,educational_facilities_closure,non-essential_services_closure,stay_at_home_order,travel_severely_limited
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,2020-03-19,2020-03-19,2020-03-19,2020-03-28,2020-04-04,
Alaska,2020-03-24,2020-03-17,2020-03-16,2020-03-28,2020-03-28,
Arizona,2020-03-30,,2020-03-16,,2020-03-30,
Arkansas,2020-03-27,2020-03-19,2020-03-17,,,
California,2020-03-11,2020-03-19,2020-03-19,2020-03-19,2020-03-19,


In [32]:
#Exporting the dataframes as csvs; the state index is written as the first column
policydates_df.to_csv('Clean_CSVs/covid19_policydates.csv')
resource_df.to_csv('Clean_CSVs/hospital_resources.csv')

# Scraping WorldOMeters #

# Making the requests and retrieving the HTML contents #

In [41]:
#the request (needs to be 200)
#A timeout keeps the notebook from hanging forever on a stalled connection.
result = requests.get("https://www.worldometers.info/coronavirus/country/us/", timeout=30)
#Stop here with an HTTPError on a 4xx/5xx response instead of parsing an error page
result.raise_for_status()

src = result.content
soup = BeautifulSoup(src, 'html.parser')

#every <table> element on the page
table = soup.find_all('table')

#filled by the next cell with the cleaned text of each table cell
state_list = []

# Retrieving the table data from the HTML and stripping unnecessary characters and tags #

In [42]:
#locate today's US table through its element id
table_data = soup.find(id="usa_table_countries_today")

#inline style values that identify the numeric cells to collect
bold_right_styles = [
    "font-weight: bold; text-align:right",
    "text-align:right;font-weight:bold;",
    "font-weight: bold; text-align:right;",
    "font-weight: bold; text-align:right;background-color:#FFEEAA;",
    "font-weight: bold; text-align:right;background-color:red; color:white",
]
table_data = table_data.find_all(style=bold_right_styles)
print(table_data)


#reduce every matched cell to its bare text: drop thousands separators and
#spaces, then trim surrounding newlines, and collect the result
for cell in table_data:
    cleaned = cell.text.replace(',', '').replace(' ', '').strip('\n')
    state_list.append(cleaned)

t">87</td>, <td style="text-align:right;font-weight:bold;">
66,094 </td>, <td style="font-weight: bold; text-align:right">22,114</td>, <td style="font-weight: bold; text-align:right">6,095 </td>, <td style="font-weight: bold; text-align:right;background-color:#FFEEAA;">
+214 </td>, <td style="font-weight: bold; text-align:right;">
244 </td>, <td style="font-weight: bold; text-align:right;background-color:red; color:white">+12 </td>, <td style="text-align:right;font-weight:bold;">
2,150 </td>, <td style="font-weight: bold; text-align:right">1,230</td>, <td style="font-weight: bold; text-align:right">49</td>, <td style="text-align:right;font-weight:bold;">
56,512 </td>, <td style="font-weight: bold; text-align:right">11,403</td>, <td style="font-weight: bold; text-align:right">5,136 </td>, <td style="font-weight: bold; text-align:right;background-color:#FFEEAA;">
+492 </td>, <td style="font-weight: bold; text-align:right;">
343 </td>, <td style="font-weight: bold; text-align:right;backgr

# Creating the row and column lists for the dataframe #

In [43]:
#row labels for the dataframe
#worldometer lists the states from most cases to least; this list reproduces
#the order shown on the site when the notebook was written
#NOTE(review): the site's ordering can change between runs, so this fixed
#order should be checked against the page before trusting the row labels
list_of_states = [
    'New York', 'New Jersey', 'Massachusetts', 'California', 'Pennsylvania',
    'Illinois', 'Michigan', 'Florida', 'Louisiana', 'Connecticut',
    'Texas', 'Georgia', 'Maryland', 'Ohio', 'Indiana',
    'Washington', 'Colorado', 'Virginia', 'Tennessee', 'North Carolina',
    'Missouri', 'Rhode Island', 'Alabama', 'Arizona', 'Mississippi',
    'Wisconsin', 'South Carolina', 'Nevada', 'Iowa', 'Utah',
    'Kentucky', 'District of Columbia', 'Delaware', 'Oklahoma', 'Minnesota',
    'Arkansas', 'Kansas', 'New Mexico', 'Oregon', 'Nebraska',
    'South Dakota', 'Idaho', 'New Hampshire', 'West Virginia', 'Maine',
    'Vermont', 'North Dakota', 'Hawaii', 'Wyoming', 'Montana',
    'Alaska',
]

#column labels for the dataframe, in the order the values appear per state
list_of_columns = [
    "Total_cases",
    "New_cases",
    "Total_deaths",
    "New_deaths",
    "Active_cases",
    "Cases_per_million",
    "Deaths_per_million",
    "Total_tests",
    "Tests_per_million",
]

In [44]:
#the table is reshaped into one row per state and one column per metric, so
#the number of values to keep follows from the two label lists (51 x 9 = 459)
n_states = len(list_of_states)
n_columns = len(list_of_columns)
n_values = n_states * n_columns

if len(state_list) < n_values:
    raise ValueError(
        f"expected at least {n_values} table values but scraped {len(state_list)}; "
        "the page layout may have changed"
    )

#intentionally cutting values off to eliminate US territories and cruise ships in the data
state_list = state_list[:n_values]

#turns the flat list of values into a dataframe with the rows and columns listed above
df = pd.DataFrame(
    np.array(state_list).reshape(n_states, n_columns),
    index=list_of_states,
    columns=list_of_columns,
)

df.index.name = "State" #sets index name

# Retrieving the news date and time and stripping tags and characters #

In [45]:
news_date = soup.find(style="font-size:13px; color:#999; text-align:center") #find the date and time in GMT time
news_date = news_date.text #strip the HTML tags

#remove the "Last updated: " label.
#str.strip("Last updated: ") would treat its argument as a set of characters
#to trim from both ends rather than as a prefix, so the label is removed explicitly.
last_updated_prefix = "Last updated: "
news_date = news_date.strip()
if news_date.startswith(last_updated_prefix):
    news_date = news_date[len(last_updated_prefix):]


#stripping the time string of spaces and colon
news_date = news_date.replace(' ', '')
news_date = news_date.replace(':', '')

# Creating CSV file name and loading date and time from HTML into title #

In [46]:
csv_string = "states - " + str(news_date) + ".csv"

#dictionary to convert month name to month number of the year

month_string_to_number = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April':'04',
    'May':'05',
    'June':'06',
    'July':'07',
    'August':'08',
    'September':'09',
    'October':'10',
    'November':'11',
    'December':'12'
    }


#pulling date and time info from news_date string
#The year and time are fixed-width, so they are read from the end of the
#string: the last 3 characters are skipped, the 4 before them are the time,
#and after one separator character the 4 before that are the year.
year_string = news_date[-12:-8]
GMTtime_string = int(news_date[-7:-3])

#Everything before the year's separator is "<month name><day>". The day can
#be one or two digits, so it is split off by trailing digits, not position.
month_and_day = news_date[:-13]
month_string = month_and_day.rstrip('0123456789')
month_number = month_string_to_number[month_string]
day_string = int(month_and_day[len(month_string):])

#Zero-pad the day so the value is always YYYY-MM-DD.
#NOTE(review): despite its name, this holds the date as published by the site
#(GMT); no conversion to Eastern time happens here.
ESTtime_string = year_string + "-" + month_number + "-" + f"{day_string:02d}"

#insert() fails if the column already exists, so overwrite it when this cell is re-run
if 'Date' in df.columns:
    df['Date'] = ESTtime_string
else:
    df.insert(0, 'Date', ESTtime_string)

# Stripping the final extraneous characters and creating CSV from dataframe #

In [47]:
#Strip the plus signs from the "new" columns.
#regex=False makes '+' a literal character on every pandas version; as a
#regular expression a lone '+' is not a valid pattern.
df['New_cases'] = df['New_cases'].str.replace('+', '', regex=False)
df['New_deaths'] = df['New_deaths'].str.replace('+', '', regex=False)

#writing the dataframe to a csv file named after the scraped date and time

df.to_csv(f"Clean_CSVs/{csv_string}")

In [48]:
#Display the finished per-state dataframe
df

Unnamed: 0_level_0,Date,Total_cases,New_cases,Total_deaths,New_deaths,Active_cases,Cases_per_million,Deaths_per_million,Total_tests,Tests_per_million
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
New York,2020-04-30,310839,4681.0,23780,306.0,237654,15844,1212,900636,45908
New Jersey,2020-04-30,118652,2388.0,7228,458.0,110153,13359,814,246934,27802
Massachusetts,2020-04-30,62205,1940.0,3562,157.0,50525,9107,522,275647,40357
California,2020-04-30,52918,2563.0,2355,140.0,49957,4127,184,269867,21048
Pennsylvania,2020-04-30,49840,1275.0,2014,75.0,44489,1273,51,603139,15406
Illinois,2020-04-30,47999,2271.0,2541,187.0,44693,3753,199,221365,17306
Michigan,2020-04-30,41379,980.0,3789,119.0,29248,4156,381,177228,17798
Florida,2020-04-30,33690,497.0,1268,50.0,31736,1636,62,384153,18650
Louisiana,2020-04-30,28455,889.0,802,53.0,15146,1020,29,330300,11845
Connecticut,2020-04-30,28001,341.0,1905,60.0,8793,6004,408,156568,33572
