In [1]:
import pandas as pd 
import csv  
import requests
import numpy as np
import os
from bs4 import BeautifulSoup
from selenium import webdriver
import time

ModuleNotFoundError: No module named 'selenium'

# Import Kaggle CSV With Cases & Deaths By State #

In [None]:
# Import Kaggle csv file with total cases and deaths by US state and county.
# Data was downloaded from https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
# The path is relative, so the notebook has to run from the directory that contains Raw_Data/.
file = "Raw_Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(file)
# Preview the first rows to confirm the file loaded as expected
kaggle_df.head()

In [None]:
# Keep only the rows whose iso2 code is "US" (this removes Guam, Virgin Islands, etc.)
kaggle_us = kaggle_df[kaggle_df["iso2"].eq("US")]

In [None]:
# Keep only the columns listed below; UID, iso2, iso3 and code3 are left out
kaggle_rem_cols = kaggle_us.loc[:, [
    "FIPS", "Admin2", "Province_State",
    "Lat", "Long_", "Combined_Key",
    "Date", "Confirmed", "Deaths",
]]

In [None]:
# Give three columns friendlier names: Admin2 -> County, Province_State -> State, Long_ -> Lng
kaggle_renamed = kaggle_rem_cols.rename(
    columns=dict(Admin2="County", Province_State="State", Long_="Lng")
)
# Show ten random rows to confirm the new column names
kaggle_renamed.sample(10)

In [None]:
# Check for missing values: count() reports the number of non-null entries per column,
# so a column with a lower count than the others contains missing data
kaggle_renamed.count()

In [None]:
# Discard every row that has at least one missing value, then re-count to confirm
kaggle_drop = kaggle_renamed.dropna(axis=0, how="any")
kaggle_drop.count()

In [None]:
# Check whether the Date column holds strings (object dtype) or datetime objects
kaggle_drop.dtypes

In [None]:
# Change dates from strings in month/day/two-digit-year form (format='%m/%d/%y')
# to datetime objects.
# kaggle_drop is the result of dropna() on a filtered frame, so writing a column
# into it in place can raise SettingWithCopyWarning; assign() returns a new frame
# with the converted column instead.
kaggle_drop = kaggle_drop.assign(
    Date=pd.to_datetime(kaggle_drop['Date'], format='%m/%d/%y')
)

In [None]:
# Check that the Date column's dtype changed after the conversion
kaggle_drop.dtypes

In [None]:
# Remove rows without a usable county name:
# first the "Unassigned" entries, then any County value containing "Out of"
kaggle_final = kaggle_drop[kaggle_drop["County"].ne("Unassigned")]
kaggle_final2 = kaggle_final.loc[~kaggle_final["County"].str.contains("Out of"), :]
kaggle_final2.tail(10)

In [None]:
# Reset index to use as primary key for county table.
# drop=True discards the old index instead of keeping it as a column.
kaggle_final3 = kaggle_final2.reset_index(drop=True)
kaggle_final3.tail()

In [None]:
# Export kaggle df as a csv so it can be imported to postgres.
# index=True writes the reset index as the first column of the file.
kaggle_final3.to_csv("Clean_CSVs/county_data.csv", encoding="utf-8", index=True)

# Import CDC CSV With COVID Forecasts By State #

In [None]:
# Import csv file with forecast of potential deaths by state.
# Data was downloaded from https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
# The path is relative, so the notebook has to run from the directory that contains Raw_Data/.
file2 = "Raw_Data/forecast_data_0413.csv"
forecast_raw = pd.read_csv(file2)
# Preview the last rows to confirm the file loaded as expected
forecast_raw.tail()

In [None]:
# Drop the nationwide "US" rows so that only the other locations remain
forecast_states = forecast_raw[forecast_raw["location_name"].ne("US")]
forecast_states.head()

In [None]:
# Drop rows with missing data (any row containing a null value is removed),
# then count the remaining non-null entries per column
forecast_states = forecast_states.dropna()
forecast_states.count()

In [None]:
# Rename three columns:
# target_week_end_date -> target_end_date, location_name -> state, point -> actual
forecast_renamed = forecast_states.rename(
    columns=dict(
        target_week_end_date="target_end_date",
        location_name="state",
        point="actual",
    )
)
forecast_renamed.head(20)

In [None]:
# Check whether the date columns are stored as datetime or as strings (object dtype)
forecast_renamed.dtypes

In [None]:
# Convert both date columns from month/day/four-digit-year strings to datetime objects
for date_col in ('forecast_date', 'target_end_date'):
    forecast_renamed[date_col] = pd.to_datetime(forecast_renamed[date_col], format='%m/%d/%Y')
forecast_renamed.tail()

In [None]:
# Verify that forecast_date and target_end_date changed to a datetime dtype
forecast_renamed.dtypes

In [None]:
# Reset index to use as primary key for the forecast table.
# drop=True discards the old index instead of keeping it as a column.
forecast_final = forecast_renamed.reset_index(drop=True)
forecast_final.tail()

In [None]:
# Export forecast df as a csv so it can be imported to postgres.
# index=True writes the reset index as the first column of the file.
forecast_final.to_csv("Clean_CSVs/forecast_cdc.csv", encoding="utf-8", index=True)

# Scraping Policy Dates And Hospital Resource Data #

In [None]:
#Define empty lists to store data for each state.
#covid19_healthdata_scrape() appends one dictionary per state to each list.
#NOTE: re-running this cell empties both lists again.
covid19_healthdate_dates_bystate = []
covid19_healthdate_resources_bystate = []

In [None]:
#The 50 states plus the District of Columbia, in alphabetical order.
#Each name is later lower-cased and hyphenated to build the state's page URL.
us_states_list = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
    'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
    'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
    'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
    'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
    'Wyoming',
]

In [None]:
#helper: turn text such as "1,234 beds" into the integer 1234
def _parse_count(text, unit):
    """Return the integer left in `text` after removing `unit` and thousands separators."""
    return int(text.replace(unit, '').replace(',', ''))


#function to scrape covid19.healthdata.org for each state
def covid19_healthdata_scrape(state):
    """Scrape policy dates and hospital resource figures for one state.

    Opens the state's page on covid19.healthdata.org in a Selenium-driven
    Chrome browser, reads the text of two fixed page regions, and appends one
    dictionary to each of the module-level lists
    ``covid19_healthdate_dates_bystate`` and
    ``covid19_healthdate_resources_bystate``. Nothing is returned.

    Args:
        state: State name, e.g. 'New York'. It is lower-cased and its spaces
            are replaced with hyphens to build the page URL.

    Raises:
        IndexError: If a page region is not found or holds fewer text lines
            than the fixed positions read below.
        ValueError: If a resource figure is not an integer once its unit word
            and commas are removed.
    """
    #formating the state to match url
    formatedstate = state.lower().replace(' ', '-')
    url = f"https://covid19.healthdata.org/united-states-of-america/{formatedstate}"
    #absolute XPaths of the two page regions; they depend on the exact page layout
    dates_xpath = "/html/body/div/div/main/div[3]/div[1]/div[2]"
    resources_xpath = "/html/body/div/div/main/div[3]/div[3]/div[2]/div/div[2]/div[2]"

    #access the webpage using selenium
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        #after accessing the page, wait 5 seconds so the page can fully load
        time.sleep(5)
        #find_elements("xpath", ...) is used because the find_elements_by_xpath
        #shortcut no longer exists in Selenium 4.3+, while this form works in 3 and 4
        dates_divs_text = [x.text for x in driver.find_elements("xpath", dates_xpath)]
        resources_div_text = [x.text for x in driver.find_elements("xpath", resources_xpath)]
    finally:
        #always shut the browser down, even when the page could not be read,
        #so a failed state does not leave a Chrome window/driver process behind
        driver.quit()

    #all values of a region are stored in the same div, one per line; the values
    #are read from the odd positions (1, 3, 5, ...) of the split text
    dates_text_split = dates_divs_text[0].split('\n')
    resources_div_split = resources_div_text[0].split('\n')

    #splitting the dates data and storing them in a dictionary
    #NOTE: 'initual_business_closure' is misspelled, but the key is kept as-is
    #because it becomes a column name in the exported CSV
    dates_data = {
        'state': state,
        'mass_gathering_restriction': dates_text_split[1],
        'initual_business_closure': dates_text_split[3],
        'educational_facilities_closure': dates_text_split[5],
        'non-Essential_services_closure': dates_text_split[7],
        'stay_at_home_order': dates_text_split[9],
        'travel_severely_limited': dates_text_split[11]}
    #splitting the resources data, removing unnecessary words, converting them
    #into integers, and storing them in a dictionary
    resources_data = {
        'state': state,
        'hospital_beds_needed': _parse_count(resources_div_split[1], 'beds'),
        'hospital_beds_available': _parse_count(resources_div_split[3], 'beds'),
        'hospital_beds_shortage': _parse_count(resources_div_split[5], 'beds'),
        'icu_beds_needed': _parse_count(resources_div_split[7], 'beds'),
        'icu_beds_available': _parse_count(resources_div_split[9], 'beds'),
        'icu_beds_shortage': _parse_count(resources_div_split[11], 'beds'),
        'ventilators_needed': _parse_count(resources_div_split[13], 'ventilators')
    }
    #push both dictionaries into their respective list
    covid19_healthdate_dates_bystate.append(dates_data)
    covid19_healthdate_resources_bystate.append(resources_data)

In [None]:
#Run the code to scrape covid19.healthdata site to grab data for the
#policy dates declaration and hospital resource information for each state
#and save them into list of dictionaries.
#The state names live in us_states_list; the previous name us_states was never
#defined, so this loop raised NameError on a fresh kernel.
for state in us_states_list:
    covid19_healthdata_scrape(state)

In [None]:
#Converting the policy dates from a list of dictionaries to a pandas dataframe
#(one row per dictionary, one column per dictionary key)
policydates_df = pd.DataFrame(covid19_healthdate_dates_bystate)
#Converting the hospital resources from a list of dictionaries to a pandas dataframe
resource_df = pd.DataFrame(covid19_healthdate_resources_bystate)

In [None]:
#Setting the index of both dataframes to the state name.
#NOTE: once this has run, 'state' is the index and no longer a column, so
#re-running this cell without rebuilding the dataframes first fails.
policydates_df = policydates_df.set_index('state')
resource_df = resource_df.set_index('state')

In [None]:
#Grabbing the name of each column from the policydates dataframe
columns = policydates_df.columns


def _policy_date_or_blank(value):
    """Parse a 'Month DD, YYYY' string into a date; blank out 'Not implemented'.

    Values that cannot be parsed are returned with the text 'Not implemented'
    replaced by an empty string, since some states have not implemented some policies.
    """
    try:
        #keep only the date portion of the parsed timestamp
        return pd.to_datetime(value, format='%B %d, %Y').date()
    except ValueError:
        return value.replace('Not implemented', '')


#Convert every policy column in one pass per column. This replaces the cell-by-cell
#policydates_df[x][y] = ... assignment, which relied on chained indexing with
#positional integers on a state-name index and on the undefined name us_states.
for x in columns:
    policydates_df[x] = policydates_df[x].map(_policy_date_or_blank)

In [None]:
#Exporting the dataframes as csvs.
#Both frames are indexed by state, so the state name is written as the first column.
policydates_df.to_csv('Clean_CSVs/covid19_policydates.csv')
resource_df.to_csv('Clean_CSVs/hospital_resources.csv')

# Scraping Worldometers #

In [None]:
import requests
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup

from datetime import datetime, timedelta
from pytz import timezone
import pytz

In [None]:
#Global page: https://www.worldometers.info/coronavirus/

#US page (the one scraped below): https://www.worldometers.info/coronavirus/country/us/

#The string below is only a note listing the fields to collect; because it is the
#last expression in the cell, Jupyter also echoes it as the cell output.
""" Worldometers.info	Web scrape	
Total cases, new cases(per day), total deaths, 
new deaths, active cases, total cases/1M pop, deaths/1M pop, 
total tests, tests/1M pop
 """



# Making the requests and retrieving the HTML contents #

In [None]:
#the request (needs to be 200)
#timeout keeps the cell from hanging forever if the site does not answer
result = requests.get("https://www.worldometers.info/coronavirus/country/us/", timeout=30)
#stop here with a clear HTTP error instead of parsing an error page further down
result.raise_for_status()

#parse the raw response body with the built-in HTML parser
src = result.content
soup = BeautifulSoup(src, 'html.parser')

table = soup.find_all('table')

#filled by the next cell with one cleaned string per table cell
state_list = []

# Retrieving the table data from the HTML and stripping unnecessary characters and tags #

In [None]:
#locate the US table by its element id
table_data = soup.find(id="usa_table_countries_today")

#keep only the elements whose inline style is exactly one of these variants
table_data = table_data.find_all(style=[
    "font-weight: bold; text-align:right",
    "text-align:right;font-weight:bold;",
    "font-weight: bold; text-align:right;",
    "font-weight: bold; text-align:right;background-color:#FFEEAA;",
    "font-weight: bold; text-align:right;background-color:red; color:white",
])
print(table_data)


#reduce every element to its text, then drop thousands separators, spaces
#and surrounding newlines before storing it
for cell in table_data:
    state_list.append(cell.text.replace(',', '').replace(' ', '').strip('\n'))

# Creating the row and column lists for the dataframe #

In [None]:
#row labels for the dataframe; they are applied by position, so entry i labels
#the i-th group of scraped values
list_of_states = [
    'New York', 'New Jersey', 'Massachusetts', 'California', 'Pennsylvania',
    'Illinois', 'Michigan', 'Florida', 'Louisiana', 'Connecticut',
    'Texas', 'Georgia', 'Maryland', 'Ohio', 'Indiana',
    'Washington', 'Colorado', 'Virginia', 'Tennessee', 'North Carolina',
    'Missouri', 'Rhode Island', 'Alabama', 'Arizona', 'Mississippi',
    'Wisconsin', 'South Carolina', 'Nevada', 'Iowa', 'Utah',
    'Kentucky', 'District Of Columbia', 'Delaware', 'Oklahoma', 'Minnesota',
    'Arkansas', 'Kansas', 'New Mexico', 'Oregon', 'Nebraska',
    'South Dakota', 'Idaho', 'New Hampshire', 'West Virginia', 'Maine',
    'Vermont', 'North Dakota', 'Hawaii', 'Wyoming', 'Montana',
    'Alaska',
]

#column labels for the dataframe, in the order the values are scraped
list_of_columns = [
    "Total_cases", "New_cases", "Total_deaths",
    "New_deaths", "Active_cases", "Cases_per_million",
    "Deaths_per_million", "Total_tests", "Tests_per_million",
]

In [None]:
#table dimensions come from the label lists (51 rows x 9 columns = 459 values)
#instead of being repeated as magic numbers
n_states = len(list_of_states)
n_columns = len(list_of_columns)

#intentionally cutting the trailing values off to eliminate US territories and cruise ships in the data
state_list = state_list[:n_states * n_columns]

#turns the flat list of values into a dataframe with one row per state and one column per metric
#NOTE: rows are labelled purely by position, so list_of_states must match the
#row order of the scraped table at the time of the request
df = pd.DataFrame(np.array(state_list).reshape(n_states, n_columns), index=list_of_states, columns=list_of_columns)

df.index.name = "State" #sets index name

# Retrieving the news date and time and stripping tags and characters #

In [None]:
news_date = soup.find(style="font-size:13px; color:#999; text-align:center") #find the date and time in GMT time
news_date = news_date.text.strip() #strip the HTML tags and surrounding whitespace

#remove the "Last updated: " label as a prefix.
#str.strip("Last updated: ") treats its argument as a set of characters to trim
#from both ends, not as a prefix; it only appeared to work because no month name
#starts with one of those characters.
last_updated_prefix = "Last updated: "
if news_date.startswith(last_updated_prefix):
    news_date = news_date[len(last_updated_prefix):]


#stripping the time string of spaces and colon
news_date = news_date.replace(' ', '')
news_date = news_date.replace(':', '')

# Creating CSV file name and loading date and time from HTML into title #

In [None]:
csv_string = "states - " + str(news_date) + ".csv"

#dictionary to convert month name to month number of the year

month_string_to_number = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April':'04',
    'May':'05',
    'June':'06',
    'July':'07',
    'August':'08',
    'September':'09',
    'October':'10',
    'November':'11',
    'December':'12'
    }


#pulling date and time info from news_date string
#The tail of the string has a fixed width, so year and time are still read by
#counting back from the end: 3 trailing characters, a 4-digit time, one
#separator, a 4-digit year, one separator.
year_string = news_date[-12:-8]
GMTtime_string = int(news_date[-7:-3])
#Everything before that tail is the month name followed by the day. The day may
#be one or two digits, so the month is found by trimming the trailing digits
#rather than by a fixed offset (which cut the month name short on one-digit days).
month_day = news_date[:-13]
month_string = month_day.rstrip('0123456789')
month_number = month_string_to_number[month_string]
day_string = int(month_day[len(month_string):])
#zero-pad the day so the value is always YYYY-MM-DD (previously e.g. "2020-04-4")
#NOTE: despite the variable name, no timezone conversion is applied to the scraped date
ESTtime_string = year_string + "-" + month_number + "-" + str(day_string).zfill(2)
df.insert(0, 'Date', ESTtime_string)

# Stripping the final extraneous characters and creating CSV from dataframe #

In [None]:
#Strip the plus signs from the "new" columns.
#regex=False makes '+' a literal character; as a regular expression a lone '+'
#is a quantifier, and the default for the regex argument differs between
#pandas versions, so it is spelled out here.
df['New_cases'] = df['New_cases'].str.replace('+', '', regex=False)
df['New_deaths'] = df['New_deaths'].str.replace('+', '', regex=False)

#writing the dataframe to a csv file named after the scraped date and time

df.to_csv(csv_string)