<a href="https://colab.research.google.com/github/dansjack/ad-450/blob/main/Week_3_Web_Scraping_(dj_winter_2022).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Scrape Worldometers' Coronavirus state-level data

In [55]:
import requests
from bs4 import BeautifulSoup

# Fetch the Worldometers US page and locate the per-state table.
# NOTE: the '#nav-yesterday' fragment is resolved by the browser and is never
# sent to the server; the element selected below is the "today" table.
url = 'https://www.worldometers.info/coronavirus/country/us/#nav-yesterday'

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of as a
# confusing parse failure further down.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
data = resp.text

soup = BeautifulSoup(data, 'html.parser')
table = soup.find(id="usa_table_countries_today")

# soup.find returns None when no element matches; fail loudly at the source
# rather than handing the string "None" to the HTML table parser later.
if table is None:
    raise ValueError(
        "Could not find element with id 'usa_table_countries_today' in the "
        "response from " + url
    )

## Load table into DataFrame, filter data 

In [56]:
import pandas as pd

from io import StringIO

# Parse the scraped <table> element into a DataFrame. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated in
# recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]

# state column should contain just the state name (no non-breaking spaces)
df['USAState'] = df['USAState'].str.replace('\xa0', ' ', regex=False)
df = df.set_index('USAState')

# remove whitespace char from column name
df = df.rename(columns={"Tot\xa0Cases/1M pop": "Tot Cases/1M pop"})

# exclude the '#', 'source', and 'projections' columns
df = df.drop(columns=['#', 'Source', 'Projections'])

# exclude rows for country totals
df = df.drop(["USA Total", "Total:"])

# clean data, NewCases column contains '+', ',' which makes the column
# impossible to sort later if not replaced.
# - astype(str) first: when the column holds no '+'/',' text at all it is
#   parsed as numeric and the .str accessor would raise AttributeError.
# - regex=False: '+' is a regex metacharacter, so request a literal match
#   explicitly instead of relying on the pandas-version-dependent default.
new_cases_text = (
    df['NewCases']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['NewCases'] = new_cases_text.astype(float)

# Missing values become 0 so every column sorts numerically. Note that 0 here
# can mean "not reported", which pulls the summary statistics downwards.
df = df.fillna(0)

## Print the first few rows of data

In [48]:
# Preview the first five cleaned rows (5 is also the pandas default).
df.head(n=5)

Unnamed: 0_level_0,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Population
USAState,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
California,7376878,0.0,78730.0,0.0,0.0,0.0,186699.0,1993.0,131160392.0,3319489.0,39512223.0
Texas,5864525,0.0,78270.0,0.0,0.0,0.0,202254.0,2699.0,55371082.0,1909619.0,28995881.0
Florida,5328573,0.0,63574.0,0.0,3665356.0,1599643.0,248098.0,2960.0,50280048.0,2341031.0,21477737.0
New York,4790487,0.0,63563.0,0.0,2592921.0,2134003.0,246252.0,3267.0,94956710.0,4881199.0,19453561.0
Illinois,2773362,0.0,33446.0,0.0,1924335.0,815581.0,218861.0,2639.0,49488107.0,3905367.0,12671821.0


## DataFrame summary statistics 



In [49]:
# Descriptive statistics for every numeric column.
summary_stats = df.describe()
# Display without the "count" row; summary_stats itself still contains it
# because the result of drop() is only shown, not assigned.
summary_stats.drop(index="count")

Unnamed: 0,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Population
mean,1133527.0,279.730159,14091.555556,1.984127,477943.5,197256.0,175776.793651,2055.444444,14305740.0,2355562.0,5263912.0
std,1454533.0,2220.289303,18106.483926,15.74852,722881.0,378315.1,88431.970228,1169.640664,22030570.0,2144199.0,7056634.0
min,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,212328.0,0.0,1880.0,0.0,0.0,0.0,152809.0,1317.5,2638481.0,1299232.0,823360.5
50%,681382.0,0.0,8736.0,0.0,178859.0,19968.0,209194.0,2523.0,6787993.0,2236708.0,3155070.0
75%,1448960.0,0.0,17832.5,0.0,581219.5,213510.0,233859.0,2993.0,17137200.0,2888398.0,6780696.0
max,7376878.0,17623.0,78730.0,125.0,3665356.0,2134003.0,308780.0,3602.0,131160400.0,14451500.0,39512220.0


## Top 5s

### New Cases

In [51]:
# Note: depending on when you look at the data, you'll see a lot of 0s
# Rank states by new cases (highest first) and keep only that column.
top_new_cases = (
    df.sort_values(by="NewCases", ascending=False)
      .loc[:, ["NewCases"]]
)
top_new_cases.head()

Unnamed: 0_level_0,NewCases
USAState,Unnamed: 1_level_1
Pennsylvania,17623.0
California,0.0
Maine,0.0
Oregon,0.0
New Mexico,0.0


### Total Deaths

In [57]:
# Rank states by cumulative deaths (highest first) and keep only that column.
top_total_deaths = (
    df.sort_values(by="TotalDeaths", ascending=False)
      .loc[:, ["TotalDeaths"]]
)
top_total_deaths.head()

Unnamed: 0_level_0,TotalDeaths
USAState,Unnamed: 1_level_1
California,78730.0
Texas,78270.0
Florida,63574.0
New York,63563.0
Pennsylvania,39487.0


### Total Cases / 1M Population

In [53]:
# Rank states by total cases per million residents (highest first) and keep
# only that column.
top_cases_per_m = (
    df.sort_values(by="Tot Cases/1M pop", ascending=False)
      .loc[:, ["Tot Cases/1M pop"]]
)
top_cases_per_m.head()

Unnamed: 0_level_0,Tot Cases/1M pop
USAState,Unnamed: 1_level_1
Rhode Island,308780.0
North Dakota,272944.0
Alaska,257827.0
Utah,257642.0
Tennessee,251062.0


### Total Deaths / 1M Population

In [54]:
# Rank states by deaths per million residents (highest first) and keep only
# that column.
top_deaths_per_m = (
    df.sort_values(by="Deaths/1M pop", ascending=False)
      .loc[:, ["Deaths/1M pop"]]
)
top_deaths_per_m.head()

Unnamed: 0_level_0,Deaths/1M pop
USAState,Unnamed: 1_level_1
Mississippi,3602.0
Arizona,3504.0
New Jersey,3447.0
Alabama,3431.0
Louisiana,3296.0
