In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Download the webpage

In [2]:
url = "https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/Monkeypox-Data.aspx"

In [3]:
page = requests.get(url)

### Parse the webpage

In [4]:
soup = BeautifulSoup( page.content, 'html.parser' )

In [5]:
content = soup.find("div", id="DeltaPlaceHolderMain")

In [6]:
tables = content.find_all("table")

In [9]:
for tag in content.find_all(text=re.compile('Jurisdiction')):
    table = tag.findParent("table")

In [10]:
def safetxt(element):
    v = element.text.strip()
    v = v.replace("\u200b", "")
    return v

In [11]:
def safenumber(element):
    v = safetxt(element)
    v = v.replace(",", "")
    v = v.replace(" ", "")
    return v

In [12]:
dict_list = []

for row in table.find_all("tr"):
    cell_list = row.find_all("td")
    try:
        d = dict(
            lhj_name = safetxt( cell_list[0] ),
            cases = safenumber( cell_list[1] )        
        )
        dict_list.append(d)
    except Exception as e: 
        print(e)

list index out of range


### Parse date

In [61]:
date_container = content.find("div", id="WebPartWPQ5")

In [67]:
date_sentence = date_container.find('span', attrs={'class':'ms-rteStyle-Emphasis'}).string

In [80]:
date_str = date_sentence.split("Last updated ")[1].replace(".","")
date_str

### Convert to dataframe

In [27]:
df = pd.DataFrame(dict_list)

In [89]:
clean_date = pd.to_datetime(date_str).date()

In [90]:
df["date"] = clean_date

In [92]:
df

Unnamed: 0,lhj_name,cases,date
0,Los Angeles,431,2022-08-04
1,San Francisco,398,2022-08-04
2,Alameda,83,2022-08-04
3,Sacramento,63,2022-08-04
4,Santa Clara,61,2022-08-04
5,San Diego,54,2022-08-04
6,Riverside,35,2022-08-04
7,Contra Costa,28,2022-08-04
8,Orange,24,2022-08-04
9,Long Beach,23,2022-08-04


### Export

In [28]:
df.to_csv("raw/cdph/latest.csv", index=False)

In [93]:
df.to_csv(f"raw/cdph/{clean_date}.csv", index=False)