In [1]:
import pytz
import requests
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from datetime import datetime, date, timedelta
import altair as alt
import altair_latimes as lat

In [2]:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

---

Scrape metadata of reservoirs serving California from https://cdec.water.ca.gov/reportapp/javareports?name=ResInfo

In [3]:
url = "https://cdec.water.ca.gov/reportapp/javareports?name=ResInfo"

In [4]:
page = requests.get(url, verify=False)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
table = soup.find(id='RESINFO_LIST')

In [7]:
def safetxt(element):
    v = element.text.strip()
    v = v.replace("\u200b", "")
    return v

In [8]:
def safenumber(element):
    v = safetxt(element)
    v = v.replace(",", "")
    v = v.replace(" ", "")
    return v

In [9]:
def parse_table(soup):
    #tbody = soup.tbody
    row_list = table.find_all("tr")[1:]
    dict_list = []
    for row in row_list:
        cell_list = row.find_all("td")
        d = dict(
            id=safetxt(cell_list[0]),
            dam=safetxt(cell_list[1]),
            lake=safetxt(cell_list[2]),
            stream=safetxt(cell_list[3]),
            capacity=safenumber(cell_list[4]),
        )
        dict_list.append(d)
    df = pd.DataFrame(dict_list)
    df["date"] = latest_date
    return df

In [10]:
row_list = table.find_all("tr")[2:]
dict_list = []
for row in row_list:
    cell_list = row.find_all("td")
    d = dict(
        id=safetxt(cell_list[0]),
        name=safetxt(cell_list[1]),
        lake=safetxt(cell_list[2]),
        stream=safetxt(cell_list[3]),
        capacity=safenumber(cell_list[4]),
    )
    dict_list.append(d)

In [11]:
metadata_df = pd.DataFrame(dict_list)

In [12]:
metadata_df["url"] = "https://cdec.water.ca.gov/dynamicapp/staMeta?station_id=" + metadata_df["id"]

In [13]:
metadata_df.to_csv("../data/metadata/reservoirs-metadata.csv", index=False)

---
Scrape detail pages for each reservoir

In [14]:
reservoir_urls = list(metadata_df.url)

In [15]:
dict_list = []
for url in reservoir_urls:
    page = requests.get(url, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    # lol
    table = soup.find(border='1')
    try:
        cell_list = table.find_all("td")
        print(safetxt(cell_list[1]))        
        d = dict(
            id=safetxt(cell_list[1]),
            elevation=safenumber(cell_list[3]),
            basin=safetxt(cell_list[5]),
            county=safetxt(cell_list[7]),
            hydrologic_region=safetxt(cell_list[9]),
            nearby_city=safetxt(cell_list[11]),
            lat=safetxt(cell_list[13]),
            lon=safetxt(cell_list[15]),
            operator=safetxt(cell_list[17]),
            maintenance=safetxt(cell_list[19]),
        )
        dict_list.append(d)
    
    except AttributeError:
        pass

APN
ANT
AST
BRT
BAR
BRV
BRD
BTH
BLB
BOC
BMP
BQC
BWN
BWS
BDP
BIO
BHC
BUC
BIL
BCL
BTV
CCH
CVE
CRO
CMN
CMI
CFW
ALM
CPL
CSI
CAS
SLW
CHB
CHV
JNN
CLK
CLA
CLC
CGS
CMB
CTG
CYC
COY
CNV
CUY
MHV
DLV
DMV
DNP
DON
DNN
DNL
DRE
EPK
ELC
ENR
EJC
ENG
FRM
FLR
FOL
FRL
FMD
FRD
MIL
GLK
GBR
GLL
GBL
PWL
GDW
GNT
DAV
GDR
HWE
HNS
HID
MEA
HNT
ICH
INP
INL
MMW
INV
IRC
ISB
JCK
ATN
LNG
JML
JNC
KNT
KKR
KES
LGR
LFY
LGT
LEA
LKF
HMT
HNN
HDG
LVD
THC
LVY
LRA
LEW
LGV
LRK
CRW
LON
LOP
LBS
LVQ
LWB
CRY
HHL
LYS
SWB
MPL
MAR
EDN
MRT
MAT
MHW
MCO
MCS
MMR
MDO
BER
MOR
MRR
NCM
NAT
BUL
EXC
NHG
NML
SPM
NWL
NCA
ONF
HTH
OLH
ORO
LOT
OWN
PAR
HVS
PRR
LPY
PNF
PT6
PT7
BIT
PVP
PRA
PRS
PYM
QUL
RLC
RDN
RLF
RLL
RBL
RTD
SDB
SLN
SLS
SNN
SAT
SGB
SNL
LUS
SLF
SPB
SVT
PRU
SGC
SCD
SFL
SVO
SHA
SHV
SIV
SKN
SLB
SLC
JNK
SOL
SLJ
SKR
SLK
SPG
SPC
STP
SWV
SEC
STG
SCC
STD
SW3
TRM
TAB
TMT
THD
TNM
CLE
TUL
TLC
TWT
UNV
KLM
SJT
USL
UTI
UVA
VIL
VAR
TAE
EDN
VLP
WRS
WHR
WHI
WSN


In [16]:
len(dict_list)

226

In [17]:
df_list = []
for d in dict_list:
    df = pd.DataFrame(d, index=[0])
    df_list.append(df)

In [18]:
df_details = pd.concat(df_list)

In [19]:
df_details.to_csv("../data/metadata/reservoirs-details.csv", index=False)

---
### Merge

In [20]:
merge = pd.merge(metadata_df, df_details, how="left", on="id")

In [21]:
len(metadata_df)

227

In [24]:
len(df_details)

226

In [30]:
merge[merge.hydrologic_region.isna()]

Unnamed: 0,id,name,lake,stream,capacity,url,elevation,basin,county,hydrologic_region,nearby_city,lat,lon,operator,maintenance
136,MCR,New Exchequer,Lake McClure,Merced River,1024600,https://cdec.water.ca.gov/dynamicapp/staMeta?s...,,,,,,,,,
222,EDS,Vermillion Valley,Thomas A. Edison Lake,S Fork San Joaquin River,125000,https://cdec.water.ca.gov/dynamicapp/staMeta?s...,,,,,,,,,


In [31]:
merge.to_csv("../data/metadata/reservoirs-metadata-details.csv", index=False)