In [130]:
from datetime import datetime, date, timedelta
from io import StringIO

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

In [131]:
# The profile pages are requested with verify=False further down; silence the
# InsecureRequestWarning so it does not flood the cell output.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

---

In [132]:
# Station codes of the reservoirs whose profile pages are scraped below.
# Order is significant only in that it fixes the row order of the outputs.
major_reservoir_list = [
    "ATN", "CCH", "GBL", "NCM", "SLN", "WHR", "CLE", "COY", "DRE", "LEW",
    "LPY", "WRS", "BDP", "BOC", "PRS", "STP", "THC", "ALM", "ANT", "BCL",
    "BER", "BIT", "BLB", "BTV", "BUL", "BWN", "CFW", "CLA", "CPL", "DAV",
    "ENG", "EPK", "FMD", "FOL", "FRD", "FRL", "HHL", "ICH", "INV", "IRC",
    "JCK", "KES", "LGV", "LON", "MCO", "MMW", "NAT", "ORO", "PT6", "PT7",
    "RLL", "SFL", "SHA", "SLB", "SLC", "BWS", "STG", "TMT", "UNV", "WHI",
    "APN", "BIO", "CHB", "CRY", "CVE", "CYC", "DLV", "HNN", "KNT", "LNG",
    "LRA", "NCA", "SAT", "SLJ", "SNN", "SPB", "USL", "BRD", "BUC", "CHV",
    "CMN", "CNV", "DNP", "DON", "ENR", "EXC", "FLR", "HID", "HNT", "HTH",
    "JNK", "LBS", "LVQ", "LWB", "MDO", "MIL", "MPL", "NHG", "NML", "ONF",
    "PAR", "RDN", "RLF", "SHV", "SLS", "SNL", "SPM", "SWB", "TAE", "TLC",
    "TUL", "BRT", "BRV", "CAS", "CGS", "CSI", "CUY", "ELC", "HDG", "HMT",
    "HNS", "JNN", "LOT", "LVD", "MAT", "MHW", "MMR", "MOR", "MRR", "PRR",
    "PRU", "PYM", "RLC", "SGB", "SGC", "SKN", "STD", "SVT", "SW3", "VIL",
    "CRW", "GLK", "GNT", "HWE", "SDB", "SKR", "SLW", "TNM", "CTG", "ISB",
    "PNF", "SCC", "TRM", "WSN",
    # interstate reservoirs
    "HVS", "MEA", "MHV", "PWL", "CLK", "GBR", "KLM",
]

Scrape historical averages for major reservoirs

Example: https://cdec.water.ca.gov/dynamicapp/profile?s=ORO&type=res

In [133]:
# Keys given to the values read from the reservoir information table, in the
# order they are read from the page.
RESERVOIR_META_FIELDS = (
    "reservoir_id",
    "dam_name",
    "lake_name",
    "stream_name",
    "capacity",
    "year_built",
    "year_fill",
    "start_year_avg",
    "end_year_avg",
)


def scrape_reservoir_profile(reservoir_id, timeout=30):
    """Fetch one CDEC reservoir profile page and parse its first two tables.

    Parameters
    ----------
    reservoir_id : str
        Station code substituted into the profile URL (e.g. "ORO").
    timeout : float, default 30
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    tuple of (dict, pandas.DataFrame)
        The dict maps each name in RESERVOIR_META_FIELDS to the raw text of
        the corresponding table cell. The frame has the columns ``month``,
        ``average_storage`` and ``reservoir_id``; rows missing a month or an
        average are dropped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not have the expected layout.
    """
    url = f"https://cdec.water.ca.gov/dynamicapp/profile?s={reservoir_id}&type=res"

    # NOTE(security): verify=False turns off TLS certificate validation.
    # Presumably the site's certificate chain failed to validate when this was
    # written -- TODO confirm and re-enable verification if it now passes.
    page = requests.get(url, verify=False, timeout=timeout)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    content = soup.find(id="main_content")
    if content is None:
        raise ValueError(f"{reservoir_id}: no #main_content element at {url}")

    tables = content.find_all("table")
    if len(tables) < 2:
        raise ValueError(
            f"{reservoir_id}: expected at least 2 tables at {url}, found {len(tables)}"
        )

    # first table: basic reservoir information
    # second table: monthly 30-year averages
    reservoir_information_table, monthly_averages_table = tables[0], tables[1]

    # The values are taken from the odd-indexed cells (1, 3, ..., 17);
    # presumably each is preceded by its label cell.
    value_cells = reservoir_information_table.find_all("td")[1::2]
    if len(value_cells) < len(RESERVOIR_META_FIELDS):
        raise ValueError(
            f"{reservoir_id}: reservoir information table has too few cells at {url}"
        )
    meta = {
        field: cell.text
        for field, cell in zip(RESERVOIR_META_FIELDS, value_cells)
    }

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    avgs_df = pd.read_html(StringIO(str(monthly_averages_table)))[0]
    avgs_df.columns = ["month", "average_storage"]
    avgs_df["reservoir_id"] = meta["reservoir_id"]

    # drop empty cells
    avgs_df = avgs_df.dropna(subset=["month", "average_storage"])

    return meta, avgs_df


dict_list = []

df_list = []

for r in major_reservoir_list:
    meta, avgs_df = scrape_reservoir_profile(r)
    dict_list.append(meta)
    df_list.append(avgs_df)

# Build the metadata frame once, after the loop, instead of rebuilding it from
# the growing list on every iteration.
meta_df = pd.DataFrame(dict_list)

In [148]:
# number of reservoirs for which metadata was collected
meta_df.shape[0]

161

Check if all the start years are 1991 and all the end years are 2020

In [None]:
# distinct first years of the averaging period
meta_df["start_year_avg"].unique()

In [142]:
# Ad-hoc inspection, left disabled: list the reservoirs whose averaging
# period starts in 1973.
#meta_df[meta_df.start_year_avg == "1973"]

In [136]:
# distinct last years of the averaging period
meta_df["end_year_avg"].unique()

array(['2020'], dtype=object)

In [143]:
# Ad-hoc inspection, left disabled: list the reservoirs whose averaging
# period ends in 1992.
#meta_df[meta_df.end_year_avg == "1992"]

In [157]:
# how many capacity strings mention "af"
mentions_af = meta_df["capacity"].str.contains("af")
len(meta_df.loc[mentions_af])

161

Make sure all capacity values are in acre feet

In [160]:
# text before the first space of the raw capacity string
capacity_parts = meta_df["capacity"].str.split(" ")
meta_df["capacity_value"] = capacity_parts.str.get(0)

In [162]:
# second space-separated token of the raw capacity string
capacity_parts = meta_df["capacity"].str.split(" ")
meta_df["capacity_unit"] = capacity_parts.str.get(1)

In [164]:
# distinct units found in the capacity column
meta_df["capacity_unit"].unique()

array(['af'], dtype=object)

Clean capacity value column

In [166]:
# Parse the number from the raw "capacity" text rather than from
# capacity_value itself, so this cell can be re-run: once capacity_value is
# numeric, calling .str on it would raise.
meta_df["capacity_value"] = pd.to_numeric(
    meta_df["capacity"]
    .str.split(" ")
    .str[0]
    .str.replace(",", "", regex=False)  # strip thousands separators
)

Mark interstate reservoirs

In [200]:
# start with every reservoir flagged as not interstate
meta_df.loc[:, "is_interstate"] = False

In [201]:
# flip the flag for the interstate station codes
interstate_ids = ["HVS", "MEA", "MHV", "PWL", "CLK", "GBR", "KLM"]
interstate_mask = meta_df["reservoir_id"].isin(interstate_ids)
meta_df.loc[interstate_mask, "is_interstate"] = True

In [202]:
# Total capacity of the reservoirs not flagged as interstate. Negate the
# boolean column directly instead of comparing it with `== False`.
meta_df.loc[~meta_df["is_interstate"], "capacity_value"].sum()

38118021

In [138]:
# Stack the per-reservoir monthly tables into one long frame.
# ignore_index=True renumbers the rows; without it every reservoir's table
# contributes the same row labels and label-based lookups become ambiguous.
historical_averages_df = pd.concat(df_list, ignore_index=True)

Make sure all average storage values are in acre-feet

In [174]:
# text before the first space of the raw average storage string
storage_parts = historical_averages_df["average_storage"].str.split(" ")
historical_averages_df["average_storage_value"] = storage_parts.str.get(0)

In [178]:
# second space-separated token of the raw average storage string
storage_parts = historical_averages_df["average_storage"].str.split(" ")
historical_averages_df["average_storage_unit"] = storage_parts.str.get(1)

In [180]:
# distinct units found in the average storage column
historical_averages_df["average_storage_unit"].unique()

array(['af'], dtype=object)

Clean average storage value column

In [181]:
# Parse the number from the raw "average_storage" text rather than from
# average_storage_value itself, so this cell can be re-run: once the column is
# numeric, calling .str on it would raise.
historical_averages_df["average_storage_value"] = pd.to_numeric(
    historical_averages_df["average_storage"]
    .str.split(" ")
    .str[0]
    .str.replace(",", "", regex=False)  # strip thousands separators
)

Keep only intrastate reservoirs by excluding the interstate ones, such as those on the Colorado River and some near/in Oregon

In [195]:
# rows for every reservoir except the interstate station codes
interstate_ids = ["HVS", "MEA", "MHV", "PWL", "CLK", "GBR", "KLM"]
is_interstate_row = historical_averages_df["reservoir_id"].isin(interstate_ids)
intrastate_res = historical_averages_df[~is_interstate_row]

In [204]:
# combined June average storage across the intrastate reservoirs
intrastate_res.loc[intrastate_res["month"] == "June", "average_storage_value"].sum()

27902990

In [205]:
# profile page address for each reservoir
profile_url_prefix = "https://cdec.water.ca.gov/dynamicapp/profile?s="
profile_url_suffix = "&type=res"
meta_df["url"] = profile_url_prefix + meta_df["reservoir_id"] + profile_url_suffix

In [206]:
# write the reservoir metadata, without the row index
meta_df.to_csv(
    path_or_buf="../data/metadata/reservoirs-details.csv",
    index=False,
)

In [207]:
# write the monthly historical averages, without the row index
historical_averages_df.to_csv(
    path_or_buf="../data/metadata/reservoirs-historical-averages.csv",
    index=False,
)