In [8]:
import pytz
import requests
import pandas as pd
from playwright.async_api import async_playwright
from datetime import datetime, date, timedelta
import altair as alt
import altair_latimes as lat

In [51]:
from vega_datasets import data

Scrape storage data of major reservoirs from https://cdec.water.ca.gov/dynamicapp/QueryDaily

---

List of reservoir IDs

In [9]:
reservoir_list = [
    "SHA", # Shasta
    "ORO", # Oroville
    "BUL", # New Bullards Bar
    "FOL", # Folsom
    "CLE", # Trinity
    "CMN", # Camanche
    "WRS", # Sonoma
    "SNL", #San Luis
    "NML", # New Melones
    "DNP", # Don Pedro
    "CCH", # Cachuma
    "CSI", # Casitas
    "EXC", # McClur
    "MIL", # Millerton
    "PNF", # Pine Flat
    "CAS", # Castaic
    "DMV", # Diamond Valley Lake
    # "MHW", # Lake Matthews
    # "SLW", # Silverwood Lake
    # "SKN", # Lake Skinner
    # "PRR", # Lake Perris
]

In [10]:
reservoir_list_enhanced = [
 'APN',
 'ANT',
 'AST',
 'BRT',
 'BAR',
 'BRV',
 'BRD',
 'BTH',
 'BLB',
 'BOC',
 'BMP',
 'BQC',
 'BWN',
 'BWS',
 'BDP',
 'BIO',
 'BHC',
 'BUC',
 'BIL',
 'BCL',
 'BTV',
 'CCH',
 'CVE',
 'CRO',
 'CMN',
 'CMI',
 'CFW',
 'ALM',
 'CPL',
 'CSI',
 'CAS',
 'SLW',
 'CHB',
 'CHV',
 'JNN',
 'CLK',
 'CLA',
 'CLC',
 'CGS',
 'CMB',
 'CTG',
 'CYC',
 'COY',
 'CNV',
 'CUY',
 'MHV',
 'DLV',
 'DMV',
 'DNP',
 'DON',
 'DNN',
 'DNL',
 'DRE',
 'EPK',
 'ELC',
 'ENR',
 'EJC',
 'ENG',
 'FRM',
 'FLR',
 'FOL',
 'FRL',
 'FMD',
 'FRD',
 'MIL',
 'GLK',
 'GBR',
 'GLL',
 'GBL',
 'PWL',
 'GDW',
 'GNT',
 'DAV',
 'GDR',
 'HWE',
 'HNS',
 'HID',
 'MEA',
 'HNT',
 'ICH',
 'INP',
 'INL',
 'MMW',
 'INV',
 'IRC',
 'ISB',
 'JCK',
 'ATN',
 'LNG',
 'JML',
 'JNC',
 'KNT',
 'KKR',
 'KES',
 'LGR',
 'LFY',
 'LGT',
 'LEA',
 'LKF',
 'HMT',
 'HNN',
 'HDG',
 'LVD',
 'THC',
 'LVY',
 'LRA',
 'LEW',
 'LGV',
 'LRK',
 'CRW',
 'LON',
 'LOP',
 'LBS',
 'LVQ',
 'LWB',
 'CRY',
 'HHL',
 'LYS',
 'SWB',
 'MPL',
 'MAR',
 'EDN',
 'MRT',
 'MAT',
 'MHW',
 'MCO',
 'MCS',
 'MMR',
 'MDO',
 'BER',
 'MOR',
 'MRR',
 'NCM',
 'NAT',
 'BUL',
 'MCR',
 'EXC',
 'NHG',
 'NML',
 'SPM',
 'NWL',
 'NCA',
 'ONF',
 'HTH',
 'OLH',
 'ORO',
 'LOT',
 'OWN',
 'PAR',
 'HVS',
 'PRR',
 'LPY',
 'PNF',
 'PT6',
 'PT7',
 'BIT',
 'PVP',
 'PRA',
 'PRS',
 'PYM',
 'QUL',
 'RLC',
 'RDN',
 'RLF',
 'RLL',
 'RBL',
 'RTD',
 'SDB',
 'SLN',
 'SLS',
 'SNN',
 'SAT',
 'SGB',
 'SNL',
 'LUS',
 'SLF',
 'SPB',
 'SVT',
 'PRU',
 'SGC',
 'SCD',
 'SFL',
 'SVO',
 'SHA',
 'SHV',
 'SIV',
 'SKN',
 'SLB',
 'SLC',
 'JNK',
 'SOL',
 'SLJ',
 'SKR',
 'SLK',
 'SPG',
 'SPC',
 'STP',
 'SWV',
 'SEC',
 'STG',
 'SCC',
 'STD',
 'SW3',
 'TRM',
 'TAB',
 'TMT',
 'THD',
 'TNM',
 'CLE',
 'TUL',
 'TLC',
 'TWT',
 'UNV',
 'KLM',
 'SJT',
 'USL',
 'UTI',
 'UVA',
 'VIL',
 'VAR',
 'TAE',
 'EDS',
 'VLP',
 'WRS',
 'WHR',
 'WHI',
 'WSN'
]

In [11]:
len(reservoir_list)

17

Get current date

In [12]:
tz = pytz.timezone("America/Los_Angeles")

In [13]:
today = datetime.now(tz).date()

In [14]:
async with async_playwright() as playwright:
    browser = await playwright.chromium.launch()
    context = await browser.new_context(accept_downloads=True)

    # Open new page
    page = await context.new_page()

    df_list = []

    # Query database
    for res in reservoir_list_enhanced:

            # url to query
            url = await page.goto(f"https://cdec.water.ca.gov/dynamicapp/QueryDaily?s={res}&end={today}")
            
            print(f"Downloading data for {res} from {url.url}")
            
            try:
                # get CSV download
                download_csv_button = "button.buttons-csv"
                await page.wait_for_selector(download_csv_button)
                await page.wait_for_function(
                    f"document.querySelector('{download_csv_button}').textContent"
                )

                # get reservoir name
                header = "h1"
                res_name = await page.text_content(header)  # .split(": ")[1]
                
                await page.wait_for_timeout(5000)

                # download!
                async with page.expect_download() as download_info:
                    await page.click(download_csv_button)
                download = await download_info.value

                # make dataframe
                path = await download.path()
                df = pd.read_csv(path)
                df.insert(0, "reservoir_name", res_name.split(" (")[0])
                df.insert(1, "reservoir_id", res)

                # append
                df_list.append(df)

                print("Done!")
            except:
                print("FAILED!")

    # Close context
    await context.close()
    # Close browser
    await browser.close() 

Downloading data for APN from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=APN&end=2022-06-07
Done!
Downloading data for ANT from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=ANT&end=2022-06-07
Done!
Downloading data for AST from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=AST&end=2022-06-07
FAILED!
Downloading data for BRT from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BRT&end=2022-06-07
Done!
Downloading data for BAR from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BAR&end=2022-06-07
Done!
Downloading data for BRV from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BRV&end=2022-06-07
Done!
Downloading data for BRD from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BRD&end=2022-06-07
Done!
Downloading data for BTH from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BTH&end=2022-06-07
FAILED!
Downloading data for BLB from https://cdec.water.ca.gov/dynamicapp/QueryDaily?s=BLB&end=2022-06-07
Done!
Downloading data for BOC from https://cdec.water.ca

In [15]:
concat = pd.concat(df_list)

In [16]:
concat.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

In [17]:
concat.head()

Unnamed: 0,reservoir_name,reservoir_id,DATE / TIME (PST),RES ELE FEET,STORAGE AF,RES CHG AF,Unnamed: 6,PPT INC INCHES,Unnamed: 8,RAIN INCHES,...,Unnamed: 24,Unnamed: 26,FNF CFS,WINDLEN MILES,SNOW WC INCHES,SNO ADJ INCHES,SNOW DP INCHES,PPTINC4 INCHES,Unnamed: 28,RES EL FEET
0,ALPINE LAKE,APN,05/08/2022,--,--,,,,,,...,,,,,,,,,,
1,ALPINE LAKE,APN,05/09/2022,--,--,,,,,,...,,,,,,,,,,
2,ALPINE LAKE,APN,05/10/2022,--,--,,,,,,...,,,,,,,,,,
3,ALPINE LAKE,APN,05/11/2022,--,--,,,,,,...,,,,,,,,,,
4,ALPINE LAKE,APN,05/12/2022,--,--,,,,,,...,,,,,,,,,,


In [18]:
trim = concat[["reservoir_name", "reservoir_id", "DATE / TIME (PST)", "STORAGE AF"]]

In [19]:
trim.columns = ["reservoir_name", "reservoir_id", "date", "storage_af"]

In [20]:
trim.head()

Unnamed: 0,reservoir_name,reservoir_id,date,storage_af
0,ALPINE LAKE,APN,05/08/2022,--
1,ALPINE LAKE,APN,05/09/2022,--
2,ALPINE LAKE,APN,05/10/2022,--
3,ALPINE LAKE,APN,05/11/2022,--
4,ALPINE LAKE,APN,05/12/2022,--


In [21]:
trim["date"] = pd.to_datetime(trim["date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim["date"] = pd.to_datetime(trim["date"])


In [22]:
trim["storage_af"] = trim["storage_af"].str.replace(",","")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim["storage_af"] = trim["storage_af"].str.replace(",","")


In [23]:
trim["storage_af"] = trim["storage_af"].replace("--", "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim["storage_af"] = trim["storage_af"].replace("--", "")


In [24]:
trim["storage_af"] = pd.to_numeric(trim.storage_af, errors='coerce').astype('Int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim["storage_af"] = pd.to_numeric(trim.storage_af, errors='coerce').astype('Int64')


In [25]:
trim[trim.date == "2022-06-05"].storage_af.sum()

21041828

In [26]:
len(trim.reservoir_name.unique())

201

In [31]:
trim.sort_values("storage_af", ascending=False)

Unnamed: 0,reservoir_name,reservoir_id,date,storage_af
0,OROVILLE DAM,ORO,2022-05-08,1942482
7,OROVILLE DAM,ORO,2022-05-15,1941551
1,OROVILLE DAM,ORO,2022-05-09,1940413
8,OROVILLE DAM,ORO,2022-05-16,1940000
14,OROVILLE DAM,ORO,2022-05-22,1939276
...,...,...,...,...
26,WISHON,WSN,2022-06-03,
27,WISHON,WSN,2022-06-04,
28,WISHON,WSN,2022-06-05,
29,WISHON,WSN,2022-06-06,


In [35]:
trim["storage_af"] = trim.groupby(["reservoir_id"])["storage_af"].ffill()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim["storage_af"] = trim.groupby(["reservoir_id"])["storage_af"].ffill()


In [86]:
latest = trim[trim.date == trim.date.max()]
latest

Unnamed: 0,reservoir_name,reservoir_id,date,storage_af
30,ALPINE LAKE,APN,2022-06-07,8805
30,ANTELOPE LAKE,ANT,2022-06-07,22453
30,BARRETT,BRT,2022-06-07,
30,BEAR,BAR,2022-06-07,2
30,BEAR VALLEY DAM,BRV,2022-06-07,
...,...,...,...,...
30,THOMAS A EDISON,TAE,2022-06-07,42979
30,WARM SPRINGS,WRS,2022-06-07,138378
30,WHALE ROCK,WHR,2022-06-07,
30,WHISKEYTOWN DAM,WHI,2022-06-07,237863


In [87]:
latest = latest.dropna(subset=['storage_af'])

In [88]:
latest.storage_af.sum()

21495072

In [107]:
latest.reservoir_name = latest.reservoir_name.str.title().str.replace(" Lake","").str.replace(" Reservoir","").str.replace("

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest.reservoir_name = latest.reservoir_name.str.title().str.replace(" Lake","").str.replace(" Reservoir","")


In [108]:
latest

Unnamed: 0,reservoir_name,reservoir_id,date,storage_af
30,Alpine,APN,2022-06-07,8805
30,Antelope,ANT,2022-06-07,22453
30,Bear,BAR,2022-06-07,2
30,Beardsley,BRD,2022-06-07,87707
30,Black Butte,BLB,2022-06-07,76709
...,...,...,...,...
30,Van Arsdale 24Hr Avg,VAR,2022-06-07,204
30,Thomas A Edison,TAE,2022-06-07,42979
30,Warm Springs,WRS,2022-06-07,138378
30,Whiskeytown Dam,WHI,2022-06-07,237863


In [109]:
metadata_df = pd.read_csv("../data/metadata/reservoirs-metadata-details.csv")

In [133]:
merge_latest = pd.merge(
    latest, 
    metadata_df[["name", "id","nearby_city","capacity", "lat", "lon"]], 
    how="left", 
    left_on=["reservoir_id"], 
    right_on=["id"]
)

In [135]:
merge_latest#[merge_latest.capacity.isna()]

Unnamed: 0,reservoir_name,reservoir_id,date,storage_af,name,id,nearby_city,capacity,lat,lon
0,Alpine,APN,2022-06-07,8805,Alpine,APN,STINSON BEACH,8892.0,37.940000°,-122.637000°
1,Antelope,ANT,2022-06-07,22453,Antelope,ANT,SUSANVILLE,22566.0,40.180000°,-120.607000°
2,Bear,BAR,2022-06-07,2,Bear,BAR,PLANADA,7700.0,37.367000°,-120.217000°
3,Beardsley,BRD,2022-06-07,87707,Beardsley,BRD,STRAWBERRY,97800.0,38.203000°,-120.075000°
4,Black Butte,BLB,2022-06-07,76709,Black Butte Rereg,BLB,ORLAND,143700.0,39.808000°,-122.329000°
...,...,...,...,...,...,...,...,...,...,...
167,Van Arsdale 24Hr Avg,VAR,2022-06-07,204,Van Arsdale,VAR,POTTER VALLEY,700.0,39.383900°,-123.102500°
168,Thomas A Edison,TAE,2022-06-07,42979,Vermilion Valley,TAE,BIG CREEK,125000.0,37.370000°,-118.987000°
169,Warm Springs,WRS,2022-06-07,138378,Warm Springs,WRS,GEYSERVILLE,381000.0,38.723000°,-123.010000°
170,Whiskeytown Dam,WHI,2022-06-07,237863,Whiskeytown,WHI,REDDING,241100.0,40.598000°,-122.537000°


In [136]:
merge_latest.storage_af.sum()

21514942

In [137]:
merge_latest.capacity.sum()

40543378.0

In [138]:
metadata_df.capacity.sum()

136467266.0

In [139]:
merge_latest.storage_af.sum()/merge_latest.capacity.sum()

0.5306647610862617

In [140]:
metadata_df[metadata_df.id == "USL"]

Unnamed: 0,id,name,lake,stream,capacity,url,elevation,basin,county,hydrologic_region,nearby_city,lat,lon,operator,maintenance
216,USL,Upper San Leandro,U San Leandro R,Alameda Creek,37960.0,https://cdec.water.ca.gov/dynamicapp/staMeta?s...,477ft,ALAMEDA CR,ALAMEDA,SAN FRANCISCO BAY,OAKLAND,37.764400°,-122.101600°,East Bay Municipal Utility District,.None Specified


In [146]:
merge_latest.lat = merge_latest.lat.str.replace("°","").astype(float)
merge_latest.lon = merge_latest.lon.str.replace("°","").astype(float)

In [158]:
data.us_10m.url

'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json'

In [176]:
states = alt.topo_feature(data.us_10m.url, feature='states')
# US states background
background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
).transform_filter((alt.datum.id == 6)).properties(
    width=600,
    height=1200
).project('albersUsa')

capacity = alt.Chart(merge_latest).mark_circle(opacity=0.6, color='#83c6e0').encode(
    longitude='lon:Q',
    latitude='lat:Q',
    size='capacity',
    tooltip='name',
).project(
    "albersUsa"
).properties(
    width=600,
    height=1200
)

storage = alt.Chart(merge_latest).mark_circle(color='#1281aa').encode(
    longitude='lon:Q',
    latitude='lat:Q',
    size='storage_af',
    tooltip='name'
).project(
    "albersUsa"
).properties(
    width=600,
    height=1200
)

background + capacity + storage