In [217]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [218]:
# Register the theme object from altair_latimes under the name 'latimes',
# then make it the active Altair theme.
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [219]:
# Display floats with three decimal places instead of scientific notation.
pd.set_option("display.float_format", "{:.3f}".format)

### Import

In [220]:
# Load the latest reservoir scrape; the "DATE TIME" and "OBS DATE" columns
# are parsed as datetimes on read.
df = pd.read_csv(
    "../data/raw/reservoirs/major-reservoir-scrape-latest.csv", 
    parse_dates=["DATE TIME", "OBS DATE"]
)

In [221]:
# Load the historical storage averages; joined to the readings further down
# on reservoir_id and month.
hist_df = pd.read_csv(
    "../data/metadata/reservoirs-historical-averages.csv"
)

In [222]:
# Load reservoir metadata (lake name, capacity, coordinates, etc.).
details_df = pd.read_csv(
    "../data/metadata/reservoirs-metadata-details.csv"
)

### Clean

In [223]:
# Lowercase the scraped column names; later cells reference them in lowercase.
df.columns = df.columns.str.lower()

In [224]:
# Swap spaces in column names for underscores (literal match, not a regex).
df.columns = df.columns.str.replace(" ", "_", regex=False)

In [225]:
# Give the scraped columns the names used in the rest of the notebook.
df = df.rename(
    {
        "station_id": "reservoir_id",
        "date_time": "date",
        "value": "storage_af",
    },
    axis="columns",
)

In [226]:
# Blank out the "---" placeholder strings (literal match) so the column can be
# parsed as numbers in the next cell.
df["storage_af"] = df["storage_af"].str.replace("---", "", regex=False)

In [227]:
# Convert storage readings from strings to numbers; blank strings become NaN.
df["storage_af"] = pd.to_numeric(df["storage_af"])

In [228]:
# Strip the degree symbol from both coordinate columns (values stay as strings).
for coord in ("lat", "lon"):
    details_df[coord] = details_df[coord].str.replace("°", "", regex=False)

### Merge historical average

Create month column for merging

In [229]:
# Month number (1-12) of each reading, used as a join key against hist_df.
df["month"] = pd.to_datetime(df["date"]).dt.month

In [230]:
# Parse the month column as full month names (format '%B') and replace it with
# the month number, so it can be matched against df["month"] in the merge below.
hist_df["month"] = pd.to_datetime(hist_df["month"], format='%B').dt.month

In [231]:
# Attach the historical average for the same reservoir and calendar month to
# every reading. Left join: readings without a matching average are kept.
merge_historical = df[
    ["reservoir_id", "date", "month", "storage_af", "units"]
].merge(
    hist_df[["reservoir_id", "month", "average_storage_value", "average_storage_unit"]],
    on=["reservoir_id", "month"],
    how="left",
)

In [232]:
# Inspect the available metadata columns before choosing which ones to merge.
details_df.columns

Index(['id', 'name', 'lake', 'stream', 'capacity', 'url', 'elevation', 'basin',
       'county', 'hydrologic_region', 'nearby_city', 'lat', 'lon', 'operator',
       'maintenance'],
      dtype='object')

In [233]:
# Add lake name, capacity and coordinates for each reservoir (left join on the
# reservoir id), then discard the duplicate key and the unit columns.
merge_details = (
    merge_historical
    .merge(
        details_df[["id", "lake", "capacity", "lat", "lon"]],
        left_on="reservoir_id",
        right_on="id",
        how="left",
    )
    .drop(columns=["id", "units", "average_storage_unit"])
)

### Calculate percentages

In [234]:
# Current storage as a fraction of total capacity (a ratio, not scaled to 100).
merge_details["current_level_pct_of_total"] = merge_details["storage_af"].div(
    merge_details["capacity"]
)

In [235]:
# Historical average storage as a fraction of total capacity.
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"].div(
    merge_details["capacity"]
)

In [236]:
# Current storage as a fraction of the historical average for the same month.
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"].div(
    merge_details["average_storage_value"]
)

### Check out latest data

First, drop rows that have no storage value.

In [237]:
# Keep only the readings that actually have a storage value.
drop_na_df = merge_details[merge_details["storage_af"].notna()]

In [238]:
# Time zone used below to work out the current calendar date.
tz = pytz.timezone("America/Los_Angeles")

In [239]:
# Current calendar date in the tz time zone (not the machine's local zone).
today = datetime.now(tz).date()
today

datetime.date(2022, 6, 20)

In [240]:
# The day before `today`; used below to select the latest readings.
yesterday = today - timedelta(days=1)
yesterday

datetime.date(2022, 6, 19)

In [241]:
# Readings stamped exactly at midnight of `yesterday`, copied so later edits
# don't touch drop_na_df.
latest_df = drop_na_df.loc[
    drop_na_df["date"].eq(pd.Timestamp(yesterday))
].copy()

In [242]:
# Number of rows with a reading for yesterday.
latest_df.shape[0]

138

In [243]:
# Total storage across all of yesterday's readings.
latest_df["storage_af"].sum()

17894232.0

### Filter to just "major" reservoirs

In [245]:
# Station IDs of the reservoirs treated as "major" in this notebook.
# src: https://cdec.water.ca.gov/resapp/RescondMain
# IDs that were left commented out of this list (not included):
# LUS (San Luis Reservoir), BUL, CMN, EXC (Lake McClure), WRS, CCH, CAS, CSI, DMV, MIL, PNF
major_reservoirs_list = [
    "SHA",  # Shasta
    "ORO",  # Oroville
    "CLE",  # Trinity Lake
    "NML",  # New Melones Reservoir
    "SNL",  # San Luis
    "DNP",  # New Don Pedro
    "BER",  # Berryessa
    "ALM",  # Lake Almanor
    "FOL",  # Folsom
]

In [246]:
# Restrict yesterday's readings to the station IDs in major_reservoirs_list.
major_reservoirs = latest_df.loc[
    latest_df["reservoir_id"].isin(major_reservoirs_list)
]

Drop reservoirs that are missing either a current storage value or a historical average.

In [248]:
# Keep only reservoirs that have both a current reading and a historical average.
drop_na = major_reservoirs[
    major_reservoirs[["storage_af", "average_storage_value"]].notna().all(axis=1)
]

In [249]:
# How many major reservoirs remain after dropping incomplete rows.
drop_na.shape[0]

8

In [250]:
# Reshape to long form: one row per (reservoir, measure) with the measure's
# column name in `variable` and its number in `value`.
melt = drop_na.melt(
    id_vars="reservoir_id",
    value_vars=["storage_af", "average_storage_value", "capacity"],
)

In [251]:
# Rank for each melted measure (1 = storage_af, 2 = average, 3 = capacity);
# the charts below use it as their `order` channel.
bar_order = {
    measure: rank
    for rank, measure in enumerate(
        ["storage_af", "average_storage_value", "capacity"], start=1
    )
}

In [252]:
# Tag every melted row with its measure's rank from bar_order.
melt["bar_order"] = melt["variable"].map(bar_order)

In [253]:
# Normalized stacked bars, one per reservoir, showing current storage,
# historical average and capacity.
# domain/range_ pin a fixed color to each measure name.
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt).mark_bar().encode(
    # x-axis labels are hidden; the reservoir id is available via the tooltip
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    # stack segments in the order given by the bar_order column
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="California's largest reservoirs")

In [254]:
# Same normalized stacked chart as the previous cell, but without the capacity
# segment, so only current storage and the historical average are compared.
#
# The melted `variable` column holds the original column names ("storage_af",
# "average_storage_value", "capacity"). Filtering on "capacity_value" matched
# nothing, so no rows were excluded; the filter now uses "capacity".
# The color scale lists only the two remaining measures, with the same colors
# they have in the other charts.
domain = ["average_storage_value", "storage_af"]
range_ = ['#83c6e0', '#1281aa']

alt.Chart(melt[melt.variable != "capacity"]).mark_bar().encode(
    # x-axis labels are hidden; the reservoir id is available via the tooltip
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value', stack="normalize"),
    # stack segments in the order given by the bar_order column
    order="bar_order",
    color=alt.Color(
        'variable',
        scale=alt.Scale(domain=domain, range=range_),
    ),
    tooltip=["reservoir_id"]
).properties(title="California's largest reservoirs")

In [255]:
# Single-reservoir version of the normalized stacked chart: only rows for
# station "ORO", drawn 150 wide and titled "Lake Oroville".
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt[melt.reservoir_id=="ORO"]).mark_bar().encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    # stack segments in the order given by the bar_order column
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="Lake Oroville", width=150)

### Export

Trim and rename for export

In [256]:
# Build the export table: apply the public-facing column names first, then
# keep only the export columns in their final order.
export_df = drop_na.rename(columns={
    'lake': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity': 'total_capacity',
})[[
    'reservoir_id',
    'reservoir_name',
    'date',
    'storage_af',
    'historical_average',
    'total_capacity',
    'current_level_pct_of_total',
    'average_level_pct_of_total',
    'current_level_pct_of_avg',
    'lat',
    'lon',
]]

In [257]:
# Write the export table to CSV, leaving out the DataFrame index.
export_df.to_csv("../data/processed/reservoirs/reservoirs-latest.csv", index=False)