In [35]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [36]:
# Register the theme object exposed by altair_latimes under the name 'latimes',
# then make it the active Altair theme for every chart drawn below.
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [37]:
# Render floats with exactly three decimal places in DataFrame output.
pd.set_option("display.float_format", "{:.3f}".format)

### Import

In [38]:
# Latest scrape of reservoir storage readings; both timestamp columns are
# parsed into datetimes while loading.
raw_scrape_path = "../data/raw/reservoirs/major-reservoir-scrape-latest.csv"
df = pd.read_csv(raw_scrape_path, parse_dates=["DATE TIME", "OBS DATE"])

In [39]:
# Historical average storage table, later joined on reservoir_id + month.
historical_averages_path = "../data/metadata/reservoirs-historical-averages.csv"
hist_df = pd.read_csv(historical_averages_path)

In [40]:
# Reservoir metadata (id, lake name, capacity, lat, lon are used below).
reservoir_details_path = "../data/metadata/reservoirs-metadata-details.csv"
details_df = pd.read_csv(reservoir_details_path)

### Clean

In [41]:
# Lower-case every column header.
df.columns = df.columns.map(str.lower)

In [42]:
# Swap spaces for underscores in every column header.
df.columns = [header.replace(" ", "_") for header in df.columns]

In [43]:
# Give the scraped columns the names the rest of the notebook uses.
column_aliases = {
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af",
}
df = df.rename(columns=column_aliases)

In [48]:
# Remove the literal tokens "---" and "ART" from the storage readings so the
# column can be converted to numbers afterwards. Both are matched as plain
# text, not as regular expressions.
df["storage_af"] = (
    df["storage_af"]
    .str.replace("---", "", regex=False)
    .str.replace("ART", "", regex=False)
)

In [49]:
# Eyeball the column after stripping the tokens. storage_af is still text at
# this point, so the ordering shown is lexicographic, not numeric
# (e.g. "9977" sorts between "99734" and "99800").
df.sort_values("storage_af")

Unnamed: 0,reservoir_id,duration,sensor_number,sensor_type,date,obs_date,storage_af,data_flag,units
6495,SLW,D,15,STORAGE,2023-01-12,2023-01-12,,,AF
1899,OLH,D,15,STORAGE,2022-12-23,2022-12-23,,,AF
1898,OLH,D,15,STORAGE,2022-12-22,2022-12-22,,,AF
1897,OLH,D,15,STORAGE,2022-12-21,2022-12-21,,,AF
1896,OLH,D,15,STORAGE,2022-12-20,2022-12-20,,,AF
...,...,...,...,...,...,...,...,...,...
1122,NCM,D,15,STORAGE,2022-12-14,2022-12-14,99734,,AF
1015,UVA,D,15,STORAGE,2023-01-04,2023-01-04,9977,,AF
5950,STP,D,15,STORAGE,2023-01-11,2023-01-12,99800,,AF
6037,PRS,D,15,STORAGE,2023-01-02,2023-01-03,9987,,AF


In [50]:
# Convert the cleaned text readings to numbers; the blanked-out entries
# become NaN and are dropped further down via dropna.
df["storage_af"] = df["storage_af"].pipe(pd.to_numeric)

In [51]:
# Strip the degree sign from both coordinate columns. The values are left as
# text; nothing here converts them to floats.
for coord_col in ("lat", "lon"):
    details_df[coord_col] = details_df[coord_col].str.replace("°", "", regex=False)

### Merge historical average

Create month column for merging

In [52]:
# Calendar month (1-12) of each reading, used as a merge key below.
df["month"] = pd.to_datetime(df["date"]).dt.month

In [53]:
# Turn full month names ("January", ...) into month numbers (1-12) so the
# column lines up with df["month"].
# The cell overwrites its own input, so it is guarded: once the column is
# numeric the conversion is skipped, which makes re-running the cell safe.
if not pd.api.types.is_numeric_dtype(hist_df["month"]):
    hist_df["month"] = pd.to_datetime(hist_df["month"], format="%B").dt.month

In [54]:
# Attach the historical average for the matching reservoir and month to each
# reading. A left join keeps every reading even when no average exists.
reading_cols = ["reservoir_id", "date", "month", "storage_af", "units"]
average_cols = ["reservoir_id", "month", "average_storage_value", "average_storage_unit"]

merge_historical = df[reading_cols].merge(
    hist_df[average_cols],
    how="left",
    on=["reservoir_id", "month"],
)

In [55]:
# Add lake name, capacity and coordinates from the metadata table, then drop
# the duplicate key and the two unit columns that are not needed downstream.
detail_cols = ["id", "lake", "capacity", "lat", "lon"]

merge_details = merge_historical.merge(
    details_df[detail_cols],
    how="left",
    left_on=["reservoir_id"],
    right_on=["id"],
).drop(columns=["id", "units", "average_storage_unit"])

### Calculate percentages

In [56]:
# Current storage as a share of capacity. Despite the "pct" name this is a
# ratio (0.551 means 55.1%).
merge_details["current_level_pct_of_total"] = merge_details["storage_af"].div(
    merge_details["capacity"]
)

In [57]:
# Historical average storage as a share of capacity (a ratio, not 0-100).
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"].div(
    merge_details["capacity"]
)

In [58]:
# Current storage relative to the historical average (a ratio, not 0-100).
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"].div(
    merge_details["average_storage_value"]
)

### Filter to just "major" reservoirs

In [59]:
# src: https://cdec.water.ca.gov/resapp/RescondMain
# Station IDs kept by the "major reservoirs" filter below.
# Inactive (commented-out) IDs retained for reference:
#   LUS (San Luis Reservoir), BUL, CMN, EXC (Lake McClure), WRS, CCH, CAS,
#   CSI, DMV, MIL, PNF
major_reservoirs_list = [
    "SHA",  # Shasta
    "ORO",  # Oroville
    "CLE",  # Trinity Lake
    "NML",  # New Melones Reservoir
    "SNL",  # San Luis
    "DNP",  # New Don Pedro
    "BER",  # Berryessa
    "ALM",  # Lake Almanor
    "FOL",  # Folsom
]

In [60]:
# Keep only the rows whose station ID is in the major-reservoir list.
is_major = merge_details["reservoir_id"].isin(major_reservoirs_list)
major_reservoirs = merge_details.loc[is_major]

### Check out latest data

First, drop the rows that have no storage reading (`storage_af` is NaN).

In [61]:
# Discard readings with no storage value so "latest" means the latest usable
# reading for each reservoir.
has_reading = major_reservoirs["storage_af"].notna()
drop_na_df = major_reservoirs[has_reading]

In [62]:
# Most recent usable reading per reservoir: sort by date, then take the last
# row of each reservoir_id group.
# The explicit .copy() matters: later cells add and modify a "flag" column on
# latest_df, and without it that assignment targets a filtered selection of
# another frame (pandas' SettingWithCopyWarning case).
latest_df = (
    drop_na_df
    .sort_values("date")
    .groupby("reservoir_id")
    .tail(1)
    .copy()
)

In [63]:
# Newest date among the per-reservoir latest readings.
latest_date = latest_df["date"].max()

In [64]:
# Start with every reservoir unflagged; the next cell sets the flag to True
# for reservoirs whose latest reading is older than latest_date.
latest_df["flag"] = False

In [65]:
# Flag reservoirs whose most recent reading predates the newest date seen
# across all major reservoirs.
is_stale = latest_df["date"] < latest_date
latest_df.loc[is_stale, "flag"] = True

In [66]:
# Display the one-row-per-reservoir table, including the staleness flag.
latest_df

Unnamed: 0,reservoir_id,date,month,storage_af,average_storage_value,lake,capacity,lat,lon,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg,flag
4251,BER,2023-01-08,1,882077.0,1254696.0,Lake Berryessa,1602000.0,38.513,-122.104,0.551,0.783,0.703,True
62,CLE,2023-01-11,1,653111.0,1558120.0,Trinity Lake,2447650.0,40.801,-122.762,0.267,0.637,0.419,False
4830,NML,2023-01-11,1,829836.0,1392445.0,New Melones Reservoir,2400000.0,37.9481,-120.525,0.346,0.58,0.596,False
2398,SHA,2023-01-11,1,1995141.0,2965414.0,Lake Shasta,4552000.0,40.718,-122.42,0.438,0.651,0.673,False
2846,ALM,2023-01-11,1,799243.0,809445.0,Lake Almanor,1308000.0,40.218,-121.173,0.611,0.619,0.987,False
4094,FOL,2023-01-11,1,407865.0,448928.0,Folsom Lake,977000.0,38.683,-121.183,0.417,0.459,0.909,False
3102,ORO,2023-01-11,1,1731413.0,2055890.0,Lake Oroville,3537577.0,39.54,-121.493,0.489,0.581,0.842,False
5022,DNP,2023-01-11,1,1412091.0,1441008.0,New Don Pedro Reservoir,2030000.0,37.702,-120.421,0.696,0.71,0.98,False
5662,SNL,2023-01-11,1,832411.0,1527220.0,San Luis Reservoir,2041000.0,37.033,-121.133,0.408,0.748,0.545,False


In [67]:
# Combined current storage across the major reservoirs.
latest_df["storage_af"].sum()

9543188.0

In [68]:
# Reshape to long form: one row per (reservoir, measure) for charting.
melt = latest_df.melt(
    id_vars="reservoir_id",
    value_vars=["storage_af", "average_storage_value", "capacity"],
)

In [69]:
# Position of each measure in the chart's `order` encoding, starting at 1.
bar_order = {
    measure: position
    for position, measure in enumerate(
        ["storage_af", "average_storage_value", "capacity"], start=1
    )
}

In [70]:
# Look up each row's ordering position from its measure name.
melt["bar_order"] = melt["variable"].map(bar_order)

In [71]:
# Colour assignment: domain[i] is drawn in range_[i].
domain = ["capacity", "average_storage_value", "storage_af"]
range_ = ["#ddd", "#83c6e0", "#1281aa"]
measure_colors = alt.Scale(domain=domain, range=range_)

# One bar per reservoir, segmented by measure in bar_order sequence.
# NOTE(review): stack="normalize" scales the three measures so they sum to 1
# per reservoir, i.e. storage, average and capacity are drawn as parts of a
# whole rather than storage/average against capacity. Confirm this is the
# intended reading of the chart.
alt.Chart(melt).mark_bar().encode(
    x=alt.X("reservoir_id", axis=alt.Axis(labels=False)),
    y=alt.Y("value", stack="normalize"),
    order="bar_order",
    color=alt.Color("variable", scale=measure_colors),
    tooltip=["reservoir_id"],
).properties(title="California's largest reservoirs")

### Export

Trim and rename for export

In [72]:
# Columns to publish, in output order.
export_columns = [
    "reservoir_id",
    "lake",
    "date",
    "storage_af",
    "average_storage_value",
    "capacity",
    "current_level_pct_of_total",
    "average_level_pct_of_total",
    "current_level_pct_of_avg",
    "lat",
    "lon",
    "flag",
]

# Reader-facing names for three of the columns.
export_names = {
    "lake": "reservoir_name",
    "average_storage_value": "historical_average",
    "capacity": "total_capacity",
}

# Select, rename, and list the largest reservoirs first.
export_df = (
    latest_df[export_columns]
    .rename(columns=export_names)
    .sort_values("total_capacity", ascending=False)
)

In [73]:
# Write the processed table without the DataFrame index.
export_path = "../data/processed/reservoirs/reservoirs-latest.csv"
export_df.to_csv(export_path, index=False)