In [1]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [2]:
# Register the theme shipped in altair_latimes under a short name, then
# enable it so charts rendered in this notebook use it.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [3]:
# Render every float pandas displays with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

### Import

In [4]:
# Load the latest reservoir scrape; both timestamp columns are parsed as
# datetimes at read time.
scrape_csv = "../data/raw/reservoirs/major-reservoir-scrape-latest.csv"
timestamp_columns = ["DATE TIME", "OBS DATE"]
df = pd.read_csv(scrape_csv, parse_dates=timestamp_columns)

In [5]:
# Load the historical-average table (joined onto the readings by reservoir
# and month further down).
historical_csv = "../data/metadata/reservoirs-historical-averages.csv"
hist_df = pd.read_csv(historical_csv)

In [6]:
# Load the per-reservoir metadata (the id, lake, capacity, lat and lon
# columns are used later in the notebook).
details_csv = "../data/metadata/reservoirs-metadata-details.csv"
details_df = pd.read_csv(details_csv)

### Clean

In [7]:
# Lower-case every column name of the scrape.
df.columns = df.columns.map(str.lower)

In [8]:
# Swap spaces in column names for underscores.
df.columns = [name.replace(" ", "_") for name in df.columns]

In [9]:
# Give the scraped columns the names the rest of the notebook relies on.
column_renames = {
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af",
}
df = df.rename(columns=column_renames)

In [10]:
# Blank out the "---" marker (presumably the scrape's missing-reading
# placeholder -- confirm against the source) so the numeric conversion in the
# next cell turns those rows into NaN.
#
# Only touch the column while it is still text: if the CSV held no
# placeholder at all, pandas has already parsed the column as numbers and
# `.str` would raise AttributeError; the same happens when this cell is
# re-run after the conversion. `regex=False` pins a literal match, because
# the default of `Series.str.replace` differs between pandas versions.
if not pd.api.types.is_numeric_dtype(df["storage_af"]):
    df["storage_af"] = df["storage_af"].str.replace("---", "", regex=False)

In [11]:
# Convert the storage readings to a numeric dtype (blank strings become NaN).
storage_numeric = pd.to_numeric(df["storage_af"])
df["storage_af"] = storage_numeric

In [12]:
# Strip the degree sign from both coordinate columns. The values remain
# strings afterwards. `regex=False` requests a literal match explicitly:
# relying on the default makes pandas 1.x warn for single-character patterns,
# and the default itself changed in pandas 2.0.
for coord in ("lat", "lon"):
    details_df[coord] = details_df[coord].str.replace("°", "", regex=False)

### Merge historical average

Create a month column in both tables to use as a merge key.

In [13]:
# Month number of each reading; together with reservoir_id it is the join key
# for the historical averages.
df["month"] = pd.to_datetime(df["date"]).dt.month

In [14]:
# Parse full month names ('%B') and keep only the month number so the column
# is comparable with df["month"].
month_names = hist_df["month"]
hist_df["month"] = pd.to_datetime(month_names, format='%B').dt.month

In [15]:
# Attach the historical average for the matching reservoir and month to every
# reading; the left join keeps readings that have no average on file.
reading_cols = ["reservoir_id", "date", "month", "storage_af", "units"]
average_cols = ["reservoir_id", "month", "average_storage_value", "average_storage_unit"]
merge_historical = df[reading_cols].merge(
    hist_df[average_cols],
    how="left",
    on=["reservoir_id", "month"],
)

In [16]:
# Add name, capacity and coordinates for each reservoir, then drop the
# duplicate key and the two unit columns.
detail_cols = ["id", "lake", "capacity", "lat", "lon"]
merge_details = (
    merge_historical
    .merge(
        details_df[detail_cols],
        how="left",
        left_on=["reservoir_id"],
        right_on=["id"],
    )
    .drop(columns=["id", "units", "average_storage_unit"])
)

### Calculate percentages

In [17]:
# Current storage as a fraction of capacity (a ratio, not multiplied by 100).
merge_details["current_level_pct_of_total"] = merge_details["storage_af"].div(
    merge_details["capacity"]
)

In [18]:
# Historical average storage as a fraction of capacity.
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"].div(
    merge_details["capacity"]
)

In [19]:
# Current storage as a fraction of the historical average for that month.
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"].div(
    merge_details["average_storage_value"]
)

### Filter to just "major" reservoirs

In [20]:
# Station ids of the reservoirs treated as "major" in this notebook.
# src: https://cdec.water.ca.gov/resapp/RescondMain
#
# Station ids that were listed but commented out (not included):
# LUS, BUL, CMN, EXC, WRS, CCH, CAS, CSI, DMV, MIL, PNF.
major_reservoirs_list = [
    'SHA',  # Lake Shasta
    'ORO',  # Lake Oroville
    'CLE',  # Trinity Lake
    'NML',  # New Melones Reservoir
    'SNL',  # San Luis Reservoir
    'DNP',  # New Don Pedro Reservoir
    'BER',  # Lake Berryessa
    'ALM',  # Lake Almanor
    'FOL',  # Folsom Lake
]

In [21]:
# Keep only the rows whose station id is in the major-reservoir list.
is_major = merge_details["reservoir_id"].isin(major_reservoirs_list)
major_reservoirs = merge_details.loc[is_major]

### Check out latest data

First, drop rows where the storage value is missing (NaN).

In [22]:
# Discard readings that have no storage value.
has_storage = major_reservoirs["storage_af"].notna()
drop_na_df = major_reservoirs[has_storage]

In [23]:
# Keep the newest non-null reading per reservoir: once the rows are sorted by
# date, the last row of each reservoir_id group is that reservoir's most
# recent reading.
#
# The trailing .copy() makes latest_df an independent frame. A "flag" column
# is added to it further down, and writing to the object returned by
# groupby().tail() directly can raise SettingWithCopyWarning.
latest_df = (
    drop_na_df
    .sort_values('date')
    .groupby('reservoir_id')
    .tail(1)
    .copy()
)

In [24]:
# Newest reading timestamp across all of the selected reservoirs.
latest_date = latest_df["date"].max()

In [25]:
# Start with every reservoir unflagged. `assign` returns a new frame rather
# than writing a column onto a frame derived from another one, which avoids
# SettingWithCopyWarning and keeps this cell safe to re-run.
latest_df = latest_df.assign(flag=False)

In [26]:
# Flag reservoirs whose newest reading is older than the newest reading
# overall.
is_stale = latest_df["date"] < latest_date
latest_df.loc[is_stale, "flag"] = True

In [27]:
# Display the newest reading kept for each major reservoir, including the
# "flag" column set above.
latest_df

Unnamed: 0,reservoir_id,date,month,storage_af,average_storage_value,lake,capacity,lat,lon,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg,flag
4118,BER,2022-07-10,7,916916.0,1236209.0,Lake Berryessa,1602000.0,38.513,-122.104,0.572,0.772,0.742,True
60,CLE,2022-07-13,7,696053.0,1752490.0,Trinity Lake,2447650.0,40.801,-122.762,0.284,0.716,0.397,False
4679,NML,2022-07-13,7,755275.0,1448580.0,New Melones Reservoir,2400000.0,37.9481,-120.525,0.315,0.604,0.521,False
3005,ORO,2022-07-13,7,1589314.0,2349829.0,Lake Oroville,3537577.0,39.54,-121.493,0.449,0.664,0.676,False
2323,SHA,2022-07-13,7,1738098.0,3102099.0,Lake Shasta,4552000.0,40.718,-122.42,0.382,0.681,0.56,False
2757,ALM,2022-07-13,7,850236.0,956255.0,Lake Almanor,1308000.0,40.218,-121.173,0.65,0.731,0.889,False
3966,FOL,2022-07-13,7,706938.0,623348.0,Folsom Lake,977000.0,38.683,-121.183,0.724,0.638,1.134,False
4865,DNP,2022-07-13,7,1270922.0,1594269.0,New Don Pedro Reservoir,2030000.0,37.702,-120.421,0.626,0.785,0.797,False
5485,SNL,2022-07-13,7,726712.0,876375.0,San Luis Reservoir,2041000.0,37.033,-121.133,0.356,0.429,0.829,False


In [28]:
# Combined current storage across the selected reservoirs.
latest_df["storage_af"].sum()

9250464.0

In [29]:
# Reshape to long form: one row per (reservoir, measure) for charting.
measure_columns = ["storage_af", "average_storage_value", "capacity"]
melt = latest_df.melt(id_vars="reservoir_id", value_vars=measure_columns)

In [30]:
# Rank of each measure (1 = current storage, 2 = historical average,
# 3 = capacity); used as the `order` encoding of the chart.
bar_order = {
    measure: position
    for position, measure in enumerate(
        ["storage_af", "average_storage_value", "capacity"], start=1
    )
}

In [31]:
# Look up the rank of each row's measure.
melt["bar_order"] = melt["variable"].map(bar_order)

In [32]:
# One normalised stacked bar per reservoir, coloured by measure.
domain = ["capacity", "average_storage_value", "storage_af"]
range_ = ['#ddd', '#83c6e0', '#1281aa']

# Pair each measure in `domain` with the colour at the same position in `range_`.
color_scale = alt.Scale(domain=domain, range=range_)
reservoir_axis = alt.X('reservoir_id', axis=alt.Axis(labels=False))
value_axis = alt.Y('value', stack="normalize")

chart = alt.Chart(melt).mark_bar().encode(
    x=reservoir_axis,
    y=value_axis,
    order="bar_order",
    color=alt.Color('variable', scale=color_scale),
    tooltip=["reservoir_id"],
)

# The last expression is what the cell renders.
chart.properties(title="California's largest reservoirs")

### Export

Trim and rename for export

In [33]:
# Select the columns to publish, give three of them export-friendly names and
# list the largest reservoirs first.
export_columns = [
    'reservoir_id',
    'lake',
    'date',
    'storage_af',
    'average_storage_value',
    'capacity',
    'current_level_pct_of_total',
    'average_level_pct_of_total',
    'current_level_pct_of_avg',
    'lat',
    'lon',
    'flag',
]
export_names = {
    'lake': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity': 'total_capacity',
}
export_df = (
    latest_df[export_columns]
    .rename(columns=export_names)
    .sort_values("total_capacity", ascending=False)
)

In [34]:
# Write the export table to disk without the index column.
export_csv = "../data/processed/reservoirs/reservoirs-latest.csv"
export_df.to_csv(export_csv, index=False)