In [2]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [3]:
# Register the theme object supplied by altair_latimes under one name and make
# it the active Altair theme for the charts in this notebook.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [4]:
def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Used by pandas whenever it renders a float for display.
pd.set_option('display.float_format', _three_decimals)

### Import

In [7]:
# Raw reservoir time series; both timestamp columns are parsed as datetimes.
timeseries_path = "../../data/raw/reservoirs/major-reservoir-timeseries.csv"
timestamp_columns = ["DATE TIME", "OBS DATE"]
df = pd.read_csv(timeseries_path, parse_dates=timestamp_columns)

In [37]:
# Historical average storage per reservoir, loaded from the metadata folder.
historical_averages_path = "../../data/metadata/reservoirs-historical-averages.csv"
hist_df = pd.read_csv(historical_averages_path)

In [9]:
# Per-reservoir metadata (the id, lake, capacity, lat and lon columns are used
# further down).
reservoir_details_path = "../../data/metadata/reservoirs-metadata-details.csv"
details_df = pd.read_csv(reservoir_details_path)

### Clean

In [10]:
# Lowercase every column name so later cells can refer to them consistently.
df.columns = df.columns.map(str.lower)

In [11]:
# Replace spaces in column names with underscores.
df.columns = [column.replace(" ", "_") for column in df.columns]

In [12]:
# Give the raw columns the names used by the rest of the notebook.
column_renames = {
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af",
}
df = df.rename(columns=column_renames)

In [13]:
# Remove the literal markers "---" and "ART" from the storage strings so the
# column can be converted to numbers afterwards.
# NOTE(review): presumably these are placeholder/flag markers in the raw feed — confirm.
for marker in ("---", "ART"):
    df["storage_af"] = df["storage_af"].str.replace(marker, "")

In [15]:
# Convert the cleaned storage strings to numbers (empty strings become NaN).
df["storage_af"] = df["storage_af"].pipe(pd.to_numeric)

### Filter to last 365 days from today

In [28]:
# Local calendar date (no time component) at the moment the notebook runs.
today = date.today()
today

datetime.date(2023, 5, 12)

In [29]:
# First day of the trailing 365-day window, as a plain date.
last_year = today - timedelta(days=365)
last_year

datetime.date(2022, 5, 12)

Trim to the last 365 days (rows with missing storage values are removed later, under "Remove bad data")

In [30]:
# Keep only readings dated on or after the start of the trailing 365-day window.
# The explicit .copy() makes trim_df an independent frame, so adding columns to
# it in later cells does not raise pandas' SettingWithCopyWarning.
window_start = pd.to_datetime(last_year)
trim_df = df[df["date"] >= window_start].copy()

### Merge historical average

Create month column for merging

In [31]:
# Month number of each reading, used as the key for joining the historical
# averages. .assign returns a fresh frame instead of writing into a filtered
# slice, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
trim_df = trim_df.assign(month=pd.DatetimeIndex(trim_df["date"]).month)

In [34]:
# Swap the "month" column for the numeric "month_int" column when the metadata
# file provides one. The guard keeps the original "month" column when there is
# no "month_int", instead of dropping the join key and leaving nothing to merge on.
# NOTE(review): the recorded execution counts show hist_df was re-read after
# this cell last ran — confirm whether this step is still needed.
if "month_int" in hist_df.columns:
    hist_df = (
        hist_df
        .drop(columns="month", errors="ignore")
        .rename(columns={"month_int": "month"})
    )

In [39]:
# Lookup from English month name to calendar month number (January -> 1,
# ..., December -> 12).
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
month_names = {name: number for number, name in enumerate(month_order, start=1)}

In [40]:
# Turn month names into month numbers so they match the numeric "month" key on
# the readings. Skipped when the column is already numeric: mapping numbers
# through the name-keyed dict would replace every value with NaN (for example
# when this cell is re-run) and the merge would then match nothing.
if not pd.api.types.is_numeric_dtype(hist_df["month"]):
    hist_df["month"] = hist_df["month"].map(month_names)

In [41]:
# Join each reading to the historical average for the same reservoir and
# calendar month. trim_df is used rather than df because it is the frame that
# carries both the 365-day date filter and the "month" join key.
reading_columns = ["reservoir_id", "date", "month", "storage_af", "units"]
average_columns = ["reservoir_id", "month", "average_storage_value", "average_storage_unit"]
merge_historical = pd.merge(
    trim_df[reading_columns],
    hist_df[average_columns],
    how="left",  # keep every reading even when no average is available
    on=["reservoir_id", "month"],
)

In [42]:
# Attach each reservoir's lake name, capacity and coordinates, then discard the
# duplicate key column and the two unit columns.
detail_columns = ["id", "lake", "capacity", "lat", "lon"]
merge_details = merge_historical.merge(
    details_df[detail_columns],
    how="left",
    left_on=["reservoir_id"],
    right_on=["id"],
)
merge_details = merge_details.drop(columns=["id", "units", "average_storage_unit"])

### Calculate percentages

In [43]:
# Current storage as a fraction of capacity (a ratio, not multiplied by 100).
merge_details["current_level_pct_of_total"] = merge_details["storage_af"].div(
    merge_details["capacity"]
)

In [44]:
# Historical average storage as a fraction of capacity (a ratio, not multiplied by 100).
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"].div(
    merge_details["capacity"]
)

In [45]:
# Current storage relative to the historical average (1.0 means exactly average).
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"].div(
    merge_details["average_storage_value"]
)

In [46]:
# Display the merged frame to eyeball the joined columns and the computed ratios.
merge_details

Unnamed: 0,reservoir_id,date,month,storage_af,average_storage_value,lake,capacity,lat,lon,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg
0,SHA,2022-05-12,5,1821095.000,3815276,Lake Shasta,4552000.000,40.718000,-122.420000,0.400,0.838,0.477
1,SHA,2022-05-13,5,1822384.000,3815276,Lake Shasta,4552000.000,40.718000,-122.420000,0.400,0.838,0.478
2,SHA,2022-05-14,5,1824480.000,3815276,Lake Shasta,4552000.000,40.718000,-122.420000,0.401,0.838,0.478
3,SHA,2022-05-15,5,1823997.000,3815276,Lake Shasta,4552000.000,40.718000,-122.420000,0.401,0.838,0.478
4,SHA,2022-05-16,5,1823190.000,3815276,Lake Shasta,4552000.000,40.718000,-122.420000,0.401,0.838,0.478
...,...,...,...,...,...,...,...,...,...,...,...,...
3289,FOL,2023-05-08,5,834243.000,795899,Folsom Lake,977000.000,38.683000,-121.183000,0.854,0.815,1.048
3290,FOL,2023-05-09,5,834866.000,795899,Folsom Lake,977000.000,38.683000,-121.183000,0.855,0.815,1.049
3291,FOL,2023-05-10,5,834658.000,795899,Folsom Lake,977000.000,38.683000,-121.183000,0.854,0.815,1.049
3292,FOL,2023-05-11,5,834866.000,795899,Folsom Lake,977000.000,38.683000,-121.183000,0.855,0.815,1.049


### Remove bad data

In [47]:
# Keep only the rows that have a storage reading.
has_storage = merge_details["storage_af"].notna()
drop_na_df = merge_details[has_storage]

### Chart

In [49]:
# Long format for charting: one row per (date, lake, series), where the series
# is either the current or the historical-average share of capacity.
level_columns = ["current_level_pct_of_total", "average_level_pct_of_total"]
melt = drop_na_df.melt(id_vars=["date", "lake"], value_vars=level_columns)

In [52]:
# Lift Altair's row limit so the full melted frame can be embedded in the chart.
alt.data_transformers.disable_max_rows()

# One small panel per lake, one line per series.
level_lines = alt.Chart(melt).mark_line().encode(
    x='date:T',
    y='value',
    color='variable',
    column='lake',
)
level_lines.properties(width=180, height=180)

### Export

Trim and rename for export

In [55]:
# Columns written to the export, in output order (names as they are before renaming).
export_columns = [
    'reservoir_id',
    'lake',
    'date',
    'storage_af',
    'average_storage_value',
    'capacity',
    'current_level_pct_of_total',
    'average_level_pct_of_total',
    'current_level_pct_of_avg',
    'lat',
    'lon',
]
export_names = {
    'lake': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity': 'total_capacity',
}

# Largest reservoirs first. Every row of a reservoir shares the same capacity,
# so sorting on capacity alone leaves the order within a reservoir up to the
# (non-stable) default sort; reservoir_id and date are added as tie-breakers so
# each reservoir's rows stay together in chronological order on every run.
export_df = (
    drop_na_df[export_columns]
    .rename(columns=export_names)
    .sort_values(
        ["total_capacity", "reservoir_id", "date"],
        ascending=[False, True, True],
    )
)

In [57]:
# Write the full time series export, without the index column.
timeseries_export_path = "../../data/processed/reservoirs/major-reservoirs-timeseries.csv"
export_df.to_csv(timeseries_export_path, index=False)

In [58]:
# Write only the rows dated on the most recent date present in the export.
# NOTE(review): a reservoir with no row on that date is left out of this file —
# confirm that is intended.
latest_date = export_df["date"].max()
latest_df = export_df[export_df["date"] == latest_date]
latest_df.to_csv("../../data/processed/reservoirs/major-reservoirs-latest.csv", index=False)