In [1]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [2]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [3]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Import

In [4]:
df = pd.read_csv(
    "../data/raw/dwr/major-reservoir-timeseries.csv", 
    parse_dates=["DATE TIME", "OBS DATE"]
)

In [5]:
hist_df = pd.read_csv(
    "../data/metadata/reservoirs-historical-averages.csv"
)

In [6]:
hist_df

Unnamed: 0,month,average_storage,reservoir_id,average_storage_value,average_storage_unit
0,January,"165,874 af",ATN,165874,af
1,February,"180,481 af",ATN,180481,af
2,March,"194,157 af",ATN,194157,af
3,April,"197,293 af",ATN,197293,af
4,May,"193,537 af",ATN,193537,af
...,...,...,...,...,...
1927,August,"212,098 af",KLM,212098,af
1928,September,"171,318 af",KLM,171318,af
1929,October,"177,005 af",KLM,177005,af
1930,November,"212,214 af",KLM,212214,af


In [7]:
details_df = pd.read_csv(
    "../data/metadata/reservoirs-metadata-details.csv"
)

### Clean

In [8]:
df.columns = df.columns.str.lower()

In [9]:
df.columns = df.columns.str.replace(" ","_")

In [10]:
df = df.rename(columns={
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af"
})

In [11]:
df["storage_af"] = df["storage_af"].str.replace("---","")
df["storage_af"] = df["storage_af"].str.replace("ART","")

In [12]:
df["storage_af"] = pd.to_numeric(df["storage_af"])

### Filter to last 365 days from today

In [13]:
today = datetime.today().date()
today

datetime.date(2024, 7, 2)

In [14]:
last_year = (today - pd.DateOffset(days=365)).date()
last_year

datetime.date(2023, 7, 3)

Trim to last 365 days and remove invalid values

In [15]:
trim_df = df[
    (df.date >= pd.to_datetime(last_year))
]

### Merge historical average

Create month column for merging

In [16]:
trim_df["month"] = pd.DatetimeIndex(trim_df.date).month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trim_df["month"] = pd.DatetimeIndex(trim_df.date).month


In [17]:
month_names = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

In [18]:
hist_df["month"] = hist_df["month"].map(month_names)

In [19]:
merge_historical = pd.merge(
    trim_df[["reservoir_id","date","month","storage_af","units"]],
    hist_df[["reservoir_id", "month", "average_storage_value", "average_storage_unit"]],
    how="left",
    on=["reservoir_id","month"]
)

In [20]:
merge_details = pd.merge(
    merge_historical,
    details_df[["id", "lake", "capacity", "lat", "lon"]],
    how="left",
    left_on=["reservoir_id"],
    right_on=["id"]
).drop(["id", "units", "average_storage_unit"], axis=1)

### Calculate percentages

In [21]:
merge_details["current_level_pct_of_total"] = merge_details["storage_af"] / merge_details["capacity"]

In [22]:
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"] / merge_details["capacity"]

In [23]:
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"] / merge_details["average_storage_value"]

### Remove bad data

In [24]:
drop_na_df = merge_details.dropna(subset=["storage_af"])

### Chart

In [25]:
melt = pd.melt(
    drop_na_df,
    id_vars=["date","lake"],
    value_vars=["current_level_pct_of_total", "average_level_pct_of_total"]
)

In [26]:
# alt.data_transformers.disable_max_rows()

# alt.Chart(melt).mark_line().encode(
#     x='date:T',
#     y='value',
#     color='variable',
#     column='lake'
# ).properties(
#     width=180,
#     height=180
# )

### Export

Trim and rename for export

In [27]:
export_df = drop_na_df[
    ['reservoir_id',
     'lake',
     'date', 
     'storage_af', 
     'average_storage_value', 
     'capacity', 
     'current_level_pct_of_total',
     'average_level_pct_of_total', 
     'current_level_pct_of_avg', 
     'lat', 
     'lon'
    ]
].rename(columns={
    'lake': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity': 'total_capacity'
}).sort_values("total_capacity", ascending=False)

In [28]:
export_df.to_csv("../data/processed/dwr/major-reservoirs-timeseries.csv", index=False)

In [29]:
export_df[export_df.date == export_df.date.max()].to_csv(f"../data/processed/dwr/major-reservoirs-latest.csv", index=False)