In [141]:
import pytz
from datetime import datetime, date, timedelta
from time import strptime
import pandas as pd
import altair as alt
import altair_latimes as lat

In [112]:
# Register the theme object exposed by altair_latimes under the name 'latimes',
# then make it the active theme for every chart rendered afterwards.
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [113]:
def _format_three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats in DataFrame output with a fixed three-decimal format.
pd.set_option('display.float_format', _format_three_decimals)

### Import

In [114]:
# Statewide reservoir time series; both timestamp columns are parsed as datetimes.
timeseries_path = "../../data/raw/reservoirs/statewide/timeseries.csv"
df = pd.read_csv(timeseries_path, parse_dates=["DATE TIME", "OBS DATE"])

In [134]:
# Historical averages metadata; the `month` column is read as text.
historical_averages_path = "../../data/metadata/reservoirs-statewide-historical-averages.csv"
hist_df = pd.read_csv(historical_averages_path, dtype={"month": str})

In [116]:
# Reservoir details metadata (joined on reservoir_id later in the notebook).
details_path = "../../data/metadata/reservoirs-statewide-details.csv"
details_df = pd.read_csv(details_path)

### Clean

In [117]:
# Inspect column dtypes and non-null counts of the raw time series before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   STATION_ID     589 non-null    object        
 1   DURATION       589 non-null    object        
 2   SENSOR_NUMBER  589 non-null    int64         
 3   SENSOR_TYPE    589 non-null    object        
 4   DATE TIME      589 non-null    datetime64[ns]
 5   OBS DATE       589 non-null    datetime64[ns]
 6   VALUE          589 non-null    object        
 7   DATA_FLAG      589 non-null    object        
 8   UNITS          589 non-null    object        
dtypes: datetime64[ns](2), int64(1), object(6)
memory usage: 41.5+ KB


In [118]:
# Normalise every column label to lower case.
df.columns = [column.lower() for column in df.columns]

In [119]:
# Replace spaces in column labels with underscores so they work as attributes.
df.columns = [column.replace(" ", "_") for column in df.columns]

In [120]:
# Map the source column labels onto the names used in the rest of the notebook.
column_renames = {
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af",
}
df = df.rename(columns=column_renames)

In [121]:
# Calendar date at run time (local clock); anchors the 365-day window.
today = date.today()
today

datetime.date(2023, 5, 12)

In [122]:
# Start of the trailing window: the date 365 days before `today`.
last_year = today - timedelta(days=365)
last_year

datetime.date(2022, 5, 12)

Trim to the last 365 days and remove rows whose storage value is the `---` placeholder

In [123]:
# Keep rows from the trailing 365 days whose reading is not the "---" placeholder.
# Both masks are built on `df` and combined in a single .loc lookup; chaining
# df[mask_a][mask_b] applies a full-length mask to an already filtered frame,
# which makes pandas reindex the mask and emit a UserWarning.
is_recent = df["date"] >= pd.to_datetime(last_year)
has_reading = df["storage_af"] != "---"
trim_df = df.loc[is_recent & has_reading].copy()

  trim_df = df[


### Merge historical average

Create month column for merging

In [125]:
# Calendar month (1-12) of each observation, used as a merge key.
trim_df["month"] = trim_df["date"].dt.month

In [191]:
# Swap the text `month` column for the integer `month_int` column so the key
# matches the integer month derived from the observation dates.
# Guarded so the cell is safe to re-run: once `month_int` has been renamed, a
# second pass would otherwise drop the only remaining month column.
if "month_int" in hist_df.columns:
    hist_df = hist_df.drop(columns="month").rename(columns={"month_int": "month"})

In [193]:
# Display the historical averages to confirm `month` now holds the integer month.
hist_df

Unnamed: 0,average_storage,reservoir_id,month
0,22749071.0,SWV,1
1,24230378.0,SWV,2
2,26128851.0,SWV,3
3,27631605.0,SWV,4
4,28861161.0,SWV,5
5,27902990.0,SWV,6
6,25292026.0,SWV,7
7,22758905.0,SWV,8
8,21229285.0,SWV,9
9,20045128.0,SWV,10


In [194]:
# Attach the historical average for the matching reservoir and calendar month.
# Left join: every trimmed observation is kept even without a historical row.
merge_keys = ["reservoir_id", "month"]
merge_historical = trim_df[["reservoir_id", "date", "month", "storage_af"]].merge(
    hist_df[merge_keys + ["average_storage"]],
    how="left",
    on=merge_keys,
)

In [195]:
# Add name, capacity and dam count for each reservoir (left join on reservoir_id).
reservoir_details = details_df[["reservoir_id", "lake_name", "capacity", "number_of_dams"]]
merge_details = merge_historical.merge(reservoir_details, how="left", on="reservoir_id")

In [196]:
# Relabel rows whose lake_name is "Total" as "Statewide".
is_total_row = merge_details["lake_name"].eq("Total")
merge_details.loc[is_total_row, "lake_name"] = "Statewide"

### Calculate percentages

In [197]:
# storage_af was loaded with object dtype; cast it to integers before doing arithmetic.
merge_details = merge_details.astype({"storage_af": int})

In [198]:
# Current storage as a share of capacity (a ratio, not multiplied by 100).
merge_details["current_level_pct_of_total"] = merge_details["storage_af"].div(
    merge_details["capacity"]
)

In [199]:
# Historical average storage as a share of capacity (a ratio, not multiplied by 100).
merge_details["average_level_pct_of_total"] = merge_details["average_storage"].div(
    merge_details["capacity"]
)

In [200]:
# Current storage relative to the historical average for the same month (ratio).
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"].div(
    merge_details["average_storage"]
)

In [226]:
# Display the merged frame, including the three ratio columns.
merge_details

Unnamed: 0,reservoir_id,date,month,storage_af,average_storage,lake_name,capacity,number_of_dams,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg
0,SWV,2022-05-12,5,20100000,28861161.000,Statewide,38121900,154,0.527,0.757,0.696
1,SWV,2022-05-13,5,20100000,28861161.000,Statewide,38121900,154,0.527,0.757,0.696
2,SWV,2022-05-14,5,20100000,28861161.000,Statewide,38121900,154,0.527,0.757,0.696
3,SWV,2022-05-15,5,20100000,28861161.000,Statewide,38121900,154,0.527,0.757,0.696
4,SWV,2022-05-16,5,20100000,28861161.000,Statewide,38121900,154,0.527,0.757,0.696
...,...,...,...,...,...,...,...,...,...,...,...
359,SWV,2023-05-06,5,29300000,28861161.000,Statewide,38121900,154,0.769,0.757,1.015
360,SWV,2023-05-07,5,29300000,28861161.000,Statewide,38121900,154,0.769,0.757,1.015
361,SWV,2023-05-08,5,29400000,28861161.000,Statewide,38121900,154,0.771,0.757,1.019
362,SWV,2023-05-09,5,29300000,28861161.000,Statewide,38121900,154,0.769,0.757,1.015


In [231]:
# Reshape to long form (date, variable, value) so each ratio becomes its own line.
ratio_columns = ["current_level_pct_of_total", "average_level_pct_of_total"]
melt = merge_details.melt(id_vars="date", value_vars=ratio_columns)

In [234]:
# Line chart of the two capacity ratios over time, one coloured line per variable.
level_chart = alt.Chart(melt).mark_line().encode(
    x=alt.X("date"),
    y=alt.Y("value"),
    color=alt.Color("variable"),
)
level_chart.properties(width=600)

### Export

Trim and rename for export

In [235]:
# Columns written to the statewide CSV, in output order.
# `lake_name` is not selected, so the earlier rename of it to `reservoir_name`
# never had any effect and has been removed; the exported schema is unchanged.
# NOTE(review): add "lake_name" here (and rename it) if the name column is wanted.
export_columns = [
    "date",
    "storage_af",
    "average_storage",
    "capacity",
    "current_level_pct_of_total",
    "average_level_pct_of_total",
    "current_level_pct_of_avg",
]
export_df = merge_details[export_columns].copy()

In [236]:
# Write the export frame without the index column. The path has no placeholders,
# so it is a plain string rather than an f-string.
export_df.to_csv("../../data/processed/reservoirs/reservoirs-statewide-latest.csv", index=False)