In [268]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [269]:
# Register the L.A. Times Altair theme under one name and make it the active theme.
THEME_NAME = 'latimes'
alt.themes.register(THEME_NAME, lat.theme)
alt.themes.enable(THEME_NAME)

ThemeRegistry.enable('latimes')

In [270]:
# Show floats with exactly three decimals in every DataFrame rendered below.
pd.set_option('display.float_format', '{:.3f}'.format)

### Import

In [271]:
# Latest scrape of reservoir readings; both timestamp columns are parsed as datetimes.
raw_scrape_path = "../data/raw/reservoirs/major-reservoir-scrape-latest.csv"
timestamp_columns = ["DATE TIME", "OBS DATE"]
df = pd.read_csv(raw_scrape_path, parse_dates=timestamp_columns)

In [272]:
# Historical average storage table (merged further down on reservoir_id and month).
hist_df = pd.read_csv("../data/metadata/reservoirs-historical-averages.csv")

In [273]:
# Reservoir metadata (lake name and capacity are used further down).
details_df = pd.read_csv("../data/metadata/reservoirs-details.csv")

### Clean

In [274]:
# Inspect column names, dtypes and non-null counts of the raw scrape before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6496 entries, 0 to 6495
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   STATION_ID     6496 non-null   object        
 1   DURATION       6496 non-null   object        
 2   SENSOR_NUMBER  6496 non-null   int64         
 3   SENSOR_TYPE    6496 non-null   object        
 4   DATE TIME      6496 non-null   datetime64[ns]
 5   OBS DATE       6496 non-null   datetime64[ns]
 6   VALUE          6496 non-null   object        
 7   DATA_FLAG      6496 non-null   object        
 8   UNITS          6496 non-null   object        
dtypes: datetime64[ns](2), int64(1), object(6)
memory usage: 456.9+ KB


In [275]:
# Normalise the header: lower-case every column name.
df.columns = df.columns.map(str.lower)

In [276]:
# Normalise the header: spaces in column names become underscores.
df.columns = [label.replace(" ", "_") for label in df.columns]

In [277]:
# Rename the scrape's generic column names to the names used in the rest of the notebook.
column_aliases = dict(
    station_id="reservoir_id",
    date_time="date",
    value="storage_af",
)
df = df.rename(columns=column_aliases)

In [278]:
# Spot-check the rows belonging to a single reservoir (id "CLA").
df.loc[df["reservoir_id"].eq("CLA")]

Unnamed: 0,reservoir_id,duration,sensor_number,sensor_type,date,obs_date,storage_af,data_flag,units
4160,CLA,D,15,STORAGE,2022-05-14,2022-05-14,5955,,AF
4161,CLA,D,15,STORAGE,2022-05-15,2022-05-15,5161,,AF
4162,CLA,D,15,STORAGE,2022-05-16,2022-05-16,4764,,AF
4163,CLA,D,15,STORAGE,2022-05-17,2022-05-17,5161,,AF
4164,CLA,D,15,STORAGE,2022-05-18,2022-05-18,3970,,AF
4165,CLA,D,15,STORAGE,2022-05-19,2022-05-19,1985,,AF
4166,CLA,D,15,STORAGE,2022-05-20,2022-05-20,3573,,AF
4167,CLA,D,15,STORAGE,2022-05-21,2022-05-21,2779,,AF
4168,CLA,D,15,STORAGE,2022-05-22,2022-05-22,2382,,AF
4169,CLA,D,15,STORAGE,2022-05-23,2022-05-23,1588,,AF


In [279]:
# Blank out the "---" placeholder so the column can be converted to numbers afterwards.
# Guarded on dtype: if the column is already numeric (the scrape contained no
# placeholders, or this cell is re-run after the numeric conversion) the `.str`
# accessor would raise AttributeError, so the replacement is skipped.
# `regex=False` makes the literal match explicit instead of relying on the
# pandas-version-dependent default of `Series.str.replace`.
if not pd.api.types.is_numeric_dtype(df["storage_af"]):
    df["storage_af"] = df["storage_af"].str.replace("---", "", regex=False)

In [280]:
# Convert the storage readings from text to numbers. Values blanked out by the previous
# step are expected to come through as NaN (pandas parses empty strings as missing);
# any other non-numeric text raises here rather than being silently dropped.
df["storage_af"] = pd.to_numeric(df["storage_af"])

In [281]:
# NOTE(review): disabled step. Rows with a missing storage_af are kept at this point;
# missing readings are only removed later, by the dropna applied to the filtered
# frame before export. Confirm whether this line can simply be deleted.
#df = df.dropna(subset="storage_af")

### Check out latest data

In [282]:
# Los Angeles time zone; used below to work out what "today" is.
tz = pytz.timezone("America/Los_Angeles")

In [283]:
# Current calendar date in the Los Angeles time zone.
today = datetime.now(tz=tz).date()
today

datetime.date(2022, 6, 14)

In [284]:
# The calendar day before `today`, as a plain datetime.date.
yesterday = today - timedelta(days=1)
yesterday

datetime.date(2022, 6, 13)

In [285]:
# Keep only the readings dated yesterday. The copy lets later cells add columns
# to latest_df without touching the full frame.
is_yesterday = df["date"].eq(pd.Timestamp(yesterday))
latest_df = df.loc[is_yesterday].copy()

In [286]:
# Number of readings found for yesterday.
latest_df.shape[0]

203

In [287]:
# Total storage across all of yesterday's readings (missing values are skipped by sum).
latest_df["storage_af"].sum()

19893430.0

### Merge historical average

Create a numeric month column in both tables so they can be merged on it

In [288]:
# Month number (1-12) of each reading; this is the key for the historical-average merge.
latest_df["month"] = latest_df["date"].dt.month

In [289]:
# Convert full month names (format '%B') into month numbers so they match
# latest_df["month"]. The cell overwrites its own input column, so it is guarded
# on dtype: once the column is numeric, re-running the cell is a no-op instead of
# failing to parse integers as month names.
if not pd.api.types.is_numeric_dtype(hist_df["month"]):
    hist_df["month"] = pd.to_datetime(hist_df["month"], format='%B').dt.month

In [290]:
# Attach the historical average for the matching reservoir and month.
# The left join keeps every current reading; readings with no matching
# historical row get NaN in the average columns.
current_cols = ["reservoir_id","date","month","storage_af","units"]
average_cols = ["reservoir_id", "month", "average_storage_value", "average_storage_unit"]
merge_historical = latest_df[current_cols].merge(
    hist_df[average_cols],
    how="left",
    on=["reservoir_id","month"],
)

In [301]:
# Attach lake name and capacity from the metadata table.
# The left join keeps every row of merge_historical; reservoirs with no
# metadata row get NaN in the added columns.
detail_cols = ["reservoir_id", "lake_name", "capacity_value", "capacity_unit"]
merge_details = merge_historical.merge(
    details_df[detail_cols],
    how="left",
    on=["reservoir_id"],
)

In [302]:
# Summary statistics for the numeric columns of the merged table; the `count` row
# shows how many rows actually have a reading, a historical average and a capacity.
merge_details.describe()

Unnamed: 0,month,storage_af,average_storage_value,capacity_value
count,203.0,151.0,154.0,154.0
mean,6.0,131744.57,181188.247,247519.617
std,0.0,288363.902,450820.364,602094.926
min,6.0,0.0,750.0,1800.0
25%,6.0,11597.0,20043.0,29581.0
50%,6.0,36823.0,46578.5,64728.0
75%,6.0,97200.5,115561.75,157500.0
max,6.0,1858646.0,3539810.0,4552000.0


In [303]:
# Keep the larger half of the reservoirs: those whose capacity is at or above the
# median capacity. The median is computed from the data rather than hard-coded from
# a previous describe() output, so the cut-off stays correct when the inputs change.
# Rows with a missing capacity compare as False and are therefore excluded.
median_capacity = merge_details["capacity_value"].median()
largest_50_pct = merge_details[merge_details["capacity_value"] >= median_capacity]

Drop reservoirs that are missing either the current storage reading or the historical average

In [304]:
# A reservoir is only usable if it has both a current reading and a historical average.
required_cols = ["storage_af","average_storage_value"]
has_required = largest_50_pct[required_cols].notna().all(axis=1)
drop_na = largest_50_pct[has_required]

In [333]:
# Number of reservoirs left after filtering.
drop_na.shape[0]

68

In [310]:
# Reshape to long form: one row per (reservoir, series), with the series name in
# `variable` and its number in `value`. This is the shape the charts below encode.
series_cols = ["storage_af", "average_storage_value", "capacity_value"]
melt = drop_na.melt(id_vars="reservoir_id", value_vars=series_cols)

In [324]:
# Numeric rank for each series; mapped onto the long table and used as the
# charts' `order` encoding.
bar_order = {
    series: rank
    for rank, series in enumerate(
        ["storage_af", "average_storage_value", "capacity_value"],
        start=1,
    )
}

In [325]:
# Look up each row's rank from its series name.
series_names = melt["variable"]
melt["bar_order"] = series_names.map(bar_order)

In [332]:
# Fixed colour per series, from light grey (capacity) to dark blue (current storage).
series_colors = {
    "capacity_value": '#ddd',
    "average_storage_value": '#83c6e0',
    "storage_af": '#1281aa',
}
domain = list(series_colors)
range_ = list(series_colors.values())

# One bar per reservoir with axis labels hidden (the id is available in the tooltip).
# stack="normalize" scales every bar to the same height, so segments show each
# series' share of the three values summed, not absolute acre-feet.
x_encoding = alt.X('reservoir_id', axis=alt.Axis(labels=False))
y_encoding = alt.Y('value', stack="normalize")
color_encoding = alt.Color('variable', scale=alt.Scale(domain=domain, range=range_))

(
    alt.Chart(melt)
    .mark_bar()
    .encode(
        x=x_encoding,
        y=y_encoding,
        order="bar_order",
        color=color_encoding,
        tooltip=["reservoir_id"],
    )
    .properties(title="California's largest reservoirs")
)

In [337]:
# Same colour assignment as the overview chart: light grey for capacity,
# light blue for the historical average, dark blue for current storage.
palette = [
    ("capacity_value", '#ddd'),
    ("average_storage_value", '#83c6e0'),
    ("storage_af", '#1281aa'),
]
domain = [series for series, _ in palette]
range_ = [color for _, color in palette]

# Single-reservoir version of the chart: only the rows for reservoir id "ORO".
oroville_rows = melt[melt.reservoir_id=="ORO"]
oroville_bar = alt.Chart(oroville_rows).mark_bar()

oroville_bar.encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value', stack="normalize"),
    order="bar_order",
    color=alt.Color('variable', scale=alt.Scale(domain=domain, range=range_)),
    tooltip=["reservoir_id"],
).properties(title="Lake Oroville", width=150)

### Export

Assertion tests to make sure no data comes through with mixed units

In [341]:
# Current-storage readings must all share one unit (a missing unit counts as a distinct value).
assert drop_na["units"].nunique(dropna=False) == 1

In [343]:
# Historical averages must all share one unit (a missing unit counts as a distinct value).
assert drop_na["average_storage_unit"].nunique(dropna=False) == 1

In [342]:
# Capacities must all share one unit (a missing unit counts as a distinct value).
assert drop_na["capacity_unit"].nunique(dropna=False) == 1

Trim and rename for export

In [348]:
# Columns that go into the published file, in output order.
export_columns = [
    'reservoir_id',
    'lake_name',
    'date',
    'storage_af',
    'average_storage_value',
    'capacity_value',
]
# Internal column name -> name used in the exported CSV.
export_names = {
    'lake_name': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity_value': 'total_capacity',
}
export_df = drop_na.loc[:, export_columns].rename(columns=export_names)

In [349]:
# Overwrite the rolling "latest" file.
# NOTE(review): to_csv writes the DataFrame index by default, so the file starts with
# an unnamed column holding leftover row labels from the filtered frame. Confirm that
# downstream readers expect it before adding index=False.
export_df.to_csv("../data/processed/reservoirs/reservoirs-latest.csv")

In [350]:
# Dated snapshot of the same table; `today` is a datetime.date, so it renders as
# YYYY-MM-DD in the file name.
# NOTE(review): the rows are yesterday's readings (latest_df is filtered on
# `yesterday`) while the file is named for today. Confirm that is intended.
# NOTE(review): as with the "latest" file, the index is written as an unnamed column.
export_df.to_csv(f"../data/processed/reservoirs/reservoirs-{today}.csv")