In [1]:
import pytz
from datetime import datetime, date, timedelta
import pandas as pd
import altair as alt
import altair_latimes as lat

In [2]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [3]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Import

In [6]:
df = pd.read_csv(
    "../data/raw/reservoirs/colorado-river-reservoir-scrape-timeseries.csv", 
    parse_dates=["DATE TIME", "OBS DATE"]
)

In [7]:
hist_df = pd.read_csv(
    "../data/metadata/reservoirs-historical-averages.csv"
)

In [46]:
details_df = pd.read_csv(
    "../data/metadata/reservoirs-metadata-details.csv"
)

### Clean

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reservoir_id   240 non-null    object        
 1   duration       240 non-null    object        
 2   sensor_number  240 non-null    int64         
 3   sensor_type    240 non-null    object        
 4   date           240 non-null    datetime64[ns]
 5   obs_date       240 non-null    datetime64[ns]
 6   storage_af     236 non-null    float64       
 7   data_flag      231 non-null    object        
 8   units          240 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(5)
memory usage: 17.0+ KB


In [48]:
df.columns = df.columns.str.lower()

In [49]:
df.columns = df.columns.str.replace(" ","_")

In [50]:
df = df.rename(columns={
    "station_id": "reservoir_id",
    "date_time": "date",
    "value": "storage_af"
})

In [55]:
df.tail()

Unnamed: 0,reservoir_id,duration,sensor_number,sensor_type,date,obs_date,storage_af,data_flag,units
235,HVS,M,15,STORAGE,2022-02-01,2022-02-01,562800.0,,AF
236,HVS,M,15,STORAGE,2022-03-01,2022-03-01,580400.0,,AF
237,HVS,M,15,STORAGE,2022-04-01,2022-04-01,567000.0,,AF
238,HVS,M,15,STORAGE,2022-05-01,2022-05-01,593200.0,,AF
239,HVS,M,15,STORAGE,2022-06-01,2022-06-01,,,AF


In [56]:
#df["storage_af"] = df["storage_af"].str.replace("---","")

In [57]:
df["storage_af"] = pd.to_numeric(df["storage_af"])

In [58]:
#df = df.dropna(subset="storage_af")

### Check out latest data

Drop na first

In [59]:
drop_na = df.dropna(subset=["storage_af"])

In [60]:
latest_df = drop_na[drop_na.date == drop_na.date.max()].copy()

In [61]:
latest_df.storage_af.sum()

16164400.0

### Merge historical average

Create month column for merging

In [62]:
latest_df["month"] = pd.DatetimeIndex(latest_df.date).month

In [65]:
latest_df

Unnamed: 0,reservoir_id,duration,sensor_number,sensor_type,date,obs_date,storage_af,data_flag,units,month
58,MHV,M,15,STORAGE,2022-05-01,2022-05-01,1708200.0,,AF,5
118,PWL,M,15,STORAGE,2022-05-01,2022-05-01,6346000.0,,AF,5
178,MEA,M,15,STORAGE,2022-05-01,2022-05-01,7517000.0,,AF,5
238,HVS,M,15,STORAGE,2022-05-01,2022-05-01,593200.0,,AF,5


In [64]:
hist_df

Unnamed: 0,month,average_storage,reservoir_id,average_storage_value,average_storage_unit
0,1,"165,874 af",ATN,165874,af
1,2,"180,481 af",ATN,180481,af
2,3,"194,157 af",ATN,194157,af
3,4,"197,293 af",ATN,197293,af
4,5,"193,537 af",ATN,193537,af
...,...,...,...,...,...
1927,8,"212,098 af",KLM,212098,af
1928,9,"171,318 af",KLM,171318,af
1929,10,"177,005 af",KLM,177005,af
1930,11,"212,214 af",KLM,212214,af


In [66]:
#hist_df["month"] = pd.to_datetime(hist_df["month"], format='%B').dt.month

In [67]:
merge_historical = pd.merge(
    latest_df[["reservoir_id","date","month","storage_af","units"]],
    hist_df[["reservoir_id", "month", "average_storage_value", "average_storage_unit"]],
    how="left",
    on=["reservoir_id","month"]
)

In [71]:
merge_details = pd.merge(
    merge_historical,
    details_df[["id", "lake", "capacity"]],
    how="left",
    left_on=["reservoir_id"],
    right_on=["id"]
)

### Calculate percentages

In [73]:
merge_details["current_level_pct_of_total"] = merge_details["storage_af"] / merge_details["capacity"]

In [74]:
merge_details["average_level_pct_of_total"] = merge_details["average_storage_value"] / merge_details["capacity"]

In [76]:
merge_details["current_level_pct_of_avg"] = merge_details["storage_af"] / merge_details["average_storage_value"]

In [77]:
merge_details

Unnamed: 0,reservoir_id,date,month,storage_af,units,average_storage_value,average_storage_unit,id,lake,capacity,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg
0,MHV,2022-05-01,5,1708200.0,AF,1715533,af,MHV,Lake Mohave,1810000.0,0.944,0.948,0.996
1,PWL,2022-05-01,5,6346000.0,AF,15119302,af,PWL,Lake Powell,24322000.0,0.261,0.622,0.42
2,MEA,2022-05-01,5,7517000.0,AF,15790500,af,MEA,Lake Mead,26159008.0,0.287,0.604,0.476
3,HVS,2022-05-01,5,593200.0,AF,593833,af,HVS,Lake Havasu,648000.0,0.915,0.916,0.999


Drop missing values

In [78]:
#drop_na = major_reservoirs.dropna(subset=["storage_af","average_storage_value"])

In [79]:
#len(drop_na)

In [80]:
melt=pd.melt(merge_details, id_vars="reservoir_id", value_vars=["storage_af", "average_storage_value", "capacity"])

In [81]:
bar_order = {
    "storage_af": 1, 
    "average_storage_value": 2, 
    "capacity":3            
}

In [82]:
melt["bar_order"] = melt.variable.map(bar_order)

In [84]:
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt).mark_bar().encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="Reservoirs of the lower Colorado River Basin")

In [89]:
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt[melt.variable != "capacity_value"]).mark_bar().encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="Reservoirs of the lower Colorado River Basin")

In [88]:
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt[melt.reservoir_id=="MEA"]).mark_bar().encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="Lake Mead", width=150)

In [92]:
domain = ["capacity", "average_storage_value", "storage_af", ]
range_ = ['#ddd', '#83c6e0', '#1281aa', ]

alt.Chart(melt[melt.reservoir_id=="PWL"]).mark_bar().encode(
    x=alt.X('reservoir_id', axis=alt.Axis(labels=False)),
    y=alt.Y('value',stack="normalize"),
    order="bar_order",
    color=alt.Color(
        'variable', 
        scale=alt.Scale(domain=domain, range=range_), 
        #sort='descending'
    ),
    tooltip=["reservoir_id"]
).properties(title="Lake Powell", width=150)

### Export

Assertion tests to make sure no data comes through with mixed units

In [189]:
assert len(drop_na.units.unique()) == 1

In [190]:
assert len(drop_na.average_storage_unit.unique()) == 1

In [191]:
assert len(drop_na.capacity_unit.unique()) == 1

Trim and rename for export

In [192]:
export_df = drop_na[
    ['reservoir_id','lake_name', 'date', 'storage_af', 'average_storage_value', 'capacity_value', 'current_level_pct_of_total',
       'average_level_pct_of_total', 'current_level_pct_of_avg' ]
].rename(columns={
    'lake_name': 'reservoir_name',
    'average_storage_value': 'historical_average',
    'capacity_value': 'total_capacity'
})

In [193]:
export_df.to_csv("../data/processed/reservoirs/reservoirs-latest.csv", index=False)

In [194]:
export_df.to_csv(f"../data/processed/reservoirs/reservoirs-{today}.csv", index=False)

---

### Merge geo data

In [195]:
metadata_df = pd.read_csv("../data/metadata/reservoirs-metadata-details.csv")

In [196]:
metadata_df["lat"] = metadata_df["lat"].str.replace("°","")
metadata_df["lon"] = metadata_df["lon"].str.replace("°","")

In [197]:
merge_coords = pd.merge(
    drop_na,
    metadata_df[["id","lat","lon"]],
    how="left",
    left_on=["reservoir_id"],
    right_on=["id"]
).drop(["id","month","units","average_storage_unit","capacity_unit"], axis=1)

In [198]:
merge_coords.lake_name = merge_coords.lake_name.str.strip()

In [201]:
len(merge_coords)

15

In [202]:
merge_coords#[merge_coords.lake_name=="Lake Shasta"]

Unnamed: 0,reservoir_id,date,storage_af,average_storage_value,lake_name,capacity_value,current_level_pct_of_total,average_level_pct_of_total,current_level_pct_of_avg,lat,lon
0,CLE,2022-06-17,729324.0,1882223.0,Trinity Lake,2447650.0,0.298,0.769,0.387,40.801,-122.762
1,WRS,2022-06-17,134533.0,228117.0,Warm Springs Reservoir,381000.0,0.353,0.599,0.59,38.723,-123.01
2,CCH,2022-06-17,83427.0,144299.0,Cachuma Lake,193305.0,0.432,0.746,0.578,34.583,-119.98
3,CSI,2022-06-17,79009.0,192450.0,Lake Casitas,254000.0,0.311,0.758,0.411,34.373,-119.332
4,CAS,2022-06-17,123162.0,280108.0,Castaic Lake,325000.0,0.379,0.862,0.44,34.5152,-118.6101
5,SHA,2022-06-17,1801651.0,3539810.0,Lake Shasta,4552000.0,0.396,0.778,0.509,40.718,-122.42
6,ORO,2022-06-17,1829337.0,2687874.0,Lake Oroville,3537577.0,0.517,0.76,0.681,39.54,-121.493
7,BUL,2022-06-17,866877.0,831313.0,Bullards Bar Reservoir,966000.0,0.897,0.861,1.043,39.393,-121.14
8,FOL,2022-06-17,858975.0,756266.0,Folsom Lake,977000.0,0.879,0.774,1.136,38.683,-121.183
9,NML,2022-06-17,816710.0,1507818.0,New Melones Reservoir,2400000.0,0.34,0.628,0.542,37.9481,-120.525


In [200]:
merge_coords.to_csv("../data/processed/reservoirs/reservoirs-with-points.csv", index=False)