In [1]:
import pandas as pd
import altair as alt
import altair_latimes as lat
from pandas.tseries.offsets import MonthEnd

In [2]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [3]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Import

In [4]:
df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])

In [5]:
baselines_df = pd.read_csv("../data/raw/uw-2020-baseline-values.csv")

### Clean

Remove junk from column names

In [6]:
df.columns = df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

Eliminate double spaces in supplier names

In [7]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

Clean up baselines column names as we did in the previous notebook

In [8]:
baselines_df.columns = baselines_df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

In [9]:
baselines_df.supplier_name = baselines_df.supplier_name.str.replace("  ", " ")

In [10]:
baselines_trimmed_df = baselines_df[[
    'supplier_name', 
    'public_water_system_id', 
    'month', 
    #'original_units',
    # 'total_potable_production_original_units',
    # 'potable_commercial_agriculture_original_units',
    'total_potable_production_minus_ag_gallons', 
    # 'staff_notes'
]].copy()

Get rid of some unnecessary columns

In [11]:
trim_df = df[[
    'supplier_name', 
    'public_water_system_id', 
    'reporting_month', 
    'county',
    'hydrologic_region', 
    'climate_zone', 
    'total_population_served',
    'calculated_total_potable_water_production_gallons_ag_excluded',
    'calculated_r_gpcd', 
    # 'month'
]].copy()

### Get populations by supplier, hydrologic region and state total

Using populations from the beginning of the drought emergency

In [12]:
pops_by_supplier = df[
    df.reporting_month == "2021-07-15"
][["supplier_name","hydrologic_region","total_population_served"]]

In [13]:
pops_by_region = pops_by_supplier.groupby(
    ["hydrologic_region"]
)["total_population_served"].sum().reset_index()

In [14]:
pop_state = pops_by_region.total_population_served.sum()
pop_state

37110227

### Merge

Filter df to July 2021 and later

In [91]:
filter_df = trim_df[
    (trim_df.reporting_month >= "2021-07-15")
].copy()

Remove -999999999

In [92]:
filter_df = filter_df[filter_df.calculated_total_potable_water_production_gallons_ag_excluded != -999999999]

Make a month column to merge with baselines

In [93]:
filter_df["month"] = filter_df["reporting_month"].dt.month

In [94]:
merge_df = pd.merge(
    filter_df, 
    baselines_trimmed_df, 
    how="left", 
    on=["supplier_name", "public_water_system_id", "month"]
).rename(
    columns={
        "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
        "total_potable_production_minus_ag_gallons": "total_gallons_baseline"
    }
).drop("month", axis=1)

### Remove duplicates

In [95]:
tmp = merge_df.set_index(['supplier_name', 'reporting_month'])

In [96]:
merge_df = tmp[~tmp.index.duplicated()].reset_index()

### Calculate percent changes by month

In [97]:
def pct_change(new, old):
    return (new - old) / old

By supplier

In [98]:
merge_df["gallons_pct_change"] = merge_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Filter out outliers

In [99]:
# lower_thresh=-0.5
# upper_thresh=1.3

In [100]:
# districts_filtered = merge_df[
#     (merge_df['gallons_pct_change']<upper_thresh)&(merge_df['gallons_pct_change']>lower_thresh)
# ]

By region

In [101]:
regions_df = merge_df.groupby(
    ["hydrologic_region", "reporting_month"]
)[["total_gallons_current","total_gallons_baseline", "supplier_name"]].agg({
    "total_gallons_current":"sum",
    "total_gallons_baseline":"sum",
    "supplier_name":"size"
}).reset_index()

In [102]:
regions_df["gallons_pct_change"] = regions_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Statewide

In [103]:
merge_df["state"] = "Statewide"

In [104]:
statewide_df = merge_df.groupby(["state", "reporting_month"])[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [105]:
statewide_df["gallons_pct_change"] = statewide_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

### Calculate cumulative savings since July 2021

Set up some vars and function

In [106]:
def month_diff(a, b):
    return 12 * (a.year - b.year) + (a.month - b.month)

In [107]:
def gallons_per_unit(gallons, baseline, population, time):
    return ((baseline - gallons) / population) / time

In [108]:
# voluntary_reductions_begin = pd.to_datetime('2021-07-08')
voluntary_reductions_begin = pd.to_datetime('2021-07-01')
voluntary_reductions_begin

Timestamp('2021-07-01 00:00:00')

In [109]:
latest_month = statewide_df[
    statewide_df.reporting_month == statewide_df.reporting_month.max()
].iloc[0]['reporting_month']

latest_month = pd.to_datetime(latest_month, format="%Y%m") + MonthEnd(0)

latest_month

Timestamp('2022-12-31 00:00:00')

In [110]:
months_since_reductions_begin = month_diff(latest_month, voluntary_reductions_begin)
months_since_reductions_begin

17

In [111]:
days_since_reductions = (latest_month - voluntary_reductions_begin).days
days_since_reductions

548

Get date of this monthly report

In [112]:
date = statewide_df[
    (statewide_df.reporting_month==statewide_df.reporting_month.max())
].iloc[0]["reporting_month"]

Drop suppliers with missing data

In [113]:
counts = merge_df.groupby("supplier_name")["reporting_month"].count().reset_index()

In [114]:
max_count = max(counts.reporting_month)
max_count

18

In [115]:
suppliers_with_complete_data = list(counts[counts.reporting_month == max_count].supplier_name)

In [116]:
complete_data = merge_df[merge_df.supplier_name.isin(suppliers_with_complete_data)].copy()

In [118]:
#complete_data[complete_data.reporting_month == "2023-03-15"]

By district

In [119]:
district_cumulative_savings_df = complete_data.groupby(
    ["supplier_name","hydrologic_region"]
)[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [120]:
district_cumulative_savings_df = pd.merge(
    district_cumulative_savings_df,
    pops_by_supplier[["supplier_name","hydrologic_region","total_population_served"]],
    how="left",
    on=["supplier_name","hydrologic_region"]
)

In [121]:
district_cumulative_savings_df["cumulative_pct_change"] = district_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [122]:
district_cumulative_savings_df['date'] = date

In [123]:
district_cumulative_savings_df['months_since'] = months_since_reductions_begin
district_cumulative_savings_df['days_since'] = days_since_reductions

In [124]:
district_cumulative_savings_df['gallons_saved_per_month'] = district_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [125]:
district_cumulative_savings_df['gallons_saved_per_day'] = district_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [126]:
district_cumulative_savings_df.sort_values("gallons_saved_per_day", ascending=False)

Unnamed: 0,supplier_name,hydrologic_region,total_gallons_current,total_gallons_baseline,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
361,Vernon City of,South Coast,3078373050.180,3337882231.753,112,-0.078,2022-12-15,17,548,136296.839,4228.187
162,Humboldt Bay Municipal Water District,North Coast,90795000.000,139435000.000,602,-0.349,2022-12-15,17,548,4752.785,147.440
190,Livingston City of,San Joaquin River,3428397000.000,4238758000.000,15052,-0.191,2022-12-15,17,548,3166.908,98.243
251,"Perris, City of",South Coast,789347979.420,1178756964.453,9000,-0.330,2022-12-15,17,548,2545.157,78.956
54,California Water Service Company Westlake,South Coast,2987825574.300,3636640495.868,19489,-0.178,2022-12-15,17,548,1958.314,60.751
...,...,...,...,...,...,...,...,...,...,...,...
173,"Kingsburg, City of",Tulare Lake,1631208000.000,1571986000.000,12338,0.038,2022-12-15,17,548,-282.351,-8.759
105,El Centro City of,Colorado River,4179427000.000,3830804000.000,46364,0.091,2022-12-15,17,548,-442.309,-13.721
184,Lee Lake Water District,South Coast,1779007152.737,1597216389.254,21038,0.114,2022-12-15,17,548,-508.298,-15.768
245,Paradise Irrigation District,Sacramento River,2073900000.000,1978730000.000,3000,0.048,2022-12-15,17,548,-1866.078,-57.889


In [127]:
chart_data = complete_data[complete_data.supplier_name.str.contains("El Segundo")]
name = chart_data.iloc[0]["supplier_name"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

In [128]:
df.iloc[36934]["qualification"]

nan

In [129]:
df.iloc[18213]['qualification']

nan

In [130]:
df[
    (df.supplier_name=="Paradise Irrigation District")&
    (df.reporting_month >= "2021-07-15")
]

Unnamed: 0,supplier_name,public_water_system_id,reporting_month,county,hydrologic_region,climate_zone,total_population_served,reference_2014_population,county_under_drought_declaration,water_shortage_contingency_stage_invoked,...,reported_preliminary_commercial_industrial_and_institutional_water,reported_final_commercial_industrial_and_institutional_water,reported_recycled_water,reported_non_revenue_water,calculated_total_potable_water_production_gallons_ag_excluded,calculated_total_potable_water_production_gallons_2013_ag_excluded,calculated_commercial_agricultural_water_gallons,calculated_commercial_agricultural_water_gallons_2013,calculated_r_gpcd,qualification
18387,Paradise Irrigation District,CA0410007,2023-03-15,Butte,Sacramento River,11,8800,26032,Yes,1,...,0.0,0.0,0.0,0.0,72600000.0,101600000.0,0.0,0.0,25.689,still working on meters to all customers.
18388,Paradise Irrigation District,CA0410007,2023-02-15,Butte,Sacramento River,11,8800,26032,Yes,1,...,0.0,0.0,0.0,0.0,69300000.0,76100000.0,0.0,0.0,26.883,Still working on meter system.
18389,Paradise Irrigation District,CA0410007,2023-01-15,Butte,Sacramento River,11,8000,26032,Yes,0,...,0.0,0.0,0.0,0.0,74900000.0,82500000.0,0.0,0.0,29.032,Still working on meters.
18390,Paradise Irrigation District,CA0410007,2022-12-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,78100000.0,102300000.0,0.0,0.0,30.194,
18391,Paradise Irrigation District,CA0410007,2022-11-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,82800000.0,117400000.0,0.0,0.0,33.2,Meter project still underway. Majority should ...
18392,Paradise Irrigation District,CA0410007,2022-10-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,131600000.0,170700000.0,0.0,0.0,51.097,Meter project still underway. Majority should ...
18393,Paradise Irrigation District,CA0410007,2022-09-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,144400000.0,230100000.0,0.0,0.0,57.6,Still in recovery mode from fire.
18394,Paradise Irrigation District,CA0410007,2022-08-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,174200000.0,309900000.0,0.0,0.0,67.355,
18395,Paradise Irrigation District,CA0410007,2022-07-15,Butte,Sacramento River,11,8000,26032,Yes,We haven't,...,0.0,0.0,0.0,0.0,162400000.0,327500000.0,0.0,0.0,62.71,Still working on the metering system.
18396,Paradise Irrigation District,CA0410007,2022-06-15,Butte,Sacramento River,11,6000,26032,Yes,We haven't,...,0.0,0.0,0.0,0.0,122600000.0,276200000.0,0.0,0.0,65.6,Still working on meters to homes.


In [131]:
complete_data[
    (complete_data.supplier_name=="El Segundo City of")&
    (complete_data.reporting_month >= "2021-07-15") &
    (complete_data.reporting_month != "2021-08-15")
].groupby("supplier_name")[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

Unnamed: 0,supplier_name,total_gallons_current,total_gallons_baseline
0,El Segundo City of,3187329135.628,2530903424.018


In [132]:
chart_data = complete_data[complete_data.supplier_name == "Los Angeles Department of Water and Power"]
name = chart_data.iloc[0]["supplier_name"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

In [133]:
alt.Chart(
    complete_data[complete_data.supplier_name.str.contains("El Segundo")]
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T'),
    y="total_gallons_current",
    tooltip=["reporting_month"]
).properties(width=600)

Drop districts for which we can't calculate a percentage change

In [134]:
drop_na_districts = district_cumulative_savings_df.dropna(subset="cumulative_pct_change")

By region

In [135]:
regions_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["hydrologic_region"]
)[
    ["total_gallons_current","total_gallons_baseline","supplier_name"]
].agg(
    {"total_gallons_current":"sum","total_gallons_baseline":"sum","supplier_name":"size"}
).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [136]:
regions_cumulative_savings_df = pd.merge(
    regions_cumulative_savings_df,
    pops_by_region[["hydrologic_region","total_population_served"]],
    how="left",
    on="hydrologic_region"
)

In [137]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [138]:
regions_cumulative_savings_df['date'] = date

In [139]:
regions_cumulative_savings_df['months_since'] = months_since_reductions_begin
regions_cumulative_savings_df['days_since'] = days_since_reductions

In [140]:
regions_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [141]:
regions_cumulative_savings_df['gallons_saved_per_day'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [142]:
regions_cumulative_savings_df

Unnamed: 0,hydrologic_region,total_gallons_current,total_gallons_baseline,total_reports,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,Central Coast,68867849211.629,72263664590.905,27,1293270,-0.047,2022-12-15,17,548,154.456,4.792
1,Colorado River,101551584092.535,103768495116.159,12,780308,-0.021,2022-12-15,17,548,167.122,5.184
2,North Coast,21594248693.92,25401832502.965,16,414740,-0.15,2022-12-15,17,548,540.038,16.753
3,North Lahontan,7166845519.0,7900971365.0,5,103768,-0.093,2022-12-15,17,548,416.158,12.91
4,Sacramento River,282181222898.392,306040235246.434,43,2843842,-0.078,2022-12-15,17,548,493.512,15.31
5,San Francisco Bay,357990226031.495,403652152366.566,45,6671142,-0.113,2022-12-15,17,548,402.629,12.49
6,San Joaquin River,147394515783.934,154516401013.528,25,1706054,-0.046,2022-12-15,17,548,245.558,7.618
7,South Coast,1378044056828.539,1448241947289.353,161,20450766,-0.048,2022-12-15,17,548,201.914,6.264
8,South Lahontan,69678858952.98,74427286370.858,16,863361,-0.064,2022-12-15,17,548,323.525,10.036
9,Tulare Lake,216447171526.326,223611282792.95,29,1982976,-0.032,2022-12-15,17,548,212.518,6.593


In [143]:
chart_data = regions_df[regions_df.hydrologic_region=="South Coast"]
name = chart_data.iloc[0]["hydrologic_region"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T'),
    y="gallons_pct_change:Q",
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

Statewide

In [144]:
district_cumulative_savings_df["state"] = "California"

In [145]:
statewide_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["state"]
)[
    ["total_gallons_current","total_gallons_baseline", "supplier_name"]
].agg({
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size"
}).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [146]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [147]:
statewide_cumulative_savings_df["total_population_served"] = pop_state

In [148]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [149]:
statewide_cumulative_savings_df['date'] = date

In [150]:
statewide_cumulative_savings_df['months_since'] = months_since_reductions_begin
statewide_cumulative_savings_df['days_since'] = days_since_reductions

In [151]:
statewide_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [152]:
statewide_cumulative_savings_df['gallons_saved_per_day'] = statewide_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [153]:
statewide_cumulative_savings_df

Unnamed: 0,state,total_gallons_current,total_gallons_baseline,total_reports,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,California,2650916579538.75,2819824268654.718,379,37110227,-0.06,2022-12-15,17,548,154.456,8.306


In [78]:
alt.Chart(
    statewide_df
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in California", width=600)

### Export

In [80]:
statewide_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/cumulative-savings/statewide-cumulative-savings.csv",
    index=False
)

In [83]:
regions_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/cumulative-savings/regional-cumulative-savings.csv",
    index=False
)

In [84]:
district_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/cumulative-savings/district-cumulative-savings.csv",
    index=False
)