In [212]:
import pandas as pd
import altair as alt
import altair_latimes as lat
from pandas.tseries.offsets import MonthEnd

In [213]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [214]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Import

In [215]:
df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])

In [216]:
baselines_df = pd.read_csv("../data/raw/uw-2020-baseline-values.csv")

### Clean

Remove junk from column names

In [217]:
df.columns = df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

Eliminate double spaces in supplier names

In [218]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

Clean up baselines column names as we did in the previous notebook

In [219]:
baselines_df.columns = baselines_df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

In [220]:
baselines_df.supplier_name = baselines_df.supplier_name.str.replace("  ", " ")

In [221]:
baselines_trimmed_df = baselines_df[[
    'supplier_name', 
    'public_water_system_id', 
    'month', 
    #'original_units',
    # 'total_potable_production_original_units',
    # 'potable_commercial_agriculture_original_units',
    'total_potable_production_minus_ag_gallons', 
    # 'staff_notes'
]].copy()

Get rid of some unnecessary columns

In [222]:
trim_df = df[[
    'supplier_name', 
    'public_water_system_id', 
    'reporting_month', 
    'county',
    'hydrologic_region', 
    'climate_zone', 
    'total_population_served',
    'calculated_total_potable_water_production_gallons_ag_excluded',
    'calculated_r_gpcd', 
    # 'month'
]].copy()

### Get populations by supplier, hydrologic region and state total

Using populations from the beginning of the drought emergency

In [223]:
pops_by_supplier = df[
    df.reporting_month == "2021-07-15"
][["supplier_name","hydrologic_region","total_population_served"]]

In [224]:
pops_by_region = pops_by_supplier.groupby(
    ["hydrologic_region"]
)["total_population_served"].sum().reset_index()

In [225]:
pop_state = pops_by_region.total_population_served.sum()
pop_state

37110170

### Merge

Filter df to July 2021 and later

In [226]:
filter_df = trim_df[trim_df.reporting_month >= "2021-07-15"].copy()

Remove -999999999

In [227]:
filter_df = filter_df[filter_df.calculated_total_potable_water_production_gallons_ag_excluded != -999999999]

Make a month column to merge with baselines

In [228]:
filter_df["month"] = filter_df["reporting_month"].dt.month

In [229]:
merge_df = pd.merge(
    filter_df, 
    baselines_trimmed_df, 
    how="left", 
    on=["supplier_name", "public_water_system_id", "month"]
).rename(
    columns={
        "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
        "total_potable_production_minus_ag_gallons": "total_gallons_baseline"
    }
).drop("month", axis=1)

### Remove duplicates

In [230]:
tmp = merge_df.set_index(['supplier_name', 'reporting_month'])

In [231]:
merge_df = tmp[~tmp.index.duplicated()].reset_index()

### Calculate percent changes by month

In [232]:
def pct_change(new, old):
    return (new - old) / old

By supplier

In [233]:
merge_df["gallons_pct_change"] = merge_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Filter out outliers

In [234]:
# lower_thresh=-0.5
# upper_thresh=1.3

In [235]:
# districts_filtered = merge_df[
#     (merge_df['gallons_pct_change']<upper_thresh)&(merge_df['gallons_pct_change']>lower_thresh)
# ]

By region

In [236]:
regions_df = merge_df.groupby(
    ["hydrologic_region", "reporting_month"]
)[["total_gallons_current","total_gallons_baseline", "supplier_name"]].agg({
    "total_gallons_current":"sum",
    "total_gallons_baseline":"sum",
    "supplier_name":"size"
}).reset_index()

In [237]:
regions_df["gallons_pct_change"] = regions_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Statewide

In [238]:
merge_df["state"] = "Statewide"

In [239]:
statewide_df = merge_df.groupby(["state", "reporting_month"])[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [240]:
statewide_df["gallons_pct_change"] = statewide_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

### Calculate cumulative savings since July 2021

Set up some vars and function

In [241]:
def month_diff(a, b):
    return 12 * (a.year - b.year) + (a.month - b.month)

In [242]:
def gallons_per_unit(gallons, baseline, population, time):
    return ((baseline - gallons) / population) / time

In [243]:
# voluntary_reductions_begin = pd.to_datetime('2021-07-08')
voluntary_reductions_begin = pd.to_datetime('2021-07-01')
voluntary_reductions_begin

Timestamp('2021-07-01 00:00:00')

In [244]:
latest_month = statewide_df[
    statewide_df.reporting_month == statewide_df.reporting_month.max()
].iloc[0]['reporting_month']

latest_month = pd.to_datetime(latest_month, format="%Y%m") + MonthEnd(0)

latest_month

Timestamp('2023-02-28 00:00:00')

In [245]:
months_since_reductions_begin = month_diff(latest_month, voluntary_reductions_begin)
months_since_reductions_begin

19

In [246]:
days_since_reductions = (latest_month - voluntary_reductions_begin).days
days_since_reductions

607

Get date of this monthly report

In [247]:
date = statewide_df[
    (statewide_df.reporting_month==statewide_df.reporting_month.max())
].iloc[0]["reporting_month"]

Drop suppliers with missing data

In [248]:
counts = merge_df.groupby("supplier_name")["reporting_month"].count().reset_index()

In [249]:
max_count = max(counts.reporting_month)
max_count

20

In [250]:
suppliers_with_complete_data = list(counts[counts.reporting_month == max_count].supplier_name)

In [251]:
complete_data = merge_df[merge_df.supplier_name.isin(suppliers_with_complete_data)].copy()

In [252]:
complete_data[complete_data.reporting_month == "2022-02-15"]

Unnamed: 0,supplier_name,reporting_month,public_water_system_id,county,hydrologic_region,climate_zone,total_population_served,total_gallons_current,calculated_r_gpcd,total_gallons_baseline,gallons_pct_change,state
12,East Bay Municipal Utilities District,2022-02-15,CA0110005,"Alameda,Contra Costa",San Francisco Bay,3,1430000,3898000000.000,55.491,4084900000.000,-0.046,Statewide
49,Yorba Linda Water District,2022-02-15,CA3010037,Orange,South Coast,8,80540,430256918.910,157.926,518275791.030,-0.170,Statewide
69,Long Beach City of,2022-02-15,CA1910065,Los Angeles,South Coast,6,467730,1158726156.000,56.625,1252247033.961,-0.075,Statewide
89,"Los Banos, City of",2022-02-15,CA2410005,Merced,San Joaquin River,12,44391,150586000.000,77.537,144716000.000,0.041,Statewide
109,Turlock City of,2022-02-15,CA5010019,Stanislaus,San Joaquin River,12,74297,364551000.000,88.183,382770000.000,-0.048,Statewide
...,...,...,...,...,...,...,...,...,...,...,...,...
7890,"Corcoran, City of",2022-02-15,CA1610004,Kings,Tulare Lake,13,13593,103533000.000,45.359,100475000.000,0.030,Statewide
7910,City of Newman Water Department,2022-02-15,CA5010013,Stanislaus,San Joaquin River,12,11801,38631264.000,75.760,39505000.000,-0.022,Statewide
8004,Greenfield County Water District,2022-02-15,CA1510024,Kern,Tulare Lake,4,11428,46588339.000,119.534,43408000.000,0.073,Statewide
8043,Thermalito Water and Sewer District,2022-02-15,CA0410008,Butte,Sacramento River,11,10339,34724573.840,99.319,33420581.964,0.039,Statewide


By district

In [253]:
district_cumulative_savings_df = complete_data.groupby(
    ["supplier_name","hydrologic_region"]
)[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [254]:
district_cumulative_savings_df = pd.merge(
    district_cumulative_savings_df,
    pops_by_supplier[["supplier_name","hydrologic_region","total_population_served"]],
    how="left",
    on=["supplier_name","hydrologic_region"]
)

In [255]:
district_cumulative_savings_df["cumulative_pct_change"] = district_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [256]:
district_cumulative_savings_df['date'] = date

In [257]:
district_cumulative_savings_df['months_since'] = months_since_reductions_begin
district_cumulative_savings_df['days_since'] = days_since_reductions

In [258]:
district_cumulative_savings_df['gallons_saved_per_month'] = district_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [259]:
district_cumulative_savings_df['gallons_saved_per_day'] = district_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [301]:
district_cumulative_savings_df.sort_values("gallons_saved_per_day", ascending=False)

Unnamed: 0,supplier_name,hydrologic_region,total_gallons_current,total_gallons_baseline,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day,state
315,Vernon City of,South Coast,3346040095.620,3701434192.453,112,-0.096,2023-02-15,19,607,167008.504,5227.614,California
171,Livingston City of,San Joaquin River,3682207000.000,4623882000.000,15052,-0.204,2023-02-15,19,607,3292.708,103.067,California
52,California Water Service Company Westlake,South Coast,3148600457.700,3915712467.357,19489,-0.196,2023-02-15,19,607,2071.646,64.846,California
313,Ventura County Waterworks District No 1,South Coast,3581167660.200,4775584874.894,36284,-0.250,2023-02-15,19,607,1732.556,54.232,California
142,Healdsburg City of,North Coast,699538372.000,1078507656.000,11800,-0.351,2023-02-15,19,607,1690.318,52.909,California
...,...,...,...,...,...,...,...,...,...,...,...,...
156,"Kingsburg, City of",Tulare Lake,1716940000.000,1670046000.000,12338,0.028,2023-02-15,19,607,-200.041,-6.262,California
206,Oildale Mutual Water Company,Tulare Lake,4939852282.350,4790309243.184,34133,0.031,2023-02-15,19,607,-230.589,-7.218,California
97,El Centro City of,Colorado River,4470845000.000,4116148000.000,46364,0.086,2023-02-15,19,607,-402.646,-12.603,California
217,Paradise Irrigation District,Sacramento River,2218100000.000,2084610000.000,3000,0.064,2023-02-15,19,607,-2341.930,-73.306,California


In [288]:
chart_data = complete_data[complete_data.supplier_name.str.contains("El Segundo")]
name = chart_data.iloc[0]["supplier_name"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

In [308]:
df.iloc[36934]["qualification"]

'The Hyperion Treatment Plant that is normally the source of the high volumes of recycled water reported from the City had issues causing our potable water customer West Basin to pull large volumes of potable water from our water system. The results will show in the higher than average potable production number on this report that is also present in the Commercial potable number. As well as the lower than average recycled water number.'

In [324]:
df.iloc[18213]['qualification']

'Meter project still underway. Majority should be online by end of first quarter 2023'

In [319]:
df[
    (df.supplier_name=="Paradise Irrigation District")&
    (df.reporting_month >= "2021-07-15")
]

Unnamed: 0,supplier_name,public_water_system_id,reporting_month,county,hydrologic_region,climate_zone,total_population_served,reference_2014_population,county_under_drought_declaration,water_shortage_contingency_stage_invoked,...,reported_preliminary_commercial_industrial_and_institutional_water,reported_final_commercial_industrial_and_institutional_water,reported_recycled_water,reported_non_revenue_water,calculated_total_potable_water_production_gallons_ag_excluded,calculated_total_potable_water_production_gallons_2013_ag_excluded,calculated_commercial_agricultural_water_gallons,calculated_commercial_agricultural_water_gallons_2013,calculated_r_gpcd,qualification
18210,Paradise Irrigation District,CA0410007,2023-02-15,Butte,Sacramento River,11,8800,26032,Yes,1,...,0.0,0.0,0.0,0.0,69300000.0,76100000.0,0.0,0.0,26.883,Still working on meter system.
18211,Paradise Irrigation District,CA0410007,2023-01-15,Butte,Sacramento River,11,8000,26032,Yes,0,...,0.0,0.0,0.0,0.0,74900000.0,82500000.0,0.0,0.0,29.032,Still working on meters.
18212,Paradise Irrigation District,CA0410007,2022-12-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,78100000.0,102300000.0,0.0,0.0,30.194,
18213,Paradise Irrigation District,CA0410007,2022-11-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,82800000.0,117400000.0,0.0,0.0,33.2,Meter project still underway. Majority should ...
18214,Paradise Irrigation District,CA0410007,2022-10-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,131600000.0,170700000.0,0.0,0.0,51.097,Meter project still underway. Majority should ...
18215,Paradise Irrigation District,CA0410007,2022-09-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,144400000.0,230100000.0,0.0,0.0,57.6,Still in recovery mode from fire.
18216,Paradise Irrigation District,CA0410007,2022-08-15,Butte,Sacramento River,11,8000,26032,Yes,1,...,0.0,0.0,0.0,0.0,174200000.0,309900000.0,0.0,0.0,67.355,
18217,Paradise Irrigation District,CA0410007,2022-07-15,Butte,Sacramento River,11,8000,26032,Yes,We haven't,...,0.0,0.0,0.0,0.0,162400000.0,327500000.0,0.0,0.0,62.71,Still working on the metering system.
18218,Paradise Irrigation District,CA0410007,2022-06-15,Butte,Sacramento River,11,6000,26032,Yes,We haven't,...,0.0,0.0,0.0,0.0,122600000.0,276200000.0,0.0,0.0,65.6,Still working on meters to homes.
18219,Paradise Irrigation District,CA0410007,2022-05-15,Butte,Sacramento River,11,6000,26032,Yes,We haven't,...,0.0,0.0,0.0,0.0,105300000.0,241600000.0,0.0,0.0,54.194,We are still working on our metering system d...


In [312]:
complete_data[
    (complete_data.supplier_name=="El Segundo City of")&
    (complete_data.reporting_month >= "2021-07-15") &
    (complete_data.reporting_month != "2021-08-15")
].groupby("supplier_name")[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

Unnamed: 0,supplier_name,total_gallons_current,total_gallons_baseline
0,El Segundo City of,3476540999.868,2817199764.284


In [313]:
( (3476540999.868-2817199764.284) / 16654) / 607

65.22333272305075

In [289]:
chart_data = complete_data[complete_data.supplier_name == "Los Angeles Department of Water and Power"]
name = chart_data.iloc[0]["supplier_name"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

In [262]:
alt.Chart(
    complete_data[complete_data.supplier_name.str.contains("El Segundo")]
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T'),
    y="total_gallons_current",
    tooltip=["reporting_month"]
).properties(width=600)

Drop districts for which we can't calculate a percentage change

In [263]:
drop_na_districts = district_cumulative_savings_df.dropna(subset="cumulative_pct_change")

By region

In [264]:
regions_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["hydrologic_region"]
)[
    ["total_gallons_current","total_gallons_baseline","supplier_name"]
].agg(
    {"total_gallons_current":"sum","total_gallons_baseline":"sum","supplier_name":"size"}
).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [265]:
regions_cumulative_savings_df = pd.merge(
    regions_cumulative_savings_df,
    pops_by_region[["hydrologic_region","total_population_served"]],
    how="left",
    on="hydrologic_region"
)

In [266]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [267]:
regions_cumulative_savings_df['date'] = date

In [268]:
regions_cumulative_savings_df['months_since'] = months_since_reductions_begin
regions_cumulative_savings_df['days_since'] = days_since_reductions

In [269]:
regions_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [270]:
regions_cumulative_savings_df['gallons_saved_per_day'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [271]:
regions_cumulative_savings_df

Unnamed: 0,hydrologic_region,total_gallons_current,total_gallons_baseline,total_reports,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,Central Coast,73865571281.509,77922514628.424,27,1293270,-0.052,2023-02-15,19,607,165.103,5.168
1,Colorado River,108069739159.527,110652807284.326,11,780308,-0.023,2023-02-15,19,607,174.227,5.454
2,North Coast,17913929375.749,21147007717.48,11,414740,-0.153,2023-02-15,19,607,410.286,12.843
3,North Lahontan,5762871698.0,6347444088.0,3,103768,-0.092,2023-02-15,19,607,296.498,9.281
4,Sacramento River,277510747649.635,301350744163.494,38,2843785,-0.079,2023-02-15,19,607,441.221,13.811
5,San Francisco Bay,366506427115.116,414252529720.338,39,6671142,-0.115,2023-02-15,19,607,376.69,11.791
6,San Joaquin River,140029015637.925,147748201736.937,19,1706054,-0.052,2023-02-15,19,607,238.136,7.454
7,South Coast,1427063906220.59,1513832634601.93,147,20450766,-0.057,2023-02-15,19,607,223.306,6.99
8,South Lahontan,64471209610.508,69254042032.883,13,863361,-0.069,2023-02-15,19,607,291.568,9.126
9,Tulare Lake,210975670096.034,218920780615.957,24,1982976,-0.036,2023-02-15,19,607,210.877,6.601


In [272]:
chart_data = regions_df[regions_df.hydrologic_region=="South Coast"]
name = chart_data.iloc[0]["hydrologic_region"]

alt.Chart(
    chart_data
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T'),
    y="gallons_pct_change:Q",
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in {name}", width=600)

Statewide

In [273]:
district_cumulative_savings_df["state"] = "California"

In [274]:
statewide_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["state"]
)[
    ["total_gallons_current","total_gallons_baseline", "supplier_name"]
].agg({
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size"
}).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [275]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [276]:
statewide_cumulative_savings_df["total_population_served"] = pop_state

In [277]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [278]:
statewide_cumulative_savings_df['date'] = date

In [279]:
statewide_cumulative_savings_df['months_since'] = months_since_reductions_begin
statewide_cumulative_savings_df['days_since'] = days_since_reductions

In [280]:
statewide_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [281]:
statewide_cumulative_savings_df['gallons_saved_per_day'] = statewide_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [282]:
statewide_cumulative_savings_df

Unnamed: 0,state,total_gallons_current,total_gallons_baseline,total_reports,total_population_served,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,California,2692169087844.593,2881428706589.769,332,37110170,-0.066,2023-02-15,19,607,165.103,8.402


In [287]:
alt.Chart(
    statewide_df
).mark_bar().encode(
    x=alt.X('yearmonth(reporting_month):T', axis=alt.Axis(title="Reporting month")),
    y=alt.Y("gallons_pct_change:Q", axis=alt.Axis(format="%", title="Percent change in gallons used")),
    color=alt.condition(
        alt.datum.gallons_pct_change > 0,
        alt.value("#e6ae56"),  # The positive color
        alt.value("#83c6e0")  # The negative color
    ),
    tooltip=["reporting_month","gallons_pct_change"]
).properties(title=f"Monthly water conservation in California", width=600)

### Export