In [630]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [631]:
# Register the LA Times theme with Altair under a name, then switch to it.
# The enable() call stays last so its return value is what the cell displays.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

In [632]:
# Render floats with exactly three decimals when pandas displays them.
pd.set_option('display.float_format', lambda value: '%.3f' % (value,))

### Import

In [633]:
# Load the processed district-level residential use table; reporting_month is
# parsed as a datetime column at read time.
df = pd.read_csv(
    "../data/processed/district-level-residential-use.csv",
    parse_dates=["reporting_month"],
)

In [634]:
# Load the raw 2020 baseline values that current usage is compared against.
baselines_df = pd.read_csv(
    "../data/raw/uw-2020-baseline-values.csv",
)

### Clean

Clean up baselines column names as we did in the previous notebook

In [635]:
# Normalize the baseline column names step by step:
#   1. spaces and hyphens become underscores,
#   2. every character listed in the bracketed class is removed (the commas
#      inside the class are literal, so commas are stripped as well),
#   3. everything is lower-cased.
normalized_columns = baselines_df.columns.str.replace(" ", "_")
normalized_columns = normalized_columns.str.replace("-", "_")
normalized_columns = normalized_columns.str.replace(r'[#,@,&,(,)]', '', regex=True)
baselines_df.columns = normalized_columns.str.lower()

In [636]:
# Replace each run of two spaces in supplier names with a single space
# (one pass only, so a run of three spaces would still leave two).
baselines_df["supplier_name"] = baselines_df["supplier_name"].str.replace("  ", " ")

In [637]:
# Keep the supplier identifiers, the month and the ag-excluded production
# baseline. Deliberately left out: original_units,
# total_potable_production_original_units,
# potable_commercial_agriculture_original_units and staff_notes.
baselines_trimmed_df = baselines_df.loc[
    :,
    [
        'supplier_name',
        'public_water_system_id',
        'month',
        'total_potable_production_minus_ag_gallons',
    ],
].copy()

Get rid of some unnecessary columns

In [639]:
# Keep only the columns used downstream. Deliberately left out: climate_zone
# and month (a month column is derived from reporting_month before the merge).
trim_df = df.loc[
    :,
    [
        'supplier_name',
        'public_water_system_id',
        'reporting_month',
        'county',
        'hydrologic_region',
        'total_population_served',
        'calculated_total_potable_water_production_gallons_ag_excluded',
        'r_gpcd',
        'regional_r_gpcd',
    ],
].copy()

### Merge

Filter df to July 2021 and later

In [640]:
# Keep reports dated 2021-07-15 or later; .copy() detaches the result from trim_df.
is_in_period = trim_df["reporting_month"] >= "2021-07-15"
filter_df = trim_df.loc[is_in_period].copy()

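Remove rows whose ag-excluded production value is -999999999 (this appears to be a placeholder rather than a real measurement)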

In [641]:
# Drop every row whose production figure equals the -999999999 marker value.
has_usable_production = filter_df[
    "calculated_total_potable_water_production_gallons_ag_excluded"
].ne(-999999999)
filter_df = filter_df[has_usable_production]

Make a month column to merge with baselines

In [642]:
# Calendar month number (1-12) of each report, used as a join key against the
# baselines table's month column.
filter_df = filter_df.assign(month=filter_df["reporting_month"].dt.month)

In [643]:
# Attach each report's baseline (same supplier, same system id, same calendar
# month). A left join keeps reports that have no baseline row; their baseline
# column is NaN. The month helper column is dropped once the join is done.
merge_keys = ["supplier_name", "public_water_system_id", "month"]
gallon_column_names = {
    "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
    "total_potable_production_minus_ag_gallons": "total_gallons_baseline",
}
merge_df = (
    filter_df
    .merge(baselines_trimmed_df, how="left", on=merge_keys)
    .rename(columns=gallon_column_names)
    .drop(columns="month")
)

### Calculate percent changes by month...

In [644]:
def pct_change(new, old):
    """Return the change from ``old`` to ``new`` as a fraction of ``old``.

    The result is a fraction, not a percentage: 0.1 means a 10% increase and
    -0.5 means a 50% decrease. Only subtraction and division are applied, so
    the arguments may be anything that supports both.
    """
    delta = new - old
    return delta / old

### ...by district

In [645]:
# Fractional change of each district-month against its baseline month.
# pct_change is plain arithmetic, so it is applied to the two columns directly
# instead of row by row with apply(axis=1): same numbers, far less overhead,
# and it still works when the frame is empty. Rows with no baseline (NaN from
# the left merge) produce NaN.
merge_df["gallons_pct_change"] = pct_change(
    merge_df["total_gallons_current"],
    merge_df["total_gallons_baseline"],
)

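Filter out outliers: keep only district-months whose change from baseline lies strictly between the lower and upper thresholds set below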

In [646]:
# Outlier bounds for gallons_pct_change, expressed as fractions:
# -0.5 is a 50% drop from baseline, 1.3 is a 130% rise.
lower_thresh, upper_thresh = -0.5, 1.3

In [647]:
# Keep district-months whose change from baseline lies strictly between the two
# thresholds. Comparisons against NaN are False, so rows without a computable
# change are removed here as well.
# .copy() makes the result an independent frame: without it, adding a column to
# districts_filtered later raises SettingWithCopyWarning because pandas still
# treats it as a slice of merge_df.
districts_filtered = merge_df[
    (merge_df['gallons_pct_change'] < upper_thresh) & (merge_df['gallons_pct_change'] > lower_thresh)
].copy()

### ...by region

In [648]:
# Roll the outlier-filtered district rows up to one row per hydrologic region
# and reporting month: gallons are summed, and supplier_name becomes the number
# of district rows in the group.
region_month_groups = districts_filtered.groupby(["hydrologic_region", "reporting_month"])
region_month_aggregations = {
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size",
}
regions_df = (
    region_month_groups[list(region_month_aggregations)]
    .agg(region_month_aggregations)
    .reset_index()
)

In [649]:
# Regional change is computed from the summed gallons (not by averaging the
# district-level changes). pct_change is applied to whole columns rather than
# row by row with apply(axis=1); the values are the same and an empty frame is
# handled without error.
regions_df["gallons_pct_change"] = pct_change(
    regions_df["total_gallons_current"],
    regions_df["total_gallons_baseline"],
)

In [650]:
# Show the regional rows for the most recent reporting month only.
regions_df.loc[regions_df["reporting_month"] == regions_df["reporting_month"].max()]

Unnamed: 0,hydrologic_region,reporting_month,total_gallons_current,total_gallons_baseline,supplier_name,gallons_pct_change
9,Central Coast,2022-04-15,3501099408.15,2921707276.372,27,0.198
19,Colorado River,2022-04-15,5678818785.739,4035752997.688,12,0.407
29,North Coast,2022-04-15,938709149.351,1091202015.882,13,-0.14
39,North Lahontan,2022-04-15,221893577.0,246256129.0,5,-0.099
49,Sacramento River,2022-04-15,11579489065.468,11334449823.54,41,0.022
59,San Francisco Bay,2022-04-15,17989618834.774,17997974516.468,40,-0.0
69,San Joaquin River,2022-04-15,7559404881.167,6931184682.739,26,0.091
79,South Coast,2022-04-15,70630927401.74,56230594858.935,153,0.256
89,South Lahontan,2022-04-15,3573899302.266,2867226732.837,16,0.246
99,Tulare Lake,2022-04-15,10334241108.127,8631076619.853,29,0.197


### ...and statewide

In [651]:
# Add a constant label so the whole table can be grouped as a single unit.
# assign() builds a new frame instead of writing into districts_filtered in
# place, which avoids the SettingWithCopyWarning raised when the frame is a
# slice of merge_df, and keeps the cell safe to re-run.
districts_filtered = districts_filtered.assign(state="Statewide")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  districts_filtered["state"] = "Statewide"


In [652]:
# One row per reporting month for the whole state: current and baseline gallons
# summed over every outlier-filtered district row.
statewide_month_groups = districts_filtered.groupby(["state", "reporting_month"])
statewide_df = (
    statewide_month_groups[["total_gallons_current", "total_gallons_baseline"]]
    .sum()
    .reset_index()
)

In [653]:
# Statewide monthly change, computed from the summed gallons. pct_change is
# applied to whole columns rather than row by row with apply(axis=1); the
# values are the same and an empty frame is handled without error.
statewide_df["gallons_pct_change"] = pct_change(
    statewide_df["total_gallons_current"],
    statewide_df["total_gallons_baseline"],
)

In [654]:
# Show the statewide row for the most recent reporting month only.
statewide_df.loc[statewide_df["reporting_month"] == statewide_df["reporting_month"].max()]

Unnamed: 0,state,reporting_month,total_gallons_current,total_gallons_baseline,gallons_pct_change
9,Statewide,2022-04-15,132008101513.781,112287425653.314,0.176


### Calculate cumulative savings since July 2021

Drop suppliers with missing data: keep only suppliers that have as many monthly rows as the supplier with the most rows

In [655]:
# Number of non-null reporting months each supplier has after outlier filtering.
counts = (
    districts_filtered
    .groupby("supplier_name")["reporting_month"]
    .count()
    .reset_index()
)

In [656]:
# The largest number of monthly rows any supplier has; this is treated as
# "complete". int() keeps the value a plain Python integer.
max_count = int(counts["reporting_month"].max())
max_count

10

In [657]:
# Names of the suppliers whose row count equals the maximum.
has_every_month = counts["reporting_month"] == max_count
suppliers_with_complete_data = counts.loc[has_every_month, "supplier_name"].tolist()

In [658]:
# Monthly rows restricted to suppliers with a full set of months; copied so it
# is independent of districts_filtered.
is_complete_supplier = districts_filtered["supplier_name"].isin(suppliers_with_complete_data)
complete_data = districts_filtered.loc[is_complete_supplier].copy()

By district

In [659]:
# One row per supplier (and its hydrologic region) with current and baseline
# gallons summed over every month in complete_data.
# NOTE(review): total_population_served is summed across months too, so the
# resulting figure is not a head count - confirm that is intended.
district_groups = complete_data.groupby(["supplier_name", "hydrologic_region"])
district_cumulative_savings_df = (
    district_groups[["total_gallons_current", "total_gallons_baseline", "total_population_served"]]
    .sum()
    .reset_index()
)

In [660]:
# Cumulative change per district, computed from this frame's own summed
# gallons. The percentages must come from district_cumulative_savings_df
# itself: computing them from complete_data (the monthly rows) and assigning
# the result here would align by index label and store single-month changes of
# unrelated rows against each district.
district_cumulative_savings_df["cumulative_pct_change"] = pct_change(
    district_cumulative_savings_df["total_gallons_current"],
    district_cumulative_savings_df["total_gallons_baseline"],
)

Drop districts for which we can't calculate a percentage change

In [661]:
# Districts that have a usable cumulative change (rows with NaN removed).
# NOTE(review): drop_na_districts is not referenced again in the visible
# notebook; the regional and statewide roll-ups and the district export use
# district_cumulative_savings_df instead - confirm which one is intended.
drop_na_districts = district_cumulative_savings_df.dropna(
    subset=["cumulative_pct_change"]
)

By region

In [662]:
# Regional cumulative totals: gallons summed over the district rows, and the
# number of district rows per region reported as total_reports.
region_groups = district_cumulative_savings_df.groupby(["hydrologic_region"])
region_aggregations = {
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size",
}
regions_cumulative_savings_df = (
    region_groups[list(region_aggregations)]
    .agg(region_aggregations)
    .reset_index()
    .rename(columns={"supplier_name": "total_reports"})
)

In [663]:
# Regional cumulative change from the summed gallons. pct_change is applied to
# whole columns rather than row by row with apply(axis=1); the values are the
# same and an empty frame is handled without error.
regions_cumulative_savings_df["cumulative_pct_change"] = pct_change(
    regions_cumulative_savings_df["total_gallons_current"],
    regions_cumulative_savings_df["total_gallons_baseline"],
)

In [664]:
# Display the regional cumulative table (one row per hydrologic region).
regions_cumulative_savings_df

Unnamed: 0,hydrologic_region,total_gallons_current,total_gallons_baseline,total_reports,cumulative_pct_change
0,Central Coast,33294697179.19,33845686693.082,23,-0.016
1,Colorado River,53828971621.457,51740437203.435,11,0.04
2,North Coast,10343156971.652,11991899594.348,12,-0.137
3,North Lahontan,3037442426.0,3303905365.0,4,-0.081
4,Sacramento River,131339223221.773,139117833103.523,40,-0.056
5,San Francisco Bay,186441827721.584,203452108123.022,37,-0.084
6,San Joaquin River,76043723066.579,77608889227.917,23,-0.02
7,South Coast,703785946051.744,701982585778.268,143,0.003
8,South Lahontan,32721861344.733,32754725236.455,13,-0.001
9,Tulare Lake,108199040221.13,109366176445.438,28,-0.011


Statewide

In [665]:
# Constant label so every district row falls into one statewide group.
# The column stays on the frame, so it is also part of anything that later
# writes district_cumulative_savings_df out.
district_cumulative_savings_df = district_cumulative_savings_df.assign(state="California")

In [666]:
# Statewide cumulative totals: gallons summed over every district row, and the
# number of district rows reported as total_reports.
state_groups = district_cumulative_savings_df.groupby(["state"])
state_aggregations = {
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size",
}
statewide_cumulative_savings_df = (
    state_groups[list(state_aggregations)]
    .agg(state_aggregations)
    .reset_index()
    .rename(columns={"supplier_name": "total_reports"})
)

In [667]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [668]:
# Statewide cumulative change from the summed gallons. pct_change is applied to
# whole columns rather than row by row with apply(axis=1); the values are the
# same and an empty frame is handled without error.
statewide_cumulative_savings_df["cumulative_pct_change"] = pct_change(
    statewide_cumulative_savings_df["total_gallons_current"],
    statewide_cumulative_savings_df["total_gallons_baseline"],
)

In [669]:
# Display the single-row statewide cumulative table.
statewide_cumulative_savings_df

Unnamed: 0,state,total_gallons_current,total_gallons_baseline,total_reports,cumulative_pct_change
0,California,1339035889825.843,1365164246770.487,334,-0.019


### Charts

In [670]:
# Bar chart of the South Coast region's monthly change against baseline.
# Bars above zero (more water used than baseline) get the first color, all
# other bars the second.
south_coast_monthly = regions_df.loc[regions_df["hydrologic_region"] == "South Coast"]
bar_color = alt.condition(
    alt.datum.gallons_pct_change > 0,
    alt.value("#e6ae56"),  # increase over baseline
    alt.value("#83c6e0"),  # at or below baseline
)
(
    alt.Chart(south_coast_monthly)
    .mark_bar()
    .encode(
        x="reporting_month:O",
        y="gallons_pct_change:Q",
        color=bar_color,
        tooltip=["gallons_pct_change"],
    )
    .properties(title="Monthly water conservation in the South Coast", width=600)
)

### Export

Monthly

In [671]:
# Write the statewide month-by-month table, without the index column.
statewide_df.to_csv(
    path_or_buf="../data/processed/monthly-conservation/statewide-conservation-monthly.csv",
    index=False,
)

In [672]:
# Write the regional month-by-month table, without the index column.
regions_df.to_csv(
    path_or_buf="../data/processed/monthly-conservation/regional-conservation-monthly.csv",
    index=False,
)

In [673]:
# Write the district-level monthly table, without the index column.
# This writes merge_df, i.e. every district-month, including those outside the
# outlier thresholds and those with no baseline.
merge_df.to_csv(
    path_or_buf="../data/processed/monthly-conservation/district-level-conservation-monthly.csv",
    index=False,
)

Cumulative

In [674]:
# Write the statewide cumulative table, without the index column.
statewide_cumulative_savings_df.to_csv(
    path_or_buf="../data/processed/cumulative-conservation/statewide-conservation-cumulative.csv",
    index=False,
)

In [675]:
# Write the regional cumulative table, without the index column.
# NOTE(review): the file is named "monthly-conservation-cumulative.csv" although
# it holds regional data, while the other exports are named statewide-/
# regional-/district-level-. This looks like a naming slip; confirm what
# downstream readers expect before renaming.
regions_cumulative_savings_df.to_csv(
    path_or_buf="../data/processed/cumulative-conservation/monthly-conservation-cumulative.csv",
    index=False,
)

In [676]:
# Write the district-level cumulative table, without the index column.
district_cumulative_savings_df.to_csv(
    path_or_buf="../data/processed/cumulative-conservation/district-level-conservation-cumulative.csv",
    index=False,
)