In [53]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [54]:
# Register the theme object from altair_latimes under the name "latimes",
# then make it the active Altair theme.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

### Import

In [106]:
# Usage reports; the "Reporting Month" column is parsed into datetimes on load.
df = pd.read_csv(
    "../data/raw/uw-usage.csv",
    parse_dates=["Reporting Month"],
)

In [107]:
# Baseline values that the current usage figures are compared against later on.
baselines_df = pd.read_csv(
    "../data/raw/uw-2020-baseline-values.csv",
)

### Clean

Remove junk from column names

In [108]:
# Normalise the headers: spaces and hyphens become underscores, the characters
# matched by the character class (# , @ & ( )) are removed, and the result is
# lower-cased.
df.columns = (
    df.columns
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.replace(r'[#,@,&,(,)]', '', regex=True)
    .str.lower()
)

Eliminate double spaces in supplier names

In [109]:
# Replace every two-space sequence in the supplier name with one space
# (a single pass, so a run of three spaces still leaves two behind).
df["supplier_name"] = df["supplier_name"].str.replace("  ", " ")

Clean up baselines column names as we did in the previous notebook

In [111]:
# Same header normalisation as applied to df: underscores for spaces and
# hyphens, strip the characters in the character class, lower-case the rest.
baselines_df.columns = (
    baselines_df.columns
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.replace(r'[#,@,&,(,)]', '', regex=True)
    .str.lower()
)

In [112]:
# Mirror the supplier-name clean-up done on df so the two frames share
# identically formatted merge keys.
baselines_df["supplier_name"] = baselines_df["supplier_name"].str.replace("  ", " ")

In [113]:
# Keep only the three merge keys plus the ag-excluded baseline production.
# Deliberately left out: original_units,
# total_potable_production_original_units,
# potable_commercial_agriculture_original_units, staff_notes.
baselines_trimmed_df = baselines_df.loc[
    :,
    [
        "supplier_name",
        "public_water_system_id",
        "month",
        "total_potable_production_minus_ag_gallons",
    ],
].copy()

Get rid of some unnecessary columns

In [114]:
# Independent copy of df restricted to the identifying, geographic, population
# and production columns used below (the "month" column is not selected here;
# one is derived from reporting_month before the merge).
trim_df = df.loc[
    :,
    [
        "supplier_name",
        "public_water_system_id",
        "reporting_month",
        "county",
        "hydrologic_region",
        "climate_zone",
        "total_population_served",
        "calculated_total_potable_water_production_gallons_ag_excluded",
        "calculated_r_gpcd",
    ],
].copy()

### Merge

Filter to reports dated July 15, 2021, or later

In [63]:
# Keep rows whose reporting_month is on or after 2021-07-15; .copy() detaches
# the result from trim_df.
filter_df = trim_df.loc[trim_df["reporting_month"] >= "2021-07-15"].copy()

Remove rows where production is -999999999 (presumably a missing-data placeholder)

In [117]:
# Drop rows whose ag-excluded production equals -999999999.
# The .copy() makes the filtered result an independent frame; without it the
# column assignment in the following cell (filter_df["month"] = ...) writes to
# a boolean-indexed slice and pandas emits SettingWithCopyWarning.
filter_df = filter_df[
    filter_df.calculated_total_potable_water_production_gallons_ag_excluded != -999999999
].copy()

Make a month column to merge with baselines

In [118]:
# Month number of each report, used as a join key against the baselines table.
filter_df["month"] = filter_df.reporting_month.dt.month

In [119]:
# Left-join the baselines onto the filtered reports (reports without a matching
# baseline are kept with a missing baseline), give the two production columns
# shorter names, and drop the helper "month" key once the join is done.
merge_df = (
    filter_df
    .merge(
        baselines_trimmed_df,
        how="left",
        on=["supplier_name", "public_water_system_id", "month"],
    )
    .rename(
        columns={
            "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
            "total_potable_production_minus_ag_gallons": "total_gallons_baseline",
        }
    )
    .drop(columns="month")
)

### Remove duplicates

In [145]:
# Index by (supplier_name, reporting_month) so repeated supplier-month pairs
# can be detected through the index in the next cell.
tmp = merge_df.set_index(keys=["supplier_name", "reporting_month"])

In [149]:
# Keep only the first row for each (supplier_name, reporting_month) pair, then
# turn the two index levels back into ordinary columns.
merge_df = tmp.loc[~tmp.index.duplicated(keep="first")].reset_index()

### Calculate percent changes by month...

In [150]:
def pct_change(new, old):
    """Return the fractional change from ``old`` to ``new``: ``(new - old) / old``.

    Parameters
    ----------
    new : number or array-like
        The current value(s).
    old : number or array-like
        The baseline value(s) that ``new`` is compared against.

    Returns
    -------
    float or array-like
        The change as a fraction (``-0.1`` means a 10% decrease). For scalar
        inputs, a change that cannot be computed (zero baseline, or any
        infinite result) is returned as NaN so it can be removed with
        ``dropna`` downstream. Array-like inputs (e.g. pandas Series) are
        returned exactly as the arithmetic produces them.
    """
    try:
        change = (new - old) / old
    except ZeroDivisionError:
        # Plain Python numbers raise on a zero baseline; report "undefined".
        return float("nan")
    # NumPy scalars give +/-inf on a zero baseline instead of raising
    # (np.float64 is a float subclass); normalise those to NaN as well.
    if isinstance(change, float) and change in (float("inf"), float("-inf")):
        return float("nan")
    return change

### ...by district

In [151]:
# Per-row change of current production relative to the baseline for each
# supplier-month.
merge_df["gallons_pct_change"] = merge_df.apply(
    lambda row: pct_change(row["total_gallons_current"], row["total_gallons_baseline"]),
    axis=1,
)

Filter out outliers (currently disabled: the two cells below are commented out)

In [152]:
# Disabled: bounds on gallons_pct_change for the commented-out outlier filter.
# lower_thresh=-0.5
# upper_thresh=1.3

In [153]:
# Disabled: would keep only rows with lower_thresh < gallons_pct_change < upper_thresh.
# While this stays commented out, no outlier filtering is applied to merge_df.
# districts_filtered = merge_df[
#     (merge_df['gallons_pct_change']<upper_thresh)&(merge_df['gallons_pct_change']>lower_thresh)
# ]

### ...by region

In [154]:
# Monthly totals per hydrologic region. The "supplier_name" column of the
# result holds the number of rows in each group, not a name.
# NOTE(review): sum() skips missing baselines, so rows without a baseline match
# still add to total_gallons_current but not to total_gallons_baseline -
# confirm that is intended.
regions_df = (
    merge_df
    .groupby(["hydrologic_region", "reporting_month"])
    .agg(
        total_gallons_current=("total_gallons_current", "sum"),
        total_gallons_baseline=("total_gallons_baseline", "sum"),
        supplier_name=("supplier_name", "size"),
    )
    .reset_index()
)

In [155]:
# Change of each region-month total relative to its baseline total.
regions_df["gallons_pct_change"] = regions_df.apply(
    lambda row: pct_change(row["total_gallons_current"], row["total_gallons_baseline"]),
    axis=1,
)

In [156]:
# regions_df[
#     (regions_df.reporting_month==regions_df.reporting_month.max())
# ]

### ...and statewide

In [157]:
# Constant label so the whole frame can be grouped as a single statewide unit.
merge_df = merge_df.assign(state="Statewide")

In [158]:
# Statewide current and baseline totals for each reporting month.
statewide_df = (
    merge_df
    .groupby(["state", "reporting_month"])[["total_gallons_current", "total_gallons_baseline"]]
    .sum()
    .reset_index()
)

In [159]:
# Change of each monthly statewide total relative to its baseline total.
statewide_df["gallons_pct_change"] = statewide_df.apply(
    lambda row: pct_change(row["total_gallons_current"], row["total_gallons_baseline"]),
    axis=1,
)

### Calculate cumulative savings since July 2021

Get date of this monthly report

In [160]:
# The most recent reporting_month present in the statewide table, read from the
# first row that carries that maximum.
date = statewide_df.loc[
    statewide_df["reporting_month"] == statewide_df["reporting_month"].max(),
    "reporting_month",
].iloc[0]

Drop suppliers with missing data: keep only suppliers that have as many monthly reports as the supplier with the most

In [161]:
# Number of non-null reporting months each supplier has.
counts = (
    merge_df
    .groupby("supplier_name")["reporting_month"]
    .count()
    .reset_index()
)

In [162]:
# Largest number of monthly reports held by any single supplier; shown as the
# cell output.
max_count = max(counts["reporting_month"])
max_count

13

In [163]:
# Names of the suppliers whose report count equals the maximum.
suppliers_with_complete_data = counts.loc[
    counts["reporting_month"] == max_count,
    "supplier_name",
].tolist()

In [164]:
# Independent copy of the monthly rows belonging to those suppliers.
complete_data = merge_df.loc[
    merge_df["supplier_name"].isin(suppliers_with_complete_data)
].copy()

By district

In [165]:
# Per-supplier totals over every monthly row in complete_data.
# NOTE(review): total_population_served is summed across all of a supplier's
# monthly rows, so the result is a multi-month total rather than a headcount -
# confirm that is intended.
district_cumulative_savings_df = (
    complete_data
    .groupby(["supplier_name", "hydrologic_region"])[
        ["total_gallons_current", "total_gallons_baseline", "total_population_served"]
    ]
    .sum()
    .reset_index()
)

In [166]:
# Cumulative change per district, computed from the district totals themselves.
# The previous version applied pct_change over complete_data (one row per
# supplier-month); assigning that result here aligned on the row index and
# filled the column with unrelated single-month values.
# A baseline total of 0 (e.g. every monthly baseline missing, which sum()
# reports as 0) is turned into NaN first, so such districts get NaN and are
# removed by the dropna on cumulative_pct_change that follows.
district_cumulative_savings_df["cumulative_pct_change"] = pct_change(
    district_cumulative_savings_df["total_gallons_current"],
    district_cumulative_savings_df["total_gallons_baseline"].replace(0, float("nan")),
)

Drop districts for which we can't calculate a percentage change

In [167]:
# Districts that have a computed cumulative_pct_change.
# NOTE(review): the later cells visible in this notebook aggregate and export
# district_cumulative_savings_df, not this frame - confirm which is intended.
drop_na_districts = district_cumulative_savings_df.dropna(
    subset=["cumulative_pct_change"]
)

By region

In [170]:
# Cumulative totals per hydrologic region; total_reports is the number of
# district rows that fall in each region.
regions_cumulative_savings_df = (
    district_cumulative_savings_df
    .groupby(["hydrologic_region"])
    .agg(
        total_gallons_current=("total_gallons_current", "sum"),
        total_gallons_baseline=("total_gallons_baseline", "sum"),
        total_reports=("supplier_name", "size"),
    )
    .reset_index()
)

In [171]:
# Cumulative change of each region's total relative to its baseline total.
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df.apply(
    lambda row: pct_change(row["total_gallons_current"], row["total_gallons_baseline"]),
    axis=1,
)

In [172]:
# Stamp every regional row with the latest reporting month.
regions_cumulative_savings_df = regions_cumulative_savings_df.assign(date=date)

Statewide

In [89]:
# Constant label so the district totals can be grouped into one statewide row.
district_cumulative_savings_df = district_cumulative_savings_df.assign(state="California")

In [90]:
# Single statewide row of cumulative totals; total_reports is the number of
# district rows that went into it.
statewide_cumulative_savings_df = (
    district_cumulative_savings_df
    .groupby(["state"])
    .agg(
        total_gallons_current=("total_gallons_current", "sum"),
        total_gallons_baseline=("total_gallons_baseline", "sum"),
        total_reports=("supplier_name", "size"),
    )
    .reset_index()
)

In [91]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [92]:
# Cumulative change of the statewide total relative to the baseline total.
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df.apply(
    lambda row: pct_change(row["total_gallons_current"], row["total_gallons_baseline"]),
    axis=1,
)

In [93]:
# Stamp the statewide row with the latest reporting month.
statewide_cumulative_savings_df = statewide_cumulative_savings_df.assign(date=date)

### Round numbers

In [94]:
# Round the statewide gallon totals to zero decimal places.
statewide_cumulative_savings_df = statewide_cumulative_savings_df.round(
    {"total_gallons_current": 0, "total_gallons_baseline": 0}
)

In [95]:
# Keep three decimal places on the statewide cumulative change.
statewide_cumulative_savings_df = statewide_cumulative_savings_df.round(
    {"cumulative_pct_change": 3}
)

In [96]:
# Round the regional gallon totals to zero decimal places.
regions_cumulative_savings_df = regions_cumulative_savings_df.round(
    {"total_gallons_current": 0, "total_gallons_baseline": 0}
)

In [97]:
# Keep three decimal places on the regional cumulative change.
regions_cumulative_savings_df = regions_cumulative_savings_df.round(
    {"cumulative_pct_change": 3}
)

### Charts

In [98]:
# alt.Chart(
#     regions_df[regions_df.hydrologic_region=="South Coast"]
# ).mark_bar().encode(
#     x="reporting_month:O",
#     y="gallons_pct_change:Q",
#     color=alt.condition(
#         alt.datum.gallons_pct_change > 0,
#         alt.value("#e6ae56"),  # The positive color
#         alt.value("#83c6e0")  # The negative color
#     ),
#     tooltip=["gallons_pct_change"]
# ).properties(title="Monthly water conservation in the South Coast", width=600)

### Export

Monthly

In [99]:
# Monthly statewide totals, written without the row index.
statewide_df.to_csv(
    "../data/processed/monthly-conservation/statewide-conservation-monthly.csv",
    index=False,
)

In [100]:
# Monthly totals per hydrologic region, written without the row index.
regions_df.to_csv(
    "../data/processed/monthly-conservation/regional-conservation-monthly.csv",
    index=False,
)

In [101]:
# Monthly supplier-level rows, written without the row index.
merge_df.to_csv(
    "../data/processed/monthly-conservation/district-level-conservation-monthly.csv",
    index=False,
)

Cumulative

In [102]:
# Cumulative statewide row, written without the row index.
statewide_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/statewide-conservation-cumulative.csv",
    index=False,
)

In [103]:
# Cumulative totals per hydrologic region, written without the row index.
# NOTE(review): the file is named "monthly-conservation-cumulative.csv" while
# the sibling exports use "statewide-..." and "district-level-..."; this looks
# like it was meant to be "regional-conservation-cumulative.csv". Left as is
# because downstream readers of this path are not visible here - confirm.
regions_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/monthly-conservation-cumulative.csv",
    index=False,
)

In [104]:
# Cumulative supplier-level totals, written without the row index.
district_cumulative_savings_df.to_csv(
    "../data/processed/cumulative-conservation/district-level-conservation-cumulative.csv",
    index=False,
)