In [81]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [82]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

### Import

In [83]:
df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])

In [84]:
baselines_df = pd.read_csv("../data/raw/uw-2020-baseline-values.csv")

### Clean

Remove junk from column names

In [85]:
df.columns = df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

Eliminate double spaces in supplier names

In [86]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

Clean up baselines column names as we did in the previous notebook

In [87]:
baselines_df.columns = baselines_df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

In [88]:
baselines_df.supplier_name = baselines_df.supplier_name.str.replace("  ", " ")

In [89]:
baselines_trimmed_df = baselines_df[[
    'supplier_name', 
    'public_water_system_id', 
    'month', 
    #'original_units',
    # 'total_potable_production_original_units',
    # 'potable_commercial_agriculture_original_units',
    'total_potable_production_minus_ag_gallons', 
    # 'staff_notes'
]].copy()

Get rid of some unnecessary columns

In [90]:
trim_df = df[[
    'supplier_name', 
    'public_water_system_id', 
    'reporting_month', 
    'county',
    'hydrologic_region', 
    'climate_zone', 
    'total_population_served',
    'calculated_total_potable_water_production_gallons_ag_excluded',
    'calculated_r_gpcd', 
    # 'month'
]].copy()

### Merge

Filter df to July 2021 and later

In [91]:
filter_df = trim_df[trim_df.reporting_month >= "2021-07-15"].copy()

Remove -999999999

In [92]:
filter_df = filter_df[filter_df.calculated_total_potable_water_production_gallons_ag_excluded != -999999999]

Make a month column to merge with baselines

In [93]:
filter_df["month"] = filter_df["reporting_month"].dt.month

In [94]:
merge_df = pd.merge(
    filter_df, 
    baselines_trimmed_df, 
    how="left", 
    on=["supplier_name", "public_water_system_id", "month"]
).rename(
    columns={
        "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
        "total_potable_production_minus_ag_gallons": "total_gallons_baseline"
    }
).drop("month", axis=1)

### Remove duplicates

In [95]:
tmp = merge_df.set_index(['supplier_name', 'reporting_month'])

In [96]:
merge_df = tmp[~tmp.index.duplicated()].reset_index()

### Calculate percent changes by month...

In [97]:
def pct_change(new, old):
    return (new - old) / old

### ...by district

In [98]:
merge_df["gallons_pct_change"] = merge_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Filter out outliers

In [99]:
# lower_thresh=-0.5
# upper_thresh=1.3

In [100]:
# districts_filtered = merge_df[
#     (merge_df['gallons_pct_change']<upper_thresh)&(merge_df['gallons_pct_change']>lower_thresh)
# ]

### ...by region

In [101]:
regions_df = merge_df.groupby(
    ["hydrologic_region", "reporting_month"]
)[["total_gallons_current","total_gallons_baseline", "supplier_name"]].agg({
    "total_gallons_current":"sum",
    "total_gallons_baseline":"sum",
    "supplier_name":"size"
}).reset_index()

In [102]:
regions_df["gallons_pct_change"] = regions_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [103]:
# regions_df[
#     (regions_df.reporting_month==regions_df.reporting_month.max())
# ]

### ...and statewide

In [104]:
merge_df["state"] = "Statewide"

In [105]:
statewide_df = merge_df.groupby(["state", "reporting_month"])[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [106]:
statewide_df["gallons_pct_change"] = statewide_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

### Calculate cumulative savings since July 2021

Set up some vars and function

In [107]:
def month_diff(a, b):
    return 12 * (a.year - b.year) + (a.month - b.month)

In [108]:
def gallons_per_unit(gallons, baseline, population, time):
    return (baseline - gallons) / population / time

In [109]:
voluntary_reductions_begin = pd.to_datetime('2021-07-15')
voluntary_reductions_begin

Timestamp('2021-07-15 00:00:00')

In [110]:
latest_month = statewide_df[statewide_df.reporting_month ==statewide_df.reporting_month.max() ].iloc[0]['reporting_month']

In [111]:
months_since_reductions_begin = month_diff(latest_month, voluntary_reductions_begin)
months_since_reductions_begin

18

In [112]:
days_since_reductions = (latest_month - voluntary_reductions_begin).days
days_since_reductions

549

Get date of this monthly report

In [113]:
date = statewide_df[
    (statewide_df.reporting_month==statewide_df.reporting_month.max())
].iloc[0]["reporting_month"]

Drop suppliers with missing data

In [114]:
counts = merge_df.groupby("supplier_name")["reporting_month"].count().reset_index()

In [115]:
max_count = max(counts.reporting_month)
max_count

19

In [116]:
suppliers_with_complete_data = list(counts[counts.reporting_month == max_count].supplier_name)

In [117]:
complete_data = merge_df[merge_df.supplier_name.isin(suppliers_with_complete_data)].copy()

By district

In [118]:
district_cumulative_savings_df = complete_data.groupby(
    ["supplier_name","hydrologic_region"]
)[["total_gallons_current","total_gallons_baseline", "total_population_served"]].sum().reset_index()

In [119]:
district_cumulative_savings_df["cumulative_pct_change"] = complete_data.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Drop districts for which we can't calculate a percentage change

In [120]:
drop_na_districts = district_cumulative_savings_df.dropna(subset="cumulative_pct_change")

By region

In [158]:
regions_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["hydrologic_region"]
)[
    ["total_gallons_current","total_gallons_baseline","total_population_served","supplier_name"]
].agg(
    {"total_gallons_current":"sum","total_gallons_baseline":"sum","total_population_served":"mean","supplier_name":"size"}
).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [159]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [160]:
regions_cumulative_savings_df['date'] = date

In [161]:
regions_cumulative_savings_df['months_since'] = months_since_reductions_begin
regions_cumulative_savings_df['days_since'] = days_since_reductions

In [162]:
regions_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [163]:
regions_cumulative_savings_df['gallons_saved_per_day'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [164]:
regions_cumulative_savings_df

Unnamed: 0,hydrologic_region,total_gallons_current,total_gallons_baseline,total_population_served,total_reports,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,Central Coast,71408710000.0,74878510000.0,810274.2,27,-0.046339,2023-01-15,18,549,237.903007,7.800099
1,Colorado River,101101200000.0,102871500000.0,1145109.0,11,-0.017209,2023-01-15,18,549,85.887807,2.815994
2,North Coast,16757060000.0,20081550000.0,522155.9,12,-0.16555,2023-01-15,18,549,353.714359,11.597192
3,North Lahontan,7430619000.0,8155529000.0,354425.8,5,-0.088886,2023-01-15,18,549,113.628155,3.725513
4,Sacramento River,282005300000.0,305050200000.0,1313972.0,40,-0.075544,2023-01-15,18,549,974.350365,31.945914
5,San Francisco Bay,349424600000.0,393598200000.0,3187859.0,37,-0.11223,2023-01-15,18,549,769.823427,25.240112
6,San Joaquin River,128730600000.0,135698400000.0,1244461.0,21,-0.051347,2023-01-15,18,549,311.056293,10.198567
7,South Coast,1397174000000.0,1473347000000.0,2459906.0,149,-0.051701,2023-01-15,18,549,1720.330169,56.404268
8,South Lahontan,62524000000.0,66887080000.0,1030816.0,13,-0.06523,2023-01-15,18,549,235.146737,7.709729
9,Tulare Lake,207984700000.0,214913600000.0,1434569.0,24,-0.032241,2023-01-15,18,549,268.332445,8.797785


Statewide

In [128]:
district_cumulative_savings_df["state"] = "California"

In [173]:
statewide_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["state"]
)[
    ["total_gallons_current","total_gallons_baseline", "supplier_name", "total_population_served"]
].agg({
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "total_population_served": "sum",
    "supplier_name": "size"
}).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [174]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [175]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [176]:
statewide_cumulative_savings_df['date'] = date

In [177]:
statewide_cumulative_savings_df['months_since'] = months_since_reductions_begin
statewide_cumulative_savings_df['days_since'] = days_since_reductions

In [178]:
statewide_cumulative_savings_df['gallons_saved_per_month'] = regions_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.months_since
    ), axis=1
)

In [179]:
statewide_cumulative_savings_df['gallons_saved_per_day'] = statewide_cumulative_savings_df.apply(
    lambda x: gallons_per_unit(
        x.total_gallons_current, x.total_gallons_baseline, x.total_population_served, x.days_since
    ), axis=1
)

In [180]:
statewide_cumulative_savings_df

Unnamed: 0,state,total_gallons_current,total_gallons_baseline,total_population_served,total_reports,cumulative_pct_change,date,months_since,days_since,gallons_saved_per_month,gallons_saved_per_day
0,California,2624541000000.0,2795482000000.0,653511233,339,-0.061149,2023-01-15,18,549,237.903007,0.476454


### Round numbers

In [137]:
statewide_cumulative_savings_df["total_gallons_current"] = statewide_cumulative_savings_df["total_gallons_current"].round(0)
statewide_cumulative_savings_df["total_gallons_baseline"] = statewide_cumulative_savings_df["total_gallons_baseline"].round(0)

In [138]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df["cumulative_pct_change"].round(3)

In [139]:
regions_cumulative_savings_df["total_gallons_current"] = regions_cumulative_savings_df["total_gallons_current"].round(0)
regions_cumulative_savings_df["total_gallons_baseline"] = regions_cumulative_savings_df["total_gallons_baseline"].round(0)

In [140]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df["cumulative_pct_change"].round(3)

### Charts

In [141]:
# alt.Chart(
#     regions_df[regions_df.hydrologic_region=="South Coast"]
# ).mark_bar().encode(
#     x="reporting_month:O",
#     y="gallons_pct_change:Q",
#     color=alt.condition(
#         alt.datum.gallons_pct_change > 0,
#         alt.value("#e6ae56"),  # The positive color
#         alt.value("#83c6e0")  # The negative color
#     ),
#     tooltip=["gallons_pct_change"]
# ).properties(title="Monthly water conservation in the South Coast", width=600)

### Export

Monthly

In [142]:
statewide_df.to_csv("../data/processed/monthly-conservation/statewide-conservation-monthly.csv", index=False)

In [143]:
regions_df.to_csv("../data/processed/monthly-conservation/regional-conservation-monthly.csv", index=False)

In [144]:
merge_df.to_csv("../data/processed/monthly-conservation/district-level-conservation-monthly.csv", index=False)

Cumulative

In [145]:
statewide_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/statewide-conservation-cumulative.csv", index=False)

In [146]:
regions_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/monthly-conservation-cumulative.csv", index=False)

In [147]:
district_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/district-level-conservation-cumulative.csv", index=False)