In [2]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [3]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

### Import

In [4]:
df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])

In [5]:
baselines_df = pd.read_csv("../data/raw/uw-2020-baseline-values.csv")

### Clean

Remove junk from column names

In [6]:
df.columns = df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

Eliminate double spaces in supplier names

In [7]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

Clean up baselines column names as we did in the previous notebook

In [8]:
baselines_df.columns = baselines_df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

In [9]:
baselines_df.supplier_name = baselines_df.supplier_name.str.replace("  ", " ")

In [10]:
baselines_trimmed_df = baselines_df[[
    'supplier_name', 
    'public_water_system_id', 
    'month', 
    #'original_units',
    # 'total_potable_production_original_units',
    # 'potable_commercial_agriculture_original_units',
    'total_potable_production_minus_ag_gallons', 
    # 'staff_notes'
]].copy()

Get rid of some unnecessary columns

In [11]:
trim_df = df[[
    'supplier_name', 
    'public_water_system_id', 
    'reporting_month', 
    'county',
    'hydrologic_region', 
    'climate_zone', 
    'total_population_served',
    'calculated_total_potable_water_production_gallons_ag_excluded',
    'calculated_r_gpcd', 
    # 'month'
]].copy()

### Merge

Filter df to July 2021 and later

In [12]:
filter_df = trim_df[trim_df.reporting_month >= "2021-07-15"].copy()

Remove -999999999

In [13]:
filter_df = filter_df[filter_df.calculated_total_potable_water_production_gallons_ag_excluded != -999999999]

Make a month column to merge with baselines

In [14]:
filter_df["month"] = filter_df["reporting_month"].dt.month

In [15]:
merge_df = pd.merge(
    filter_df, 
    baselines_trimmed_df, 
    how="left", 
    on=["supplier_name", "public_water_system_id", "month"]
).rename(
    columns={
        "calculated_total_potable_water_production_gallons_ag_excluded": "total_gallons_current",
        "total_potable_production_minus_ag_gallons": "total_gallons_baseline"
    }
).drop("month", axis=1)

### Calculate percent changes by month...

In [16]:
def pct_change(new, old):
    return (new - old) / old

### ...by district

In [17]:
merge_df["gallons_pct_change"] = merge_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Filter out outliers

In [18]:
lower_thresh=-0.5
upper_thresh=1.3

In [19]:
districts_filtered = merge_df[
    (merge_df['gallons_pct_change']<upper_thresh)&(merge_df['gallons_pct_change']>lower_thresh)
]

### ...by region

In [20]:
regions_df = districts_filtered.groupby(
    ["hydrologic_region", "reporting_month"]
)[["total_gallons_current","total_gallons_baseline", "supplier_name"]].agg({
    "total_gallons_current":"sum",
    "total_gallons_baseline":"sum",
    "supplier_name":"size"
}).reset_index()

In [21]:
regions_df["gallons_pct_change"] = regions_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [22]:
regions_df[
    (regions_df.reporting_month==regions_df.reporting_month.max())
]

Unnamed: 0,hydrologic_region,reporting_month,total_gallons_current,total_gallons_baseline,supplier_name,gallons_pct_change
10,Central Coast,2022-05-15,4351937000.0,4450432000.0,28,-0.022132
21,Colorado River,2022-05-15,6085725000.0,5586228000.0,11,0.089416
32,North Coast,2022-05-15,1179134000.0,1222518000.0,14,-0.035487
43,North Lahontan,2022-05-15,358507100.0,440603300.0,5,-0.186327
54,Sacramento River,2022-05-15,14388380000.0,14745350000.0,42,-0.024209
65,San Francisco Bay,2022-05-15,21476610000.0,23459560000.0,43,-0.084526
76,San Joaquin River,2022-05-15,9525491000.0,9978599000.0,27,-0.045408
87,South Coast,2022-05-15,80079010000.0,81893920000.0,161,-0.022162
98,South Lahontan,2022-05-15,4187908000.0,4716062000.0,16,-0.111991
109,Tulare Lake,2022-05-15,13021440000.0,13089400000.0,27,-0.005192


### ...and statewide

In [23]:
districts_filtered["state"] = "Statewide"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  districts_filtered["state"] = "Statewide"


In [24]:
statewide_df = districts_filtered.groupby(["state", "reporting_month"])[["total_gallons_current","total_gallons_baseline"]].sum().reset_index()

In [25]:
statewide_df["gallons_pct_change"] = statewide_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

### Calculate cumulative savings since July 2021

Get date of this monthly report

In [35]:
date = statewide_df[
    (statewide_df.reporting_month==statewide_df.reporting_month.max())
].iloc[0]["reporting_month"]

Drop suppliers with missing data

In [36]:
counts = districts_filtered.groupby("supplier_name")["reporting_month"].count().reset_index()

In [37]:
max_count = max(counts.reporting_month)
max_count

11

In [38]:
suppliers_with_complete_data = list(counts[counts.reporting_month == max_count].supplier_name)

In [39]:
complete_data = districts_filtered[districts_filtered.supplier_name.isin(suppliers_with_complete_data)].copy()

By district

In [40]:
district_cumulative_savings_df = complete_data.groupby(
    ["supplier_name","hydrologic_region"]
)[["total_gallons_current","total_gallons_baseline", "total_population_served"]].sum().reset_index()

In [41]:
district_cumulative_savings_df["cumulative_pct_change"] = complete_data.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

Drop districts for which we can't calculate a percentage change

In [42]:
drop_na_districts = district_cumulative_savings_df.dropna(subset="cumulative_pct_change")

By region

In [43]:
regions_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["hydrologic_region"]
)[
    ["total_gallons_current","total_gallons_baseline","supplier_name"]
].agg(
    {"total_gallons_current":"sum","total_gallons_baseline":"sum","supplier_name":"size"}
).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [44]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [45]:
regions_cumulative_savings_df['date'] = date

Statewide

In [46]:
district_cumulative_savings_df["state"] = "California"

In [47]:
statewide_cumulative_savings_df = district_cumulative_savings_df.groupby(
    ["state"]
)[
    ["total_gallons_current","total_gallons_baseline", "supplier_name"]
].agg({
    "total_gallons_current": "sum",
    "total_gallons_baseline": "sum",
    "supplier_name": "size"
}).reset_index().rename(
    columns={"supplier_name": "total_reports"}
)

In [48]:
#statewide_cumulative_savings_df["total_reports"] = statewide_cumulative_savings_df["total_reports"]

In [49]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df.apply(lambda x: pct_change(x.total_gallons_current, x.total_gallons_baseline), axis=1)

In [50]:
statewide_cumulative_savings_df['date'] = date

### Round numbers

In [52]:
statewide_cumulative_savings_df["total_gallons_current"] = statewide_cumulative_savings_df["total_gallons_current"].round(0)
statewide_cumulative_savings_df["total_gallons_baseline"] = statewide_cumulative_savings_df["total_gallons_baseline"].round(0)

In [53]:
statewide_cumulative_savings_df["cumulative_pct_change"] = statewide_cumulative_savings_df["cumulative_pct_change"].round(3)

In [54]:
regions_cumulative_savings_df["total_gallons_current"] = regions_cumulative_savings_df["total_gallons_current"].round(0)
regions_cumulative_savings_df["total_gallons_baseline"] = regions_cumulative_savings_df["total_gallons_baseline"].round(0)

In [55]:
regions_cumulative_savings_df["cumulative_pct_change"] = regions_cumulative_savings_df["cumulative_pct_change"].round(3)

### Charts

In [56]:
# alt.Chart(
#     regions_df[regions_df.hydrologic_region=="South Coast"]
# ).mark_bar().encode(
#     x="reporting_month:O",
#     y="gallons_pct_change:Q",
#     color=alt.condition(
#         alt.datum.gallons_pct_change > 0,
#         alt.value("#e6ae56"),  # The positive color
#         alt.value("#83c6e0")  # The negative color
#     ),
#     tooltip=["gallons_pct_change"]
# ).properties(title="Monthly water conservation in the South Coast", width=600)

### Export

Monthly

In [57]:
statewide_df.to_csv("../data/processed/monthly-conservation/statewide-conservation-monthly.csv", index=False)

In [58]:
regions_df.to_csv("../data/processed/monthly-conservation/regional-conservation-monthly.csv", index=False)

In [59]:
merge_df.to_csv("../data/processed/monthly-conservation/district-level-conservation-monthly.csv", index=False)

Cumulative

In [60]:
statewide_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/statewide-conservation-cumulative.csv", index=False)

In [61]:
regions_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/monthly-conservation-cumulative.csv", index=False)

In [62]:
district_cumulative_savings_df.to_csv("../data/processed/cumulative-conservation/district-level-conservation-cumulative.csv", index=False)