In [190]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [191]:
# Register the theme shipped with altair_latimes under the name "latimes",
# then switch Altair over to it for every chart in this notebook.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [192]:
# Raw monthly usage reports; "Reporting Month" is parsed into datetimes on load.
df = pd.read_csv(
    "../data/raw/uw-usage.csv",
    parse_dates=["Reporting Month"],
)

In [193]:
# Lookup table of cleaned-up supplier names, merged onto the usage data further down.
clean_names = pd.read_csv(
    "../data/metadata/urban-water-suppliers-clean-names.csv"
)

### Clean

Remove junk from column names

In [194]:
# Normalize the headers to snake_case:
#   1. spaces and hyphens become underscores
#   2. the characters # @ & ( ) are dropped -- the commas inside the character
#      class are literal, so commas are dropped as well
#   3. everything is lower-cased
df.columns = (
    df.columns
    .str.replace(" ","_")
    .str.replace("-","_")
    .str.replace(r'[#,@,&,(,)]', '', regex=True)
    .str.lower()
)

Eliminate double spaces in supplier names

In [195]:
# Replace each pair of consecutive spaces in the supplier name with a single space.
df["supplier_name"] = df["supplier_name"].str.replace("  ", " ")

Trim this down to just the columns we need

### Merge clean names

In [196]:
# Left join: every usage row is kept, and the clean-name columns are attached
# where both the system id and the supplier name match.
merge_names_df = df.merge(
    clean_names,
    how="left",
    left_on=["public_water_system_id", "supplier_name"],
    right_on=["id", "supplier_name"],
)

### Trim

In [197]:
# Columns carried forward from the merged frame.
# Commented-out names are not selected; they are kept here for reference.
keeps = [
    "supplier_name",
    "display_name",
    "public_water_system_id",
    "reporting_month",
    "county",
    "hydrologic_region",
    # "climate_zone",
    "total_population_served",
    ### MAYBE PILE
    "county_under_drought_declaration",
    "water_shortage_contingency_stage_invoked",
    "water_shortage_level_indicator",
    "dwr_state_standard_level_corresponding_to_stage",
    ### NEED THIS IF YOU WANT TO RECALCULATE R-GPCD BY DISTRICT
    # "water_production_units",
    # "reported_preliminary_total_potable_water_production",
    # "reported_final_total_potable_water_production",
    # "preliminary_percent_residential_use",
    # "final_percent_residential_use",
    ### THESE COLUMNS ARE OPTIONAL FOR SUPPLIERS TO FILL OUT
    # "reported_preliminary_commercial_agricultural_water",
    # "reported_final_commercial_agricultural_water",
    # "reported_preliminary_commercial,_industrial_and_institutional_water",
    # "reported_final_commercial_industrial_and_institutional_water",
    # "reported_recycled_water",
    # "reported_non_revenue_water",
    "calculated_total_potable_water_production_gallons_ag_excluded",
    # "calculated_total_potable_water_production_gallons_2013_ag_excluded",
    # "calculated_commercial_agricultural_water_gallons",
    # "calculated_commercial_agricultural_water_gallons_2013",
    "calculated_r_gpcd",
    # "qualification"
]

In [198]:
# Keep only the columns listed in `keeps`, in that order.
trim_df = merge_names_df[keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions

In [199]:
def regional_calcs(df, gals, rgpcd, pop):
    """
    Summarize one group of rows into totals and a weighted average.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single group (this is what ``groupby(...).apply`` passes in).
    gals : str
        Name of the column that is summed into ``total_gallons``.
    rgpcd : str
        Name of the column whose values are averaged.
    pop : str
        Name of the column used as the weight and summed into ``total_pop``.

    Returns
    -------
    pandas.Series
        Three entries, indexed ``total_pop``, ``total_gallons`` and
        ``pop_weighted_rgpcd``. The weighted average is NaN when no row has
        both a value and a non-zero total weight.
    """
    val = df[rgpcd]
    wt = df[pop]

    # Only rows that have both a value and a weight may contribute to the average.
    # Otherwise a missing value is skipped by the numerator's sum() while its
    # weight still counts in the denominator, which drags the average down.
    usable = val.notna() & wt.notna()
    weight_total = wt[usable].sum()
    if weight_total:
        wt_avg = (val[usable] * wt[usable]).sum() / weight_total
    else:
        # Nothing to average (empty group, all-missing values, or zero weight).
        wt_avg = float("nan")

    total_gals = df[gals].sum()
    total_pop = wt.sum()
    return pd.Series(
        [total_pop, total_gals, wt_avg],
        index=['total_pop', 'total_gallons', 'pop_weighted_rgpcd'],
    )

In [200]:
# One summary row per (reporting month, hydrologic region).
region_df = (
    trim_df
    .groupby(["reporting_month", "hydrologic_region"])
    .apply(
        regional_calcs,
        gals="calculated_total_potable_water_production_gallons_ag_excluded",
        rgpcd="calculated_r_gpcd",
        pop="total_population_served",
    )
    .reset_index()
)

Now do it for the entire state

In [201]:
# Same summary, grouped by reporting month only (all suppliers together).
statewide_df = (
    trim_df
    .groupby(["reporting_month"])
    .apply(
        regional_calcs,
        gals="calculated_total_potable_water_production_gallons_ag_excluded",
        rgpcd="calculated_r_gpcd",
        pop="total_population_served",
    )
    .reset_index()
)

In [202]:
# in case we need to recalculate r-gpcd, use this dict for days per month
# days_per_month = {
#     "1": 31,
#     "2": 28,
#     "3": 31,
#     "4": 30,
#     "5": 31,
#     "6": 30,
#     "7": 31,
#     "8": 31,
#     "9": 30,
#     "10": 31,
#     "11": 30,
#     "12": 31
# }

### Backfill missing dates

In [203]:
# Earliest reporting month in the data; used as the start of the backfill range.
min_date = trim_df["reporting_month"].min()
min_date

Timestamp('2014-06-15 00:00:00')

In [204]:
# Latest reporting month in the data; used as the end of the backfill range.
max_date = trim_df["reporting_month"].max()
max_date

Timestamp('2022-04-15 00:00:00')

In [205]:
def backfill(agency_group, start=None, end=None):
    """
    Expand a supplier group so it has one row per reporting month.

    The month range starts at ``start``, ends at ``end`` and steps one month at a
    time with the day pinned to the 15th. When ``start`` / ``end`` are omitted the
    notebook-level ``min_date`` / ``max_date`` are used, so every group is expanded
    over the same span rather than over its own first and last report.

    After expansion:

    * missing ``calculated_r_gpcd`` and
      ``calculated_total_potable_water_production_gallons_ag_excluded`` values
      (including those of the newly created rows) are set to 0;
    * every other column is forward-filled within each supplier, so rows that
      precede a supplier's first report stay empty in those columns.

    Parameters
    ----------
    agency_group : pandas.DataFrame
        Rows to expand; must contain ``supplier_name`` and ``reporting_month``.
    start, end : datetime-like, optional
        Bounds of the month range. Default to ``min_date`` and ``max_date``.

    Returns
    -------
    pandas.DataFrame
        The expanded rows with ``supplier_name`` and ``reporting_month`` as
        ordinary columns.
    """
    range_start = min_date if start is None else start
    range_end = max_date if end is None else end

    agency_df = agency_group.sort_values(["supplier_name", "reporting_month"]).set_index(
        ["supplier_name", "reporting_month"]
    )

    # Build the complete (supplier, month) index
    ## Every month in the requested span
    date_range = pd.date_range(
        range_start,
        range_end,
        freq=pd.DateOffset(months=1, day=15),
    )
    ## Every supplier name present in the group
    name_range = agency_df.index.unique(level="supplier_name")
    ## An entry for every supplier on every month
    namedate_index = pd.MultiIndex.from_product(
        iterables=[name_range, date_range], names=["supplier_name", "reporting_month"]
    )
    ## Reindex so months without a report show up as empty rows
    backfilled_df = agency_df.reindex(namedate_index)

    # Zero out missing usage figures. Assign the result instead of calling
    # fillna(inplace=True) on a column accessor: that chained form is deprecated
    # and does not modify the frame when pandas copy-on-write is active.
    backfilled_df = backfilled_df.fillna(
        {
            "calculated_r_gpcd": 0,
            "calculated_total_potable_water_production_gallons_ag_excluded": 0,
        }
    )

    # Forward-fill the remaining columns within each supplier
    backfilled_df = backfilled_df.groupby("supplier_name").ffill()

    # Turn the index levels back into columns and return
    return backfilled_df.reset_index()

In [206]:
# Expand each supplier separately, then discard the group index that apply() adds.
backfilled_df = (
    trim_df
    .groupby("supplier_name")
    .apply(backfill)
    .reset_index(drop=True)
)

### Merge regional r-gpcd values to district df

In [207]:
# Attach the matching region/month weighted average to every district row.
merge_regions_df = backfilled_df.merge(
    region_df[["hydrologic_region", "reporting_month", "pop_weighted_rgpcd"]],
    how="left",
    on=["hydrologic_region", "reporting_month"],
)

### Rename r-gpcd columns for clarity

In [208]:
# Shorter names: the district's own figure vs. its region's weighted average.
rename_df = merge_regions_df.rename(
    columns={
        "calculated_r_gpcd": "r_gpcd",
        "pop_weighted_rgpcd": "regional_r_gpcd",
    }
)

In [209]:
# Spot check: last twelve rows for one agency.
rename_df.loc[rename_df["display_name"] == "City of Grover Beach"].tail(12)

Unnamed: 0,supplier_name,reporting_month,display_name,public_water_system_id,county,hydrologic_region,total_population_served,county_under_drought_declaration,water_shortage_contingency_stage_invoked,water_shortage_level_indicator,dwr_state_standard_level_corresponding_to_stage,calculated_total_potable_water_production_gallons_ag_excluded,r_gpcd,regional_r_gpcd
15473,Grover Beach City of,2021-05-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,,1,No,,38271199.95,71.7391,80.784047
15474,Grover Beach City of,2021-06-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,,1,No,,37749838.35,73.4968,88.273503
15475,Grover Beach City of,2021-07-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,39499658.22,74.1917,85.908112
15476,Grover Beach City of,2021-08-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,37283871.42,69.8996,85.824215
15477,Grover Beach City of,2021-09-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,35749113.21,69.6953,84.64933
15478,Grover Beach City of,2021-10-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,34543464.51,64.9944,73.326462
15479,Grover Beach City of,2021-11-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,32529705.33,63.3593,63.671441
15480,Grover Beach City of,2021-12-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,30502912.11,57.6365,56.101092
15481,Grover Beach City of,2022-01-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,28453309.32,53.3445,55.681674
15482,Grover Beach City of,2022-02-15,City of Grover Beach,CA4010004,San Luis Obispo,Central Coast,13459.0,Yes,1,No,,0.0,0.0,64.880666


In [210]:
# Spot check: last twelve rows for a second agency.
rename_df.loc[rename_df["display_name"] == "City of Pismo Beach"].tail(12)

Unnamed: 0,supplier_name,reporting_month,display_name,public_water_system_id,county,hydrologic_region,total_population_served,county_under_drought_declaration,water_shortage_contingency_stage_invoked,water_shortage_level_indicator,dwr_state_standard_level_corresponding_to_stage,calculated_total_potable_water_production_gallons_ag_excluded,r_gpcd,regional_r_gpcd
26398,Pismo Beach City of,2021-05-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,,2nd,No,,54567008.46,98.0786,80.784047
26399,Pismo Beach City of,2021-06-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,,2nd of 4,No,,54834206.28,101.955,88.273503
26400,Pismo Beach City of,2021-07-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,2nd of 4,No,,60618061.53,109.237,85.908112
26401,Pismo Beach City of,2021-08-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,No,1st of 4,No,,52644487.56,95.1421,85.824215
26402,Pismo Beach City of,2021-09-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,2nd of 4,No,,39297630.6,73.4317,84.64933
26403,Pismo Beach City of,2021-10-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,2nd of 4,No,,0.0,0.0,73.326462
26404,Pismo Beach City of,2021-11-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,No,1st of 4,No,,42370405.53,78.8935,63.671441
26405,Pismo Beach City of,2021-12-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,2nd of 4,No,,38808854.1,69.8883,56.101092
26406,Pismo Beach City of,2022-01-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,3rd of 4,Yes,,34973587.83,62.8408,55.681674
26407,Pismo Beach City of,2022-02-15,City of Pismo Beach,CA4010008,San Luis Obispo,Central Coast,8233.0,Yes,3rd of 4,Yes,,0.0,0.0,64.880666


### Chart

In [211]:
# Long format: one row per (agency, region, month, measure).
# NOTE(review): `melt` is not referenced again in the cells shown here -- confirm it is still needed.
melt = rename_df.melt(
    id_vars=["display_name", "hydrologic_region", "reporting_month"],
    value_vars=["r_gpcd", "regional_r_gpcd"],
)

In [212]:
agency_name = "City of Pismo Beach"

# Last twelve rows for the chosen agency
agency_rows = rename_df[(rename_df.display_name == agency_name)].tail(12)

# Shared x axis (year-month, ordinal) and tooltip for the data-driven layers
base = alt.Chart(agency_rows).encode(
    x=alt.X("yearmonth(reporting_month):O"),
    tooltip=["reporting_month"],
)

# Bars: the agency's own r_gpcd
bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None),
)

# Stepped line: the regional weighted average for the same months
avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("regional_r_gpcd"),
)

# Dashed horizontal rule at y=80
# NOTE(review): 80 is presumably the usage goal this chart compares against -- confirm the source.
goal_line = (
    alt.Chart(pd.DataFrame({'y': [80]}))
    .mark_rule(color="#b75a36",strokeDash=[10,11])
    .encode(y='y')
)

(bar + avg_line + goal_line).properties(
    title=f"{agency_name} residential water usage compared to regional average",
    width=600,
)

### Sort data

In [213]:
# District rows ordered by month, then supplier.
sort_district_df = rename_df.sort_values(by=["reporting_month", "supplier_name"])

In [214]:
# Regional rows ordered by month, then region.
sort_region_df = region_df.sort_values(by=["reporting_month", "hydrologic_region"])

In [215]:
# Statewide rows ordered by month.
sort_state_df = statewide_df.sort_values(by=["reporting_month"])

### Filter dataframe to be a bit more manageable

In [216]:
#filtered_district_df = sort_district_df[sort_district_df.reporting_month >= "2021-01-15"]

In [217]:
# The twelve most recent rows for each water system id.
last_twelve_months_df = (
    sort_district_df
    .sort_values("reporting_month")
    .groupby("public_water_system_id")
    .tail(12)
)

In [218]:
# The most recent row with a positive r_gpcd for each water system id
# (rows zero-filled by the backfill step are skipped).
latest_df = (
    sort_district_df[sort_district_df.r_gpcd > 0]
    .sort_values("reporting_month")
    .groupby("public_water_system_id")
    .tail(1)
)

### Export

In [219]:
# Write the last-twelve-rows-per-system table, without the index column.
last_twelve_months_df.to_csv("../data/processed/district-level-residential-use.csv", index=False)

In [220]:
# Write the latest positive-usage row per system, without the index column.
latest_df.to_csv("../data/processed/latest-district-level-residential-use.csv", index=False)

In [221]:
# Write the full regional timeseries, without the index column.
sort_region_df.to_csv("../data/processed/regional-residential-usage.csv", index=False)

In [222]:
# Regional rows whose month equals the latest month present in the district table.
region_is_latest = (
    sort_region_df["reporting_month"] == sort_district_df["reporting_month"].max()
)
sort_region_df[region_is_latest].to_csv(
    "../data/processed/latest-regional-residential-use.csv",
    index=False,
)

In [223]:
# Write the full statewide timeseries, without the index column.
sort_state_df.to_csv("../data/processed/statewide-residential-usage.csv", index=False)

In [224]:
# Statewide rows whose month equals the latest month present in the district table.
state_is_latest = (
    sort_state_df["reporting_month"] == sort_district_df["reporting_month"].max()
)
sort_state_df[state_is_latest].to_csv(
    "../data/processed/latest-statewide-level-residential-use.csv",
    index=False,
)