In [1]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [2]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [45]:
df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])

In [46]:
clean_names = pd.read_csv("../data/metadata/urban-water-suppliers-clean-names.csv")

### Clean

Remove junk from column names

In [47]:
df.columns = df.columns.str.replace(" ","_").str.replace("-","_").str.replace(r'[#,@,&,(,)]', '', regex=True).str.lower()

Eliminate double spaces in supplier names

In [48]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

In [49]:
df['supplier_name'] = df['supplier_name'].str.strip()

### Merge clean names

In [50]:
merge_names_df = pd.merge(
    df,
    clean_names[["id","display_name"]],
    how="left",
    left_on=["public_water_system_id"],
    right_on=["id"]
)

In [51]:
merge_names_df[merge_names_df.display_name.isnull()].supplier_name.unique()

array([], dtype=object)

### Trim

Trim this down to just the columns we need

In [52]:
keeps = [
    'supplier_name', 
    'display_name',
    'public_water_system_id', 
    'reporting_month', 
    'county',
    'hydrologic_region', 
    # 'climate_zone', 
    'total_population_served',
    ### MAYBE PILE
        # 'county_under_drought_declaration',
        # 'water_shortage_contingency_stage_invoked',
        # 'water_shortage_level_indicator',
        'dwr_state_standard_level_corresponding_to_stage',
    ### NEED THIS IF YOU WANT TO RECALCULATE R-GPCD BY DISTRICT
        # 'water_production_units',
        # 'reported_preliminary_total_potable_water_production',
        # 'reported_final_total_potable_water_production',
        # 'preliminary_percent_residential_use', 
        # 'final_percent_residential_use',
    ### THESE COLUMNS ARE OPTIONAL FOR SUPPLIERS TO FILL OUT
        # 'reported_preliminary_commercial_agricultural_water',
        # 'reported_final_commercial_agricultural_water',
        # 'reported_preliminary_commercial,_industrial_and_institutional_water',
        # 'reported_final_commercial_industrial_and_institutional_water',
        # 'reported_recycled_water', 
        # 'reported_non_revenue_water',
    'calculated_total_potable_water_production_gallons_ag_excluded',
    # 'calculated_total_potable_water_production_gallons_2013_ag_excluded',
    # 'calculated_commercial_agricultural_water_gallons',
    # 'calculated_commercial_agricultural_water_gallons_2013',
    'calculated_r_gpcd', 
    # 'qualification'
]

In [11]:
trim_df = merge_names_df[keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions

In [12]:
def regional_calcs(df, gals, rgpcd, pop):
    val = df[rgpcd]
    wt = df[pop]
    wt_avg = (val * wt).sum() / wt.sum()
    #return (val * wt).sum() / wt.sum()
    total_gals = df[gals].sum()
    total_pop = df[pop].sum()
    return pd.Series([total_pop, total_gals, wt_avg], index=['total_pop', 'total_gallons', 'pop_weighted_rgpcd'])

In [13]:
region_df = trim_df.groupby(
    ['reporting_month','hydrologic_region']
).apply(
    regional_calcs,
    "calculated_total_potable_water_production_gallons_ag_excluded",
    'calculated_r_gpcd', 
    'total_population_served'
).reset_index()

Now do it for the entire state

In [14]:
statewide_df = trim_df.groupby(
    ['reporting_month']
).apply(
    regional_calcs,
    "calculated_total_potable_water_production_gallons_ag_excluded",
    'calculated_r_gpcd', 
    'total_population_served'
).reset_index()

In [15]:
# in case we need to recalculate r-gpcd, use this dict for days per month
# days_per_month = {
#     "1": 31,
#     "2": 28,
#     "3": 31,
#     "4": 30,
#     "5": 31,
#     "6": 30,
#     "7": 31,
#     "8": 31,
#     "9": 30,
#     "10": 31,
#     "11": 30,
#     "12": 31
# }

### Remove duplicates

In [42]:
tmp = trim_df.set_index(['supplier_name', 'reporting_month'])

In [43]:
tmp1 = tmp[~tmp.index.duplicated()].reset_index()

In [44]:
tmp1.reset_index()

Unnamed: 0,index,supplier_name,reporting_month,display_name,public_water_system_id,county,hydrologic_region,total_population_served,dwr_state_standard_level_corresponding_to_stage,calculated_total_potable_water_production_gallons_ag_excluded,calculated_r_gpcd
0,0,East Bay Municipal Utilities District,2022-07-15,East Bay Municipal Utilities District,CA0110005,"Alameda,Contra Costa",San Francisco Bay,1430000,5.0,5.447000e+09,76.1818
1,1,East Bay Municipal Utilities District,2022-06-15,East Bay Municipal Utilities District,CA0110005,"Alameda,Contra Costa",San Francisco Bay,1430000,5.0,5.300000e+09,75.3613
2,2,East Bay Municipal Utilities District,2022-05-15,East Bay Municipal Utilities District,CA0110005,"Alameda,Contra Costa",San Francisco Bay,1430000,5.0,5.118000e+09,66.9623
3,3,East Bay Municipal Utilities District,2022-04-15,East Bay Municipal Utilities District,CA0110005,"Alameda,Contra Costa",San Francisco Bay,1430000,5.0,4.368000e+09,62.1091
4,4,East Bay Municipal Utilities District,2022-03-15,East Bay Municipal Utilities District,CA0110005,"Alameda,Contra Costa",San Francisco Bay,1430000,5.0,4.481000e+09,58.6280
...,...,...,...,...,...,...,...,...,...,...,...
38578,38578,Santa Clarita Valley Water Agency,2014-10-15,Santa Clarita Valley Water Agency,"CA1910255,CA1910247,CA1910096,CA1910250,CA1910...",Los Angeles,South Coast,281937,,1.859661e+09,126.1590
38579,38579,Santa Clarita Valley Water Agency,2014-09-15,Santa Clarita Valley Water Agency,"CA1910255,CA1910247,CA1910096,CA1910250,CA1910...",Los Angeles,South Coast,281937,,2.059538e+09,141.1990
38580,38580,Santa Clarita Valley Water Agency,2014-08-15,Santa Clarita Valley Water Agency,"CA1910255,CA1910247,CA1910096,CA1910250,CA1910...",Los Angeles,South Coast,281087,,2.331438e+09,149.3480
38581,38581,Santa Clarita Valley Water Agency,2014-07-15,Santa Clarita Valley Water Agency,"CA1910255,CA1910247,CA1910096,CA1910250,CA1910...",Los Angeles,South Coast,281087,,2.557028e+09,169.7540


### Backfill missing dates

In [16]:
min_date = trim_df.reporting_month.min()
min_date

Timestamp('2014-06-15 00:00:00')

In [17]:
max_date = trim_df.reporting_month.max()
max_date

Timestamp('2022-07-15 00:00:00')

In [18]:
def backfill(agency_group):
    """
    Backfills empty dates in the provided county group.

    Runs from the earliest date in the group to the latest.

    Filled in dates are given the previous day's case count with an `ffill` technique.

    The expanded group is returned.
    """
    agency_df = agency_group.sort_values(["supplier_name", "reporting_month"]).set_index(
        ["supplier_name", "reporting_month"]
    )

    # Backfill the daterange
    ## Get the full range of values from the extent of dates in the dataframe
    date_range = pd.date_range(
        min_date,
        max_date,
        freq=pd.DateOffset(months=1, day=15),
    )
    ## Get the full range of unique place names
    name_range = agency_df.index.unique(level="supplier_name")
    ## Create a new index that has an entry for every place on every date
    namedate_index = pd.MultiIndex.from_product(
        iterables=[name_range, date_range], names=["supplier_name", "reporting_month"]
    )
    ## Reindex the dataframe using that complete list of places and dates
    backfilled_df = agency_df.reindex(namedate_index)

    # Zero out missing data
    backfilled_df.calculated_r_gpcd.fillna(0, inplace=True)
    backfilled_df.calculated_total_potable_water_production_gallons_ag_excluded.fillna(0, inplace=True)

    # Foward-fill the other remaining columns
    backfilled_df = backfilled_df.groupby("supplier_name").ffill()

    # Reset it
    reset_df = backfilled_df.reset_index()

    # Return it
    return reset_df

In [34]:
backfilled_df = (
    remove_duplicates.groupby("supplier_name").apply(backfill).reset_index(drop=True)
)

ValueError: cannot handle a non-unique multi-index!

### Merge regional r-gpcd values to district df

In [None]:
merge_regions_df = pd.merge(
    backfilled_df, 
    region_df[["hydrologic_region","reporting_month","pop_weighted_rgpcd"]], 
    how="left", 
    on=["hydrologic_region","reporting_month"]
)

### Round water use figures to save space

In [None]:
merge_regions_df["calculated_total_potable_water_production_gallons_ag_excluded"] = merge_regions_df["calculated_total_potable_water_production_gallons_ag_excluded"].round(0)

In [None]:
merge_regions_df["calculated_r_gpcd"] = merge_regions_df["calculated_r_gpcd"].round(1)

In [None]:
merge_regions_df["pop_weighted_rgpcd"] = merge_regions_df["pop_weighted_rgpcd"].round(1)

### Rename columns for clarity and brevity

In [None]:
rename_df = merge_regions_df.rename(columns={
    "public_water_system_id": "pwsid",
    "total_population_served": "population",
    "dwr_state_standard_level_corresponding_to_stage": "dwr_stage",
    "calculated_total_potable_water_production_gallons_ag_excluded": "total_water_production",
    "calculated_r_gpcd": "r_gpcd",
    "pop_weighted_rgpcd": "regional_r_gpcd"
})

### Chart

In [None]:
melt = pd.melt(
    rename_df, 
    id_vars=["display_name","hydrologic_region","reporting_month"], 
    value_vars=["r_gpcd","regional_r_gpcd"]
)

In [None]:
agency_name = "City of Pismo Beach"

base = alt.Chart(
    rename_df[
        (rename_df.display_name == agency_name)
    ].tail(12)
).encode(
    x=alt.X("yearmonth(reporting_month):O"),
    tooltip=["reporting_month"]
)

bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None)
)

avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("regional_r_gpcd")
)

goal_line = alt.Chart(pd.DataFrame({'y': [80]})).mark_rule(color="#b75a36",strokeDash=[10,11]).encode(y='y')

(bar + avg_line + goal_line).properties(title=f"{agency_name} residential water usage compared to regional average", width=600)

In [None]:
rename_df[rename_df.supplier_name.isin([ "Las Virgenes Municipal Water District", "Los Angeles Department of Water and Power" ])].to_csv("../data/processed/lv-ladwp.csv", index=False)

### Sort data

In [None]:
sort_district_df = rename_df.sort_values(["reporting_month","supplier_name"])

In [None]:
sort_region_df = region_df.sort_values(["reporting_month","hydrologic_region"])

In [None]:
sort_state_df = statewide_df.sort_values(["reporting_month"])

### Filter dataframe to be a bit more manageable

In [None]:
#filtered_district_df = sort_district_df[sort_district_df.reporting_month >= "2021-01-15"]

In [None]:
last_twelve_months_df = sort_district_df.sort_values('reporting_month').groupby('pwsid').tail(12)

In [None]:
latest_df = sort_district_df[
    (sort_district_df.r_gpcd > 0)
].sort_values('reporting_month').groupby('pwsid').tail(1)

### Export

In [None]:
last_twelve_months_df.to_csv("../data/processed/district-level-residential-use.csv", index=False)

In [None]:
latest_df.to_csv("../data/processed/latest-district-level-residential-use.csv", index=False)

In [None]:
sort_region_df.to_csv("../data/processed/regional-residential-usage.csv", index=False)

In [None]:
sort_region_df[
    sort_region_df.reporting_month == sort_district_df.reporting_month.max()
].to_csv("../data/processed/latest-regional-residential-use.csv", index=False)

In [None]:
sort_state_df.to_csv("../data/processed/statewide-residential-usage.csv", index=False)

In [None]:
sort_state_df[
    sort_state_df.reporting_month == sort_district_df.reporting_month.max()
].to_csv("../data/processed/latest-statewide-level-residential-use.csv", index=False)