In [175]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [176]:
# Make the LA Times chart theme available to Altair, then switch it on.
# The enable() call stays last so the cell still echoes its return value.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [177]:
# Load the raw monthly supplier reports; "Reporting Month" is parsed as a datetime column.
df = pd.read_csv(
    "../data/raw/uw-usage.csv",
    parse_dates=["Reporting Month"],
)

### Clean

Remove junk from column names

In [178]:
# Normalize the headers: spaces and hyphens become underscores, the characters
# in the bracketed class are dropped, and everything is lower-cased.
# Note the commas inside the character class are literal, so commas in a
# header are stripped as well.
df.columns = (
    df.columns
    .str.replace(" ","_")
    .str.replace("-","_")
    .str.replace(r'[#,@,&,(,)]', '', regex=True)
    .str.lower()
)

Eliminate double spaces in supplier names

In [179]:
# Collapse every run of two or more consecutive spaces into a single space.
# A literal "  " -> " " replacement would turn three spaces into two, leaving
# a double space behind; the explicit regex also avoids depending on the
# pandas-version-specific default of `regex`.
df["supplier_name"] = df["supplier_name"].str.replace(r" {2,}", " ", regex=True)

Trim this down to just the columns we need

In [180]:
# Columns carried into the trimmed frame. Commented-out names are columns that
# are currently left out, grouped by the reason noted above each group.
keeps = [
    # --- supplier identity, reporting period and geography ---
    "supplier_name",
    "public_water_system_id",
    "reporting_month",
    "county",
    "hydrologic_region",
    "climate_zone",
    "total_population_served",

    # --- maybe pile ---
    # "county_under_drought_declaration",
    # "water_shortage_contingency_stage_invoked",
    # "water_shortage_level_indicator",
    # "dwr_state_standard_level_corresponding_to_stage",

    # --- needed if you want to recalculate r-gpcd by district ---
    # "water_production_units",
    # "reported_preliminary_total_potable_water_production",
    # "reported_final_total_potable_water_production",
    # "preliminary_percent_residential_use",
    # "final_percent_residential_use",

    # --- optional for suppliers to fill out ---
    # "reported_preliminary_commercial_agricultural_water",
    # "reported_final_commercial_agricultural_water",
    # "reported_preliminary_commercial,_industrial_and_institutional_water",
    # "reported_final_commercial_industrial_and_institutional_water",
    # "reported_recycled_water",
    # "reported_non_revenue_water",

    # --- calculated fields ---
    "calculated_total_potable_water_production_gallons_ag_excluded",
    # "calculated_total_potable_water_production_gallons_2013_ag_excluded",
    # "calculated_commercial_agricultural_water_gallons",
    # "calculated_commercial_agricultural_water_gallons_2013",
    "calculated_r_gpcd",
    # "qualification",
]

In [181]:
# Keep every row, but only the columns named in `keeps`.
trim_df = df.loc[:, keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions

In [182]:
def regional_calcs(df, gals, rgpcd, pop):
    """Summarize one group of rows into totals and a weighted average.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of a single group (this is what ``groupby(...).apply`` passes in).
    gals : str
        Name of the column that is summed into ``total_gallons``.
    rgpcd : str
        Name of the column holding the per-row value to be averaged.
    pop : str
        Name of the column used as the averaging weight and summed into
        ``total_pop``.

    Returns
    -------
    pandas.Series
        Three values indexed by ``total_pop``, ``total_gallons`` and
        ``pop_weighted_rgpcd``. The weighted average is NaN when no row has
        both a value and a weight, or when the usable weights sum to zero.
    """
    val = df[rgpcd]
    wt = df[pop]

    # Series.sum() skips NaN. Without this mask, a row with a missing value
    # drops out of the numerator while its weight still counts in the
    # denominator, which drags the average down. Use the same rows on both sides.
    usable = val.notna() & wt.notna()
    weight_total = wt[usable].sum()
    if weight_total:
        wt_avg = (val[usable] * wt[usable]).sum() / weight_total
    else:
        wt_avg = float("nan")

    # The totals still cover every row in the group, as before.
    total_gals = df[gals].sum()
    total_pop = wt.sum()
    return pd.Series(
        [total_pop, total_gals, wt_avg],
        index=['total_pop', 'total_gallons', 'pop_weighted_rgpcd'],
    )

In [183]:
# One output row per (reporting month, hydrologic region): regional_calcs is
# applied to each group and the group keys are turned back into columns.
region_groups = trim_df.groupby(['reporting_month','hydrologic_region'])
region_df = region_groups.apply(
    regional_calcs,
    gals="calculated_total_potable_water_production_gallons_ag_excluded",
    rgpcd='calculated_r_gpcd',
    pop='total_population_served',
).reset_index()

Now do it for the entire state

In [184]:
# Same aggregation as the regional table, grouped by reporting month only.
month_groups = trim_df.groupby(['reporting_month'])
statewide_df = month_groups.apply(
    regional_calcs,
    gals="calculated_total_potable_water_production_gallons_ag_excluded",
    rgpcd='calculated_r_gpcd',
    pop='total_population_served',
).reset_index()

In [185]:
# in case we need to recalculate r-gpcd, use this dict for days per month
# (note: February is fixed at 28 days here, so leap years would need separate handling)
# days_per_month = {
#     "1": 31,
#     "2": 28,
#     "3": 31,
#     "4": 30,
#     "5": 31,
#     "6": 30,
#     "7": 31,
#     "8": 31,
#     "9": 30,
#     "10": 31,
#     "11": 30,
#     "12": 31
# }

### Merge regional r-gpcd values to district df

In [186]:
# Attach each region's population-weighted r-gpcd to every district row for
# the same region and month; the left join keeps all district rows.
regional_rgpcd = region_df[["hydrologic_region","reporting_month","pop_weighted_rgpcd"]]
merge_df = trim_df.merge(
    regional_rgpcd,
    on=["hydrologic_region","reporting_month"],
    how="left",
)

### Rename r-gpcd columns for clarity

In [187]:
# Shorter names for the district-level and region-level r-gpcd columns.
rgpcd_column_names = {
    "calculated_r_gpcd": "r_gpcd",
    "pop_weighted_rgpcd": "regional_r_gpcd",
}
rename_df = merge_df.rename(columns=rgpcd_column_names)

### Chart

In [188]:
# Long format: one row per supplier/region/month and per r-gpcd measure.
melt = rename_df.melt(
    id_vars=["supplier_name","hydrologic_region","reporting_month"],
    value_vars=["r_gpcd","regional_r_gpcd"],
)

In [189]:
# Rows for one supplier after the cutoff date. Both halves of the mask are
# built from rename_df itself; mixing in a mask from another frame only works
# while the two frames happen to share an identical index.
ladwp_recent = rename_df[
    (rename_df.supplier_name == "Los Angeles Department of Water and Power") &
    (rename_df.reporting_month > "2021-04-01")
]

# Shared x encoding: reporting month as an ordinal year-month axis.
base = alt.Chart(ladwp_recent).encode(
    x=alt.X("yearmonth(reporting_month):O"),
)

# Bars: the supplier's own r-gpcd for each month.
bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None)
)

# Stepped line: the regional population-weighted r-gpcd for the same months.
avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("regional_r_gpcd")
)

# Dashed horizontal rule at y = 80.
# NOTE(review): presumably a per-capita usage target; the source of the value
# is not stated in this notebook, so confirm it.
goal_line = alt.Chart(pd.DataFrame({'y': [80]})).mark_rule(color="#b75a36",strokeDash=[10,11]).encode(y='y')

# Layer the three marks; this stays the last expression so the chart renders.
(bar + avg_line + goal_line).properties(title="LADWP residential water usage compared to regional average", width=600)

### Sort data

In [190]:
# Order district rows by month, then supplier.
# NOTE(review): this sorts merge_df, so the exported columns keep their
# pre-rename names (calculated_r_gpcd / pop_weighted_rgpcd).
sort_district_df = merge_df.sort_values(by=["reporting_month","supplier_name"])

In [191]:
# Order regional rows by month, then hydrologic region.
sort_region_df = region_df.sort_values(by=["reporting_month","hydrologic_region"])

In [192]:
# Order statewide rows chronologically.
sort_state_df = statewide_df.sort_values(by=["reporting_month"])

### Export

In [193]:
# Write the sorted district-level table to disk without the index column.
sort_district_df.to_csv(
    "../data/processed/district-level-residential-use.csv",
    index=False,
)

In [194]:
# Write the sorted regional table to disk without the index column.
sort_region_df.to_csv(
    "../data/processed/regional-residential-usage.csv",
    index=False,
)

In [195]:
# Write the sorted statewide table to disk without the index column.
sort_state_df.to_csv(
    "../data/processed/statewide-residential-usage.csv",
    index=False,
)