In [2]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [3]:
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [180]:
# df = pd.read_csv("../data/raw/uw-usage.csv", parse_dates=["Reporting Month"])
df = pd.read_csv(
    "../../downloaders/use-conservation/data/latest.csv", 
    parse_dates=["report_period_start_date", "report_period_end_date"]
)

In [181]:
clean_names = pd.read_csv("../../data/metadata/urban-water-suppliers-clean-names.csv")

### Clean

Remove junk from column names

In [182]:
df.columns = df.columns.str.strip(' ').str.replace("-","_")

Eliminate double spaces in supplier names

In [183]:
df.supplier_name = df.supplier_name.str.replace("  ", " ")

In [184]:
df['supplier_name'] = df['supplier_name'].str.strip()

### Merge clean names

In [185]:
merge_names_df = pd.merge(
    df,
    clean_names[["id","display_name"]],
    how="left",
    left_on=["water_system_id"],
    right_on=["id"]
)

In [186]:
merge_names_df[merge_names_df.display_name.isnull()].supplier_name.unique()

array(['Amador Water Agency', 'Apple Valley Ranchos Water Company',
       'City of Arcata', 'Bear Valley Community Services District',
       'City of Big Bear Lake', 'City of Blythe',
       'California American Water Company - Los Angeles Division',
       'California American Water Company - Monterey District',
       'California American Water Company - Sacramento District',
       'California Water Service Company Kern River Valley',
       'California Water Service Company Salinas District',
       'California Water Service Company Visalia',
       'Casitas Municipal Water District',
       'Coachella Valley Water District', 'Crescent City',
       'El Dorado Irrigation District',
       'Elsinore Valley Municipal Water District', 'City of Lindsay',
       'Los Angeles County Waterworks District 40 - Antelope Valley',
       'City of Modesto', 'Nevada Irrigation District',
       'North Tahoe Public Utilities District',
       'Olivehurst Public Utilities District',
       'Libe

In [187]:
merge_names_df.loc[merge_names_df.display_name.isnull(), "display_name"] = merge_names_df["supplier_name"]

### Trim

Remove flagged `r-gpcd` values

In [188]:
remove_flagged = merge_names_df[merge_names_df.res_flag != 'Flagged']

Trim this down to just the columns we need

In [189]:
keeps = [
    'supplier_name', 
    'display_name',
    'water_system_id', 
    'report_period_start_date', 
    'report_period_end_date',
    'county',
    'hydro_region', 
    # 'climate_zone', 
    'pop_report_period',
    'potable_supply_minus_sold_minus_ag_gal',
    'potable_supply_minus_sold_minus_ag_gal_flag',
    'r_gpcd', 
    'res_flag'
]

In [190]:
trim_df = remove_flagged[keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions

In [191]:
def regional_calcs(df, gals, rgpcd, pop):
    val = df[rgpcd]
    wt = df[pop]
    wt_avg = (val * wt).sum() / wt.sum()
    #return (val * wt).sum() / wt.sum()
    total_gals = df[gals].sum()
    total_pop = df[pop].sum()
    return pd.Series([total_pop, total_gals, wt_avg], index=['total_pop', 'total_gallons', 'pop_weighted_rgpcd'])

In [192]:
region_df = trim_df.groupby(
    ['report_period_start_date','hydro_region']
).apply(
    regional_calcs,
    "potable_supply_minus_sold_minus_ag_gal",
    'r_gpcd', 
    'pop_report_period'
).reset_index()

  region_df = trim_df.groupby(


Now do it for the entire state

In [193]:
statewide_df = trim_df.groupby(
    ['report_period_start_date']
).apply(
    regional_calcs,
    "potable_supply_minus_sold_minus_ag_gal",
    'r_gpcd', 
    'pop_report_period'
).reset_index()

  statewide_df = trim_df.groupby(


In [194]:
# in case we need to recalculate r-gpcd, use this dict for days per month
# days_per_month = {
#     "1": 31,
#     "2": 28,
#     "3": 31,
#     "4": 30,
#     "5": 31,
#     "6": 30,
#     "7": 31,
#     "8": 31,
#     "9": 30,
#     "10": 31,
#     "11": 30,
#     "12": 31
# }

### Remove duplicates

In [195]:
len(trim_df)

45875

In [196]:
tmp = trim_df.set_index(['supplier_name', 'report_period_start_date'])

In [197]:
remove_duplicates = tmp[~tmp.index.duplicated()].reset_index().copy()

In [198]:
len(remove_duplicates)

45875

### Backfill missing dates

In [199]:
min_date = trim_df.report_period_start_date.min()
min_date

Timestamp('2014-06-01 00:00:00')

In [200]:
max_date = trim_df.report_period_start_date.max()
max_date

Timestamp('2024-04-01 00:00:00')

In [201]:
def backfill(agency_group):
    """
    Backfills empty dates in the provided county group.

    Runs from the earliest date in the group to the latest.

    Filled in dates are given the previous day's case count with an `ffill` technique.

    The expanded group is returned.
    """
    agency_df = agency_group.sort_values(["supplier_name", "report_period_start_date"]).set_index(
        ["supplier_name", "report_period_start_date"]
    )

    # Backfill the daterange
    ## Get the full range of values from the extent of dates in the dataframe
    date_range = pd.date_range(
        min_date,
        max_date,
        freq=pd.DateOffset(months=1, day=1),
    )
    ## Get the full range of unique place names
    name_range = agency_df.index.unique(level="supplier_name")
    ## Create a new index that has an entry for every place on every date
    namedate_index = pd.MultiIndex.from_product(
        iterables=[name_range, date_range], names=["supplier_name", "report_period_start_date"]
    )
    ## Reindex the dataframe using that complete list of places and dates
    backfilled_df = agency_df.reindex(namedate_index)

    # Zero out missing data
    # backfilled_df.r_gpcd.fillna(0, inplace=True)
    backfilled_df.fillna({'r_gpcd': 0}, inplace=True)
    # backfilled_df.potable_supply_minus_sold_minus_ag_gal.fillna(0, inplace=True)
    backfilled_df.fillna({'potable_supply_minus_sold_minus_ag_gal': 0}, inplace=True)

    # Foward-fill the other remaining columns
    backfilled_df = backfilled_df.groupby("supplier_name").ffill()

    # Reset it
    reset_df = backfilled_df.reset_index()

    # Return it
    return reset_df

In [202]:
backfilled_df = (
    remove_duplicates.groupby("supplier_name").apply(backfill).reset_index(drop=True)
)

  remove_duplicates.groupby("supplier_name").apply(backfill).reset_index(drop=True)


### Merge regional r-gpcd values to district df

In [203]:
merge_regions_df = pd.merge(
    remove_duplicates, 
    region_df[["hydro_region","report_period_start_date","pop_weighted_rgpcd"]], 
    how="left", 
    on=["hydro_region","report_period_start_date"]
)

### Round water use figures to save space

In [204]:
merge_regions_df["potable_supply_minus_sold_minus_ag_gal"] = merge_regions_df["potable_supply_minus_sold_minus_ag_gal"].round(0)

In [205]:
merge_regions_df["r_gpcd"] = merge_regions_df["r_gpcd"].round(1)

In [206]:
merge_regions_df["pop_weighted_rgpcd"] = merge_regions_df["pop_weighted_rgpcd"].round(1)

### Rename columns for clarity and brevity

In [216]:
rename_df = merge_regions_df.rename(columns={
    "water_system_id": "pwsid",
    "report_period_start_date": "reporting_month",
    "pop_report_period": "population",
    "dwr_standard_level": "dwr_stage",
    "potable_supply_minus_sold_minus_ag_gal": "total_water_production",
    "r_gpcd": "r_gpcd",
    "pop_weighted_rgpcd": "regional_r_gpcd"
})

### Chart

In [217]:
melt = pd.melt(
    rename_df, 
    id_vars=["display_name","hydro_region","reporting_month"], 
    value_vars=["r_gpcd","regional_r_gpcd"]
)

In [218]:
melt[(melt.hydro_region == 'San Francisco Bay') & (melt.display_name.str.contains('East'))]

Unnamed: 0,display_name,hydro_region,reporting_month,variable,value
11398,East Bay Municipal Utilities District,San Francisco Bay,2024-03-01,r_gpcd,48.2
11399,East Bay Municipal Utilities District,San Francisco Bay,2023-12-01,r_gpcd,51.5
11400,East Bay Municipal Utilities District,San Francisco Bay,2023-11-01,r_gpcd,57.8
11401,East Bay Municipal Utilities District,San Francisco Bay,2023-10-01,r_gpcd,65.9
11402,East Bay Municipal Utilities District,San Francisco Bay,2023-09-01,r_gpcd,72.8
...,...,...,...,...,...
57587,City of East Palo Alto,San Francisco Bay,2014-10-01,regional_r_gpcd,75.6
57588,City of East Palo Alto,San Francisco Bay,2014-09-01,regional_r_gpcd,83.8
57589,City of East Palo Alto,San Francisco Bay,2014-08-01,regional_r_gpcd,90.6
57590,City of East Palo Alto,San Francisco Bay,2014-07-01,regional_r_gpcd,94.6


In [219]:
# agency_name = "Los Angeles Department of Water and Power"
agency_name = "East Bay Municipal Utilities District"

base = alt.Chart(
    rename_df[
        (rename_df.display_name == agency_name)
    ].head(12)
).encode(
    x=alt.X("yearmonth(reporting_month):O").axis(title=""),
    tooltip=["r_gpcd","reporting_month"]
)

bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None).axis(title="Residential gallons per capita per day"),
    text="r_gpcd"
)

avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("regional_r_gpcd"),
    text="regional_r_gpcd"
)

# goal_line = alt.Chart(pd.DataFrame({'y': [80]})).mark_rule(color="#b75a36",strokeDash=[10,11]).encode(y='y')

(
    bar + 
    avg_line + 
    bar.mark_text(align='center', dy=-7) +
    avg_line.mark_text(align='center', dy=-7)
).properties(title=f"{agency_name} residential water usage compared to regional average", width=600)

### Sort data

In [220]:
sort_district_df = rename_df.sort_values(["reporting_month","supplier_name"])

In [222]:
sort_region_df = region_df.rename(columns={"report_period_start_date":"reporting_month"}).sort_values(["reporting_month","hydro_region"])

In [224]:
sort_state_df = statewide_df.rename(columns={"report_period_start_date":"reporting_month"}).sort_values(["reporting_month"])

### Filter dataframe to be a bit more manageable

In [225]:
#filtered_district_df = sort_district_df[sort_district_df.reporting_month >= "2021-01-15"]

In [226]:
last_twelve_months_df = sort_district_df.sort_values('reporting_month').groupby('pwsid').tail(12)

In [227]:
latest_df = sort_district_df[
    (sort_district_df.r_gpcd > 0)
].sort_values('reporting_month').groupby('pwsid').tail(1)

### Export

In [228]:
last_twelve_months_df.to_csv("../data/processed/district-level-residential-use.csv", index=False)

OSError: Cannot save file into a non-existent directory: '../data/processed'

In [37]:
latest_df.to_csv("../data/processed/latest-district-level-residential-use.csv", index=False)

In [38]:
sort_region_df.to_csv("../data/processed/regional-residential-usage.csv", index=False)

In [39]:
sort_region_df[
    sort_region_df.reporting_month == sort_district_df.reporting_month.max()
].to_csv("../data/processed/latest-regional-residential-use.csv", index=False)

In [40]:
sort_state_df.to_csv("../data/processed/statewide-residential-usage.csv", index=False)

In [41]:
sort_state_df[
    sort_state_df.reporting_month == sort_district_df.reporting_month.max()
].to_csv("../data/processed/latest-statewide-level-residential-use.csv", index=False)