In [342]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [343]:
# Register the Los Angeles Times theme under one name, then make it the active Altair theme.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [344]:
# Both reporting-period boundaries are parsed as datetimes at load time.
date_columns = ["report_period_start_date", "report_period_end_date"]
df = pd.read_csv("data/latest.csv", parse_dates=date_columns)

In [345]:
# Supplier metadata keyed by org_id (coordinates and main county are pulled from it below).
crosswalk = pd.read_csv(filepath_or_buffer="data/metadata/crosswalk.csv")

### Clean

Strip leading and trailing spaces from column names and replace hyphens with underscores

In [346]:
# Trim surrounding spaces and swap hyphens for underscores in every column name.
df.columns = [column.strip(' ').replace("-", "_") for column in df.columns]

Eliminate double spaces in supplier names

In [347]:
# Collapse every run of two or more spaces to a single space. A literal
# "  " -> " " replacement is a single pass, so a run of three or more spaces
# would still contain a double space afterwards.
df["supplier_name"] = df["supplier_name"].str.replace(r" {2,}", " ", regex=True)

In [348]:
# Drop leading/trailing whitespace from supplier names.
df['supplier_name'] = df.supplier_name.str.strip()

In [349]:
# Normalize county names to Title Case.
df['county'] = df['county'].str.title()

### Merge clean names

In [350]:
# Left join keeps every report row; crosswalk contributes coordinates and the main county.
crosswalk_columns = ['org_id', 'longitude', 'latitude', 'main_county']
merge_names_df = df.merge(crosswalk[crosswalk_columns], how="left", on="org_id")

### Trim

Remove rows whose `r_gpcd` value is marked `Flagged` in `res_flag`

In [400]:
# Keep every row whose res_flag is anything other than 'Flagged' (missing flags are kept).
remove_flagged = merge_names_df.loc[merge_names_df["res_flag"].ne('Flagged')]

Trim this down to just the columns we need

In [403]:
# Columns carried forward, in output order.
# 'display_name' and 'climate_zone' are intentionally left out.
keeps = [
    # identifiers
    "org_id",
    "supplier_name",
    "water_system_id",
    # reporting period
    "report_period_start_date",
    "report_period_end_date",
    # geography
    "main_county",
    "hydro_region",
    # population and water use
    "pop_report_period",
    "potable_supply_minus_sold_minus_ag_gal",
    "potable_supply_minus_sold_minus_ag_gal_flag",
    "r_gpcd",
    "res_flag",
    # coordinates
    "latitude",
    "longitude",
]

In [405]:
# Select only the columns listed in `keeps`, in that order.
trim_df = remove_flagged.loc[:, keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions, counties and the state

In [406]:
def regional_calcs(df, gals, rgpcd, pop, geography):
    """
    Summarize one group of supplier reports.

    Parameters
    ----------
    df : DataFrame
        The rows belonging to one group (as passed by ``groupby.apply``).
    gals : str
        Name of the column holding gallons produced.
    rgpcd : str
        Name of the column holding residential gallons per capita per day.
    pop : str
        Name of the population column; used as the averaging weight.
    geography : str
        Prefix for the weighted-average label in the result.

    Returns
    -------
    pd.Series
        ``total_pop`` (sum of ``pop``), ``total_gallons`` (sum of ``gals``) and
        ``{geography}_pop_weighted_rgpcd`` (population-weighted mean of ``rgpcd``).
    """
    val = df[rgpcd]
    wt = df[pop]

    # Only rows that have BOTH a value and a weight may contribute to the
    # average. Otherwise a supplier with a population but no r-gpcd inflates
    # the denominator and drags the weighted mean down.
    usable = val.notna() & wt.notna()
    weight_sum = wt[usable].sum()
    if weight_sum == 0:
        # Nothing to average over (empty group or zero population).
        wt_avg = float("nan")
    else:
        wt_avg = (val[usable] * wt[usable]).sum() / weight_sum

    # Totals still cover every row in the group.
    total_gals = df[gals].sum()
    total_pop = wt.sum()
    return pd.Series(
        [total_pop, total_gals, wt_avg],
        index=['total_pop', 'total_gallons', f'{geography}_pop_weighted_rgpcd'],
    )

In [407]:
# One summary row per (month, hydrologic region).
region_groups = trim_df.groupby(['report_period_start_date', 'hydro_region'])
region_df = region_groups.apply(
    regional_calcs,
    gals="potable_supply_minus_sold_minus_ag_gal",
    rgpcd='r_gpcd',
    pop='pop_report_period',
    geography='region',
    include_groups=False,
).reset_index()

In [408]:
# One summary row per (month, main county).
county_groups = trim_df.groupby(['report_period_start_date', 'main_county'])
county_df = county_groups.apply(
    regional_calcs,
    gals="potable_supply_minus_sold_minus_ag_gal",
    rgpcd='r_gpcd',
    pop='pop_report_period',
    geography='county',
    include_groups=False,
).reset_index()

In [409]:
# How many counties have a row in the most recent month?
latest_county_month = county_df.report_period_start_date.max()
len(county_df.loc[county_df.report_period_start_date == latest_county_month])

46

In [410]:
# Number of distinct counties across all months.
county_df["main_county"].nunique(dropna=False)

48

In [411]:
# county_df[county_df.report_period_start_date == county_df.report_period_start_date.max()]\
#     .sort_values('pop_weighted_rgpcd', ascending=False)

Now do it for the entire state

In [412]:
# One summary row per month covering every supplier in the state.
month_groups = trim_df.groupby(['report_period_start_date'])
statewide_df = month_groups.apply(
    regional_calcs,
    gals="potable_supply_minus_sold_minus_ag_gal",
    rgpcd='r_gpcd',
    pop='pop_report_period',
    geography='state',
    include_groups=False,
).reset_index()

In [413]:
# Sanity check: largest population covered in any single month.
statewide_df["total_pop"].max()

37169224.0

### Remove duplicates

In [414]:
# Row count before de-duplication.
trim_df.shape[0]

45875

In [415]:
# Index by the (supplier, month) pair that should identify a report uniquely.
tmp = trim_df.set_index(keys=['supplier_name', 'report_period_start_date'])

In [416]:
# Keep the first row seen for each (supplier, month) pair and restore the flat layout.
is_repeat = tmp.index.duplicated(keep='first')
remove_duplicates = tmp.loc[~is_repeat].reset_index().copy()

In [417]:
# Row count after de-duplication.
remove_duplicates.shape[0]

45875

### Backfill missing dates

In [418]:
# Earliest reporting month in the data; read by `backfill` as the start of its date range.
min_date = trim_df["report_period_start_date"].min()
min_date

Timestamp('2014-06-01 00:00:00')

In [419]:
# Latest reporting month in the data; read by `backfill` as the end of its date range.
max_date = trim_df["report_period_start_date"].max()
max_date

Timestamp('2024-04-01 00:00:00')

In [420]:
def backfill(agency_group, start_date=None, end_date=None):
    """
    Expand supplier reports so every supplier has one row per reporting month.

    Parameters
    ----------
    agency_group : DataFrame
        Rows for one or more suppliers. Must contain ``supplier_name`` and
        ``report_period_start_date``, and each (supplier, month) pair must
        appear at most once.
    start_date, end_date : optional
        Bounds of the monthly range to fill. When omitted, the notebook-level
        ``min_date`` / ``max_date`` are used, so every supplier is expanded to
        the same overall range rather than to its own first and last report.

    Returns
    -------
    DataFrame
        One row per supplier per month, with ``supplier_name`` and
        ``report_period_start_date`` as the leading columns.

        * ``r_gpcd`` and ``potable_supply_minus_sold_minus_ag_gal`` are set to
          0 wherever they are missing.
        * Every other column is forward-filled within each supplier. Months
          before a supplier's first report have nothing to fill from and stay
          empty.
    """
    if start_date is None:
        start_date = min_date
    if end_date is None:
        end_date = max_date

    index_cols = ["supplier_name", "report_period_start_date"]
    agency_df = agency_group.sort_values(index_cols).set_index(index_cols)

    # Monthly steps from start to end; the offset pins each step to day 1.
    date_range = pd.date_range(
        start_date,
        end_date,
        freq=pd.DateOffset(months=1, day=1),
    )
    # Every supplier present in the input
    name_range = agency_df.index.unique(level="supplier_name")
    # An index entry for every supplier on every month
    namedate_index = pd.MultiIndex.from_product(
        iterables=[name_range, date_range], names=index_cols
    )
    # Reindexing inserts all-empty rows for the months a supplier did not report
    backfilled_df = agency_df.reindex(namedate_index)

    # Zero out missing usage figures. This happens before the forward fill so
    # a gap month never inherits the previous month's usage.
    backfilled_df = backfilled_df.fillna(
        {"r_gpcd": 0, "potable_supply_minus_sold_minus_ag_gal": 0}
    )

    # Forward-fill the remaining columns within each supplier
    backfilled_df = backfilled_df.groupby("supplier_name").ffill()

    # Return with supplier and month as ordinary columns again
    return backfilled_df.reset_index()

In [421]:
# `backfill` needs the `supplier_name` column inside each group. Selecting the
# columns explicitly keeps it there without relying on the deprecated
# behaviour of `groupby.apply` passing grouping columns implicitly.
backfilled_df = (
    remove_duplicates
    .groupby("supplier_name")[remove_duplicates.columns.tolist()]
    .apply(backfill)
    .reset_index(drop=True)
)

  remove_duplicates.groupby("supplier_name").apply(backfill).reset_index(drop=True)


In [422]:
# Row count after filling in the missing supplier-months.
backfilled_df.shape[0]

48195

### Merge regional r-gpcd values to district df

In [423]:
# Attach the regional average for each supplier's hydrologic region and month...
region_columns = ["hydro_region", "report_period_start_date", "region_pop_weighted_rgpcd"]
with_region_df = backfilled_df.merge(
    region_df[region_columns],
    how="left",
    on=["hydro_region", "report_period_start_date"],
)

# ...then the county average for its main county and month.
county_columns = ["main_county", "report_period_start_date", "county_pop_weighted_rgpcd"]
merge_regions_df = with_region_df.merge(
    county_df[county_columns],
    how="left",
    on=["main_county", "report_period_start_date"],
)

In [424]:
# Preview the first five merged rows.
merge_regions_df.head(n=5)

Unnamed: 0,supplier_name,report_period_start_date,org_id,water_system_id,report_period_end_date,main_county,hydro_region,pop_report_period,potable_supply_minus_sold_minus_ag_gal,potable_supply_minus_sold_minus_ag_gal_flag,r_gpcd,res_flag,latitude,longitude,region_pop_weighted_rgpcd,county_pop_weighted_rgpcd
0,Alameda County Water District,2014-06-01,,,NaT,,,,0.0,,0.0,,,,,
1,Alameda County Water District,2014-07-01,23.0,CA0110001,2014-07-31,Alameda,San Francisco Bay,337562.0,1401200000.0,,95.07,,37.538,-122.021,94.644776,91.559497
2,Alameda County Water District,2014-08-01,23.0,CA0110001,2014-08-31,Alameda,San Francisco Bay,340000.0,1326800000.0,,89.38,,37.538,-122.021,90.589083,88.912539
3,Alameda County Water District,2014-09-01,23.0,CA0110001,2014-09-30,Alameda,San Francisco Bay,340000.0,1221000000.0,,80.2,,37.538,-122.021,83.774729,82.626534
4,Alameda County Water District,2014-10-01,23.0,CA0110001,2014-10-31,Alameda,San Francisco Bay,340000.0,1143900000.0,,74.89,,37.538,-122.021,75.618738,73.910935


### Round water use figures to save space

In [425]:
# Whole gallons are precise enough for production totals.
merge_regions_df["potable_supply_minus_sold_minus_ag_gal"] = (
    merge_regions_df.potable_supply_minus_sold_minus_ag_gal.round(decimals=0)
)

In [426]:
# One decimal place for supplier-level r-gpcd.
merge_regions_df["r_gpcd"] = merge_regions_df.r_gpcd.round(decimals=1)

In [427]:
# One decimal place for the regional average.
merge_regions_df["region_pop_weighted_rgpcd"] = (
    merge_regions_df.region_pop_weighted_rgpcd.round(decimals=1)
)

In [428]:
# One decimal place for the county average.
merge_regions_df["county_pop_weighted_rgpcd"] = (
    merge_regions_df.county_pop_weighted_rgpcd.round(decimals=1)
)

In [429]:
# One decimal place for the statewide average.
statewide_df["state_pop_weighted_rgpcd"] = (
    statewide_df.state_pop_weighted_rgpcd.round(decimals=1)
)

### Rename columns

In [430]:
# Short, publication-friendly column names.
# Dropped from the mapping: "r_gpcd" -> "r_gpcd" (a no-op) and
# "dwr_standard_level" (that column is not among the `keeps` columns, so it
# never reaches this frame and the entry had no effect).
rename_df = merge_regions_df.rename(columns={
    "water_system_id": "pwsid",
    "report_period_start_date": "reporting_month",
    "pop_report_period": "population",
    "potable_supply_minus_sold_minus_ag_gal": "total_water_production",
    "region_pop_weighted_rgpcd": "regional_r_gpcd",
    "county_pop_weighted_rgpcd": "county_r_gpcd",
})

### Chart

In [431]:
# Long format: one row per supplier, month and measure (own r-gpcd vs. county average).
melt = rename_df.melt(
    id_vars=["supplier_name", "main_county", "reporting_month"],
    value_vars=["r_gpcd", "county_r_gpcd"],
)

In [433]:
agency_name = "Los Angeles City Department of Water And Power"
# agency_name = "East Bay Municipal Utility District"

# Chart the supplier's 12 most recent reporting months, matching the statewide
# chart and the "last 12 months" export. `.head(12)` picked the oldest 12
# months instead, which include zero-filled placeholder rows.
agency_recent_df = (
    rename_df[rename_df.supplier_name == agency_name]
    .sort_values("reporting_month")
    .tail(12)
)

base = alt.Chart(agency_recent_df).encode(
    x=alt.X("yearmonth(reporting_month):O").axis(title=""),
    tooltip=["r_gpcd", "reporting_month"]
)

# Bars: the supplier's own residential use
bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None).axis(title="Residential gallons per capita per day"),
    text="r_gpcd"
)

# Stepped line: the population-weighted average for the supplier's county
avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("county_r_gpcd"),
    text="county_r_gpcd"
)

(
    bar +
    avg_line +
    bar.mark_text(align='center', dy=-7) +
    avg_line.mark_text(align='center', dy=-7)
).properties(title=f"{agency_name} residential water usage compared to county average", width=600)

In [434]:
# Statewide weighted r-gpcd for the 12 most recent months, as labelled bars.
statewide_recent = statewide_df.tail(12)
rgpcd_field = "state_pop_weighted_rgpcd"
month_field = "report_period_start_date"

base = alt.Chart(statewide_recent).encode(
    x=alt.X(f"yearmonth({month_field}):O").axis(title=""),
    tooltip=[rgpcd_field, month_field],
)

bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y(rgpcd_field, stack=None).axis(title="Residential gallons per capita per day"),
    text=rgpcd_field,
)

# Value labels sit just above each bar.
bar_labels = bar.mark_text(align='center', dy=-7)

(bar + bar_labels).properties(title="Statewide residential water usage", width=600)

### Sort data

In [385]:
# Order supplier rows by month, then county, then supplier name.
sort_district_df = rename_df.sort_values(
    by=["reporting_month", "main_county", "supplier_name"]
)

In [386]:
# Same month column name as the district file, ordered by month then region.
sort_region_df = (
    region_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month", "hydro_region"])
)

In [387]:
# Same month column name as the district file, ordered by month then county.
sort_county_df = (
    county_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month", "main_county"])
)

In [388]:
# Same month column name as the district file, ordered by month.
sort_state_df = (
    statewide_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month"])
)

### Remove flagged `total_water_production` values before exporting

In [448]:
# Blank out production totals on rows whose production figure is marked 'Flagged'.
production_is_flagged = sort_district_df["potable_supply_minus_sold_minus_ag_gal_flag"].eq('Flagged')
sort_district_df.loc[production_is_flagged, 'total_water_production'] = pd.NA

### Filter dataframe to last 12 months

In [450]:
# For each water system (pwsid), keep its 12 latest rows by reporting month.
district_by_month = sort_district_df.sort_values(by='reporting_month')
last_twelve_months_df = district_by_month.groupby('pwsid').tail(n=12)

In [451]:
# For each water system, the most recent month with a positive r-gpcd.
# Zero-filled placeholder rows are skipped by the > 0 filter.
positive_use_df = sort_district_df.loc[sort_district_df["r_gpcd"].gt(0)]
latest_df = (
    positive_use_df
    .sort_values(by='reporting_month')
    .groupby('pwsid')
    .tail(n=1)
)

### Export

In [452]:
# District-level file: last 12 months per water system.
last_twelve_months_df.to_csv(
    path_or_buf="data/processed/district-level-residential-use.csv",
    index=False,
)

In [453]:
# District-level file: latest month with reported use per water system.
latest_df.to_csv(
    path_or_buf="data/processed/latest-district-level-residential-use.csv",
    index=False,
)

In [454]:
# Regional file: full history.
sort_region_df.to_csv(
    path_or_buf="data/processed/regional-residential-usage.csv",
    index=False,
)

In [455]:
# Regional file: most recent month only. The cutoff comes from the regional
# frame itself, as the county export does, so this file does not depend on
# the backfilled district frame's date range.
latest_region_month = sort_region_df.reporting_month.max()
sort_region_df[
    sort_region_df.reporting_month == latest_region_month
].to_csv("data/processed/latest-regional-residential-use.csv", index=False)

In [456]:
# County file: full history.
sort_county_df.to_csv(
    path_or_buf="data/processed/county-residential-usage.csv",
    index=False,
)

In [457]:
# County file: most recent month only.
latest_county_rows = sort_county_df.loc[
    sort_county_df.reporting_month == sort_county_df.reporting_month.max()
]
latest_county_rows.to_csv("data/processed/latest-county-residential-use.csv", index=False)

In [458]:
# Statewide file: full history.
sort_state_df.to_csv(
    path_or_buf="data/processed/statewide-residential-usage.csv",
    index=False,
)

In [459]:
# Statewide file: most recent month only. The cutoff comes from the statewide
# frame itself, as the county export does, so this file does not depend on
# the backfilled district frame's date range.
latest_state_month = sort_state_df.reporting_month.max()
sort_state_df[
    sort_state_df.reporting_month == latest_state_month
].to_csv("data/processed/latest-statewide-level-residential-use.csv", index=False)