In [148]:
import pandas as pd
import altair as alt
import altair_latimes as lat

In [149]:
# Make the LA Times theme known to Altair under one name, then switch to it
# so every chart rendered below picks it up.
theme_name = 'latimes'
alt.themes.register(theme_name, lat.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('latimes')

### Import

Monthly reports timeseries

In [150]:
# Load the monthly supplier reports; both ends of the reporting period are
# parsed as datetimes so they can be compared and grouped later on.
date_columns = ["report_period_start_date", "report_period_end_date"]
df = pd.read_csv("data/latest.csv", parse_dates=date_columns)

In [151]:
# Spot-check the rows reported by a single supplier.
df.loc[df["supplier_name"].eq('City of Thousand Oaks')]

Unnamed: 0,org_id,supplier_name,water_system_id,county,hydro_region,climate_zone,report_period_start_date,report_period_end_date,pop_report_period,dwr_standard_level,...,potable_demand_o_gal,potable_demand_prelim_est,recycled_demand_gal,non_potable_demand_prelim_est,potable_supply_minus_sold_minus_ag_gal,potable_supply_minus_sold_minus_ag_gal_flag,potable_supply_minus_sold_zscore,potable_demand_res_zscore,r-gpcd_zscore,potable_supply_minus_sold_minus_ag_zscore
40925,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2024-04-01,2024-04-30,53157,1 (Less than 10% Shortage),...,1496.10,,0.0,,167845850.1,,-1.53,-1.85,-1.87,-1.53
40926,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2024-03-01,2024-03-31,53157,1 (Less than 10% Shortage),...,47127.28,,0.0,,145916077.8,,-1.91,-1.51,-1.63,-1.91
40927,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2024-02-01,2024-02-29,53157,1 (Less than 10% Shortage),...,9724.68,,0.0,,117860306.7,,-1.19,-1.04,-0.94,-1.19
40928,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2024-01-01,2024-01-31,53157,1 (Less than 10% Shortage),...,53111.69,,0.0,,152400512.7,,-0.60,1.83,1.66,-0.60
40929,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2023-12-01,2023-12-31,53157,1 (Less than 10% Shortage),...,0.00,,0.0,,190850930.7,,0.05,-0.38,-0.47,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41039,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2014-10-01,2014-10-31,51609,,...,,,,,351267378.0,,1.29,1.62,1.57,1.29
41040,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2014-09-01,2014-09-30,51609,,...,,,,,368179044.9,,1.60,1.05,1.25,1.60
41041,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2014-08-01,2014-08-31,51609,,...,,,,,390695349.0,,1.24,1.68,2.24,1.24
41042,2469,City of Thousand Oaks,CA5610020,VENTURA,South Coast,9,2014-07-01,2014-07-31,51609,,...,,,,,402751836.0,,1.49,0.12,0.08,1.49


In [152]:
# clean_names = pd.read_csv("data/metadata/urban-water-suppliers-clean-names.csv")

In [153]:
# Supplier metadata keyed by org_id; the merge below takes coordinates and main county from it.
crosswalk = pd.read_csv(filepath_or_buffer="data/metadata/crosswalk.csv")

### Clean

Strip leading and trailing spaces from column names and replace hyphens with underscores

In [154]:
# Normalise the headers: trim surrounding spaces first, then turn hyphens into
# underscores (so `r-gpcd` becomes `r_gpcd`).
df.columns = [header.strip(' ').replace("-", "_") for header in df.columns]

Eliminate double spaces in supplier names

In [155]:
# Collapse every run of two or more spaces into a single space. A literal
# "  " -> " " replacement makes only one pass, so three or more consecutive
# spaces would still leave a double space behind.
df["supplier_name"] = df["supplier_name"].str.replace(r" {2,}", " ", regex=True)

In [156]:
# Drop leading and trailing whitespace from supplier names.
df = df.assign(supplier_name=df['supplier_name'].str.strip())

In [157]:
# Title-case county names (the raw values shown above are upper case, e.g. "VENTURA").
df = df.assign(county=df['county'].str.title())

### Merge crosswalk coordinates and main county

In [158]:
# Attach each supplier's coordinates and main county from the crosswalk.
# The left join keeps every report row, even when the crosswalk has no entry
# for its org_id.
crosswalk_columns = ['org_id', 'longitude', 'latitude', 'main_county']
merge_names_df = df.merge(crosswalk[crosswalk_columns], how="left", on="org_id")

### Trim

Remove rows whose `r-gpcd` value is flagged

In [159]:
# Keep every row whose res_flag is anything other than 'Flagged'
# (rows with no flag at all are kept too).
not_flagged = merge_names_df['res_flag'].ne('Flagged')
remove_flagged = merge_names_df.loc[not_flagged]

Trim this down to just the columns we need

In [160]:
# Columns carried into the rest of the analysis, in output order.
keeps = [
    # identifiers
    'org_id', 'supplier_name', 'water_system_id',
    # reporting period
    'report_period_start_date', 'report_period_end_date',
    # geography and population
    'main_county', 'hydro_region', 'pop_report_period',
    # usage figures and their flags
    'potable_supply_minus_sold_minus_ag_gal',
    'potable_supply_minus_sold_minus_ag_gal_flag',
    'r_gpcd', 'res_flag',
    # coordinates from the crosswalk
    'latitude', 'longitude',
]

In [161]:
# Narrow the unflagged rows down to the columns listed in `keeps`.
trim_df = remove_flagged.loc[:, keeps]

### Calculate

Calculate population-weighted r-gpcd for hydrologic regions and counties

In [162]:
def regional_calcs(df, gals, rgpcd, pop, geography):
    """
    Summarise one group of supplier reports.

    Args:
        df: DataFrame holding the rows of a single group.
        gals: name of the column whose values are summed into `total_gallons`.
        rgpcd: name of the column that is averaged.
        pop: name of the column summed into `total_pop` and used as the weight.
        geography: prefix for the weighted-average field name.

    Returns:
        pd.Series indexed by 'total_pop', 'total_gallons' and
        f'{geography}_pop_weighted_rgpcd'. The weighted average is NaN when no
        row has both a value and a non-zero total weight.
    """
    val = df[rgpcd]
    wt = df[pop]

    # Only rows that have both a value and a weight may enter the average.
    # Summing all weights while the numerator silently skips NaN products
    # would pull the average towards zero.
    usable = val.notna() & wt.notna()
    weight_sum = wt[usable].sum()
    if weight_sum:
        wt_avg = (val[usable] * wt[usable]).sum() / weight_sum
    else:
        wt_avg = float("nan")

    # Totals still cover every row in the group.
    total_gals = df[gals].sum()
    total_pop = df[pop].sum()
    return pd.Series(
        [total_pop, total_gals, wt_avg],
        index=['total_pop', 'total_gallons', f'{geography}_pop_weighted_rgpcd'],
    )

In [163]:
# One summary row per month and hydrologic region.
region_keys = ['report_period_start_date', 'hydro_region']
region_df = (
    trim_df
    .groupby(region_keys)
    .apply(
        regional_calcs,
        gals="potable_supply_minus_sold_minus_ag_gal",
        rgpcd='r_gpcd',
        pop='pop_report_period',
        geography='region',
        include_groups=False,
    )
    .reset_index()
)

In [164]:
# One summary row per month and main county.
county_keys = ['report_period_start_date', 'main_county']
county_df = (
    trim_df
    .groupby(county_keys)
    .apply(
        regional_calcs,
        gals="potable_supply_minus_sold_minus_ag_gal",
        rgpcd='r_gpcd',
        pop='pop_report_period',
        geography='county',
        include_groups=False,
    )
    .reset_index()
)

In [165]:
# How many counties have a row in the most recent reporting month?
newest_county_month = county_df["report_period_start_date"].max()
county_df.loc[county_df["report_period_start_date"] == newest_county_month].shape[0]

46

In [166]:
# Number of distinct counties across all months.
county_df["main_county"].nunique(dropna=False)

48

In [167]:
# county_df[county_df.report_period_start_date == county_df.report_period_start_date.max()]\
#     .sort_values('pop_weighted_rgpcd', ascending=False)

Now do it for the entire state

In [168]:
# One summary row per month for the whole state.
statewide_keys = ['report_period_start_date']
statewide_df = (
    trim_df
    .groupby(statewide_keys)
    .apply(
        regional_calcs,
        gals="potable_supply_minus_sold_minus_ag_gal",
        rgpcd='r_gpcd',
        pop='pop_report_period',
        geography='state',
        include_groups=False,
    )
    .reset_index()
)

In [169]:
# Largest monthly population total covered by the reports.
statewide_df["total_pop"].max()

37169224.0

### Remove duplicates

In [170]:
# Row count before de-duplication.
trim_df.shape[0]

45875

In [171]:
# Index by supplier and month so repeated (supplier, month) pairs can be found.
dedupe_keys = ['supplier_name', 'report_period_start_date']
tmp = trim_df.set_index(keys=dedupe_keys)

In [172]:
# Keep the first row for each (supplier, month) pair and restore the index
# levels as ordinary columns.
# NOTE(review): the region, county and statewide aggregates above are computed
# from `trim_df`, i.e. before this de-duplication — confirm that is intended.
is_first_occurrence = ~tmp.index.duplicated(keep="first")
remove_duplicates = tmp.loc[is_first_occurrence].reset_index().copy()

In [173]:
# Row count after de-duplication.
remove_duplicates.shape[0]

45875

### Backfill missing dates

In [174]:
# Earliest reporting month in the trimmed data; `backfill` reads it as the
# start of its date range.
min_date = trim_df["report_period_start_date"].min()
min_date

Timestamp('2014-06-01 00:00:00')

In [175]:
# Latest reporting month in the trimmed data; `backfill` reads it as the
# end of its date range.
max_date = trim_df["report_period_start_date"].max()
max_date

Timestamp('2024-04-01 00:00:00')

In [176]:
def backfill(agency_group, start=None, end=None):
    """
    Expand supplier reports so that every month in a date range has a row.

    Args:
        agency_group: DataFrame of reports with `supplier_name` and
            `report_period_start_date` columns. Each (supplier, month) pair
            must be unique because the frame is reindexed on that pair.
        start: first month of the range. Defaults to the notebook-level
            `min_date`.
        end: last month of the range. Defaults to the notebook-level
            `max_date`.

    Returns:
        DataFrame with one row per supplier for each month start between
        `start` and `end`. Missing values in `r_gpcd` and
        `potable_supply_minus_sold_minus_ag_gal` are set to 0. All other
        columns are forward-filled within each supplier, so months before a
        supplier's first report stay empty in those columns.
    """
    # Fall back to the notebook-wide extent when no explicit range is given.
    if start is None:
        start = min_date
    if end is None:
        end = max_date

    index_cols = ["supplier_name", "report_period_start_date"]
    agency_df = agency_group.sort_values(index_cols).set_index(index_cols)

    # Every month in the range, pinned to the first day of the month
    date_range = pd.date_range(
        start,
        end,
        freq=pd.DateOffset(months=1, day=1),
    )
    # Every supplier name present in the input
    name_range = agency_df.index.unique(level="supplier_name")
    # An index with an entry for every supplier on every month
    namedate_index = pd.MultiIndex.from_product(
        iterables=[name_range, date_range], names=index_cols
    )
    # Reindexing inserts empty rows for the months a supplier did not report
    backfilled_df = agency_df.reindex(namedate_index)

    # Zero out missing usage figures
    backfilled_df = backfilled_df.fillna(
        {'r_gpcd': 0, 'potable_supply_minus_sold_minus_ag_gal': 0}
    )

    # Forward-fill the remaining columns within each supplier
    backfilled_df = backfilled_df.groupby("supplier_name").ffill()

    # Turn the index levels back into ordinary columns
    return backfilled_df.reset_index()

In [177]:
# Run `backfill` once per supplier and stack the results under a fresh index.
# NOTE(review): no later cell visible in this notebook reads `backfilled_df`;
# the merge further down starts from `remove_duplicates` — confirm which is intended.
per_supplier = remove_duplicates.groupby("supplier_name")
backfilled_df = per_supplier.apply(backfill).reset_index(drop=True)

  remove_duplicates.groupby("supplier_name").apply(backfill).reset_index(drop=True)


In [178]:
# Row count after filling in the missing months.
backfilled_df.shape[0]

48195

### Merge regional and county r-gpcd values to district df

In [179]:
# Give every supplier row the weighted average of its hydrologic region and of
# its main county for the same month. Left joins keep rows without a match.
region_lookup = region_df[["hydro_region", "report_period_start_date", "region_pop_weighted_rgpcd"]]
county_lookup = county_df[["main_county", "report_period_start_date", "county_pop_weighted_rgpcd"]]

merge_regions_df = (
    remove_duplicates
    .merge(region_lookup, how="left", on=["hydro_region", "report_period_start_date"])
    .merge(county_lookup, how="left", on=["main_county", "report_period_start_date"])
)

In [180]:
# Preview the first five merged rows.
merge_regions_df.head(5)

Unnamed: 0,supplier_name,report_period_start_date,org_id,water_system_id,report_period_end_date,main_county,hydro_region,pop_report_period,potable_supply_minus_sold_minus_ag_gal,potable_supply_minus_sold_minus_ag_gal_flag,r_gpcd,res_flag,latitude,longitude,region_pop_weighted_rgpcd,county_pop_weighted_rgpcd
0,City of Adelanto,2024-04-01,13,CA3610001,2024-04-30,San Bernardino,South Lahontan,39930,111625700.0,,59.67,,34.582,-117.419,71.394218,72.595237
1,City of Adelanto,2024-03-01,13,CA3610001,2024-03-31,San Bernardino,South Lahontan,39930,107349400.0,,41.33,,34.582,-117.419,52.337422,62.583979
2,City of Adelanto,2024-02-01,13,CA3610001,2024-02-29,San Bernardino,South Lahontan,39930,71419060.0,,41.65,,34.582,-117.419,65.945242,67.664387
3,City of Adelanto,2024-01-01,13,CA3610001,2024-01-31,San Bernardino,South Lahontan,39930,70028640.0,,43.01,,34.582,-117.419,69.937341,77.570275
4,City of Adelanto,2023-12-01,13,CA3610001,2023-12-31,San Bernardino,South Lahontan,39930,106863900.0,,44.39,,34.582,-117.419,70.85796,81.129522


### Round water use figures to save space

In [181]:
# Whole gallons are precise enough for the supply figure.
merge_regions_df = merge_regions_df.round({"potable_supply_minus_sold_minus_ag_gal": 0})

In [182]:
# One decimal place for the supplier's r-gpcd.
merge_regions_df = merge_regions_df.round({"r_gpcd": 1})

In [183]:
# One decimal place for the regional weighted average.
merge_regions_df = merge_regions_df.round({"region_pop_weighted_rgpcd": 1})

In [184]:
# One decimal place for the county weighted average.
merge_regions_df = merge_regions_df.round({"county_pop_weighted_rgpcd": 1})

In [185]:
# One decimal place for the statewide weighted average.
statewide_df = statewide_df.round({"state_pop_weighted_rgpcd": 1})

### Rename columns for clarity and brevity

In [186]:
# Old column name -> shorter name used in the exported files.
column_renames = {
    "water_system_id": "pwsid",
    "report_period_start_date": "reporting_month",
    "pop_report_period": "population",
    # NOTE(review): `dwr_standard_level` is not among the columns in `keeps`,
    # so this entry presumably has no effect — confirm or drop.
    "dwr_standard_level": "dwr_stage",
    "potable_supply_minus_sold_minus_ag_gal": "total_water_production",
    # Identity mapping: the name stays the same.
    "r_gpcd": "r_gpcd",
    "region_pop_weighted_rgpcd": "regional_r_gpcd",
    "county_pop_weighted_rgpcd": "county_r_gpcd",
}
rename_df = merge_regions_df.rename(columns=column_renames)

### Chart

In [187]:
# Long format: one row per supplier, month and measure (supplier vs. county value).
melt = rename_df.melt(
    id_vars=["supplier_name", "main_county", "reporting_month"],
    value_vars=["r_gpcd", "county_r_gpcd"],
)

In [188]:
# melt[(melt.hydro_region == 'South Coast') & (melt.supplier_name.str.contains('Los Angeles'))].supplier_name.unique()

In [189]:
agency_name = "Los Angeles City Department of Water And Power"
# agency_name = "East Bay Municipal Utility District"

# Select the agency's twelve most recent months explicitly. Taking the first
# twelve rows only works while the rows happen to be ordered newest-first.
agency_recent_df = (
    rename_df[rename_df.supplier_name == agency_name]
    .sort_values("reporting_month")
    .tail(12)
)

# Shared x axis and tooltip for every layer
base = alt.Chart(agency_recent_df).encode(
    x=alt.X("yearmonth(reporting_month):O").axis(title=""),
    tooltip=["r_gpcd", "reporting_month"]
)

# Bars: the agency's own r-gpcd
bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("r_gpcd", stack=None).axis(title="Residential gallons per capita per day"),
    text="r_gpcd"
)

# Stepped line: the county's population-weighted average
avg_line = base.mark_line(interpolate='step', color='#1281aa').encode(
    y=alt.Y("county_r_gpcd"),
    text="county_r_gpcd"
)

# Layer the marks and add a value label above each bar and line step
(
    bar +
    avg_line +
    bar.mark_text(align='center', dy=-7) +
    avg_line.mark_text(align='center', dy=-7)
).properties(title=f"{agency_name} residential water usage compared to county average", width=600)

In [190]:
# Last twelve rows of the statewide summary; the shared x axis and tooltip
# are defined once for every layer.
base = alt.Chart(
    statewide_df.tail(12)
).encode(
    x=alt.X("yearmonth(report_period_start_date):O").axis(title=""),
    tooltip=["state_pop_weighted_rgpcd", "report_period_start_date"]
)

# Bars: statewide population-weighted r-gpcd
bar = base.mark_bar(color="#83c6e0").encode(
    y=alt.Y("state_pop_weighted_rgpcd", stack=None).axis(title="Residential gallons per capita per day"),
    text="state_pop_weighted_rgpcd"
)

# Bars plus a value label above each one. The title has no placeholders, so it
# is a plain string rather than an f-string.
(
    bar +
    bar.mark_text(align='center', dy=-7)
).properties(title="Statewide residential water usage", width=600)

### Sort data

In [191]:
# Order supplier rows by month, then county, then supplier name.
district_sort_keys = ["reporting_month", "main_county", "supplier_name"]
sort_district_df = rename_df.sort_values(by=district_sort_keys)

In [192]:
# Regional summary with the month column renamed, ordered by month then region.
sort_region_df = (
    region_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month", "hydro_region"])
)

In [193]:
# County summary with the month column renamed, ordered by month then county.
sort_county_df = (
    county_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month", "main_county"])
)

In [194]:
# Statewide summary with the month column renamed, ordered by month.
sort_state_df = (
    statewide_df
    .rename(columns={"report_period_start_date": "reporting_month"})
    .sort_values(by=["reporting_month"])
)

### Filter dataframe to be a bit more manageable

In [195]:
#filtered_district_df = sort_district_df[sort_district_df.reporting_month >= "2021-01-15"]

In [196]:
# Last twelve reporting months for every water system.
# The sort is stable so that rows sharing a month keep the county/supplier
# order established in `sort_district_df`; the default sort algorithm does not
# guarantee that, which would make the exported row order (and the tie-break
# inside `tail`) nondeterministic.
last_twelve_months_df = (
    sort_district_df
    .sort_values('reporting_month', kind='stable')
    .groupby('pwsid')
    .tail(12)
)

In [198]:
# Most recent month with a positive r-gpcd for every water system.
# The sort is stable so that rows sharing a month keep their existing order;
# the default sort algorithm does not guarantee that, which would make the row
# picked by `tail(1)` on a tie nondeterministic.
latest_df = (
    sort_district_df[sort_district_df.r_gpcd > 0]
    .sort_values('reporting_month', kind='stable')
    .groupby('pwsid')
    .tail(1)
)

### Export

In [199]:
# Write each water system's last twelve months, without the DataFrame index.
last_twelve_months_df.to_csv(
    path_or_buf="data/processed/district-level-residential-use.csv",
    index=False,
)

In [200]:
# Write each water system's latest positive reading, without the DataFrame index.
latest_df.to_csv(
    path_or_buf="data/processed/latest-district-level-residential-use.csv",
    index=False,
)

In [201]:
# Write the full regional time series, without the DataFrame index.
sort_region_df.to_csv(
    path_or_buf="data/processed/regional-residential-usage.csv",
    index=False,
)

In [202]:
# Write only the regional rows for the newest reporting month.
# NOTE(review): the cut-off month comes from `sort_district_df`, whereas the
# county export filters on its own frame's maximum — confirm this is intended.
district_newest_month = sort_district_df.reporting_month.max()
region_is_newest = sort_region_df.reporting_month == district_newest_month
sort_region_df[region_is_newest].to_csv(
    "data/processed/latest-regional-residential-use.csv", index=False
)

In [203]:
# Write the full county time series, without the DataFrame index.
sort_county_df.to_csv(
    path_or_buf="data/processed/county-residential-usage.csv",
    index=False,
)

In [204]:
# Write only the county rows for the newest month present in the county frame.
# NOTE(review): the regional and statewide "latest" exports take their cut-off
# month from `sort_district_df` instead — confirm the difference is intended.
county_newest_month = sort_county_df.reporting_month.max()
county_is_newest = sort_county_df.reporting_month == county_newest_month
sort_county_df[county_is_newest].to_csv(
    "data/processed/latest-county-residential-use.csv", index=False
)

In [205]:
# Write the full statewide time series, without the DataFrame index.
sort_state_df.to_csv(
    path_or_buf="data/processed/statewide-residential-usage.csv",
    index=False,
)

In [206]:
# Write only the statewide row for the newest reporting month.
# NOTE(review): the cut-off month comes from `sort_district_df`, whereas the
# county export filters on its own frame's maximum — confirm this is intended.
state_cutoff_month = sort_district_df.reporting_month.max()
state_is_newest = sort_state_df.reporting_month == state_cutoff_month
sort_state_df[state_is_newest].to_csv(
    "data/processed/latest-statewide-level-residential-use.csv", index=False
)