In [130]:
import numpy as np
import requests as re
import pandas as pd
import datetime
import altair as alt
import altair_latimes as lat

In [131]:
# Register the theme from altair_latimes under the name 'latimes', then make it
# the active Altair theme.
alt.themes.register('latimes', lat.theme)
alt.themes.enable('latimes')

ThemeRegistry.enable('latimes')

In [132]:
# Turn off Altair's cap on the number of rows a chart's dataset may contain;
# the table charted below holds a row per day per water year.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [133]:
# Run date taken from the local clock; used below to work out the current water year.
today = datetime.date.today()
today

datetime.date(2024, 2, 16)

# Download daily precipitation data from California Water Watch
This notebook extracts and cleans precipitation statistics for the state as a whole and for each hydrologic region.

[Source](https://cww.water.ca.gov/regionscale)

## 1. Download

Get the most recent water year

In [134]:
def getWaterYear(date):
    """Return the water year that contains ``date``.

    Dates in October through December are assigned to the following
    calendar year; all other dates keep their own calendar year. In other
    words, a water year is named for the calendar year in which it ends.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Any object exposing integer ``month`` and ``year`` attributes.

    Returns
    -------
    int
        ``date.year + 1`` when ``date.month >= 10``, otherwise ``date.year``.
    """
    if date.month >= 10:
        return date.year + 1
    return date.year

In [135]:
# Water year that contains today's date.
current_water_year = getWaterYear(today)

List the water years for the CWW query

In [136]:
# Years sent to the CWW query: 1999 up to, but not including, current_water_year.
# NOTE(review): presumably each value is the calendar year in which a water year
# starts, which would make the last entry the current water year — confirm against the API.
all_years = list(range(1999,current_water_year))

In [137]:
# Comma-and-space separated year list ("1999, 2000, ...") for the query string.
all_years_list_str = ", ".join(str(year) for year in all_years)

List the region IDs

In [138]:
# Region names requested from CWW, one request per name (sent as the `atlasIDs`
# query parameter). 'Statewide' comes first and is the region charted further down.
atlasIDs = [
    'Statewide',
    'North Coast',
    'Sacramento River',
    'North Lahontan',
    'San Francisco Bay',
    'San Joaquin River',
    'Central Coast',
    'Tulare Lake',
    'South Lahontan',
    'South Coast',
    'Colorado River'
]

Loop through the region IDs, requesting each region's data and collecting one DataFrame per region

In [139]:
# Seconds to wait on each CWW request before giving up, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 60

# One wide DataFrame per region: a "date" column, one column per series
# returned by the service, and a "hydrologic_region" label.
df_list = []

for atlasId in atlasIDs:

    # get data and parse (note: `re` is this notebook's alias for `requests`)
    url = f"https://cww.water.ca.gov/service/prism/huc8/precipstatsmultiyear?years={all_years_list_str},3&atlasIDs={atlasId}&dataScales=Watershed"
    response = re.get(url, timeout=REQUEST_TIMEOUT)

    # Fail with a clear HTTP error instead of a confusing JSON/KeyError further down
    response.raise_for_status()
    data = response.json()['data']

    # Each row of data['series'] gets "date" as its first column label, followed
    # by the series names. Build a new list rather than mutating the response.
    columns = ["date", *data['seriesNames']]

    # create df
    region_df = pd.DataFrame(data['series'], columns=columns)
    region_df["hydrologic_region"] = atlasId

    df_list.append(region_df)

## 2. Transform the data

In [140]:
# Label of the current water year in "YYYY-YYYY" form (previous year, then current).
wy_range = f"{current_water_year - 1}-{current_water_year}"
wy_range

'2023-2024'

In [141]:
def melt(df):
    """Reshape one region's wide precipitation table into long form.

    Every column other than ``date``, ``Average`` and ``hydrologic_region``
    is treated as a water-year series named ``"<start>-<end>"`` and is
    unpivoted into ``water_year`` / ``precip_inches`` pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with ``date``, ``Average`` and ``hydrologic_region``
        columns plus one column per water year.

    Returns
    -------
    pandas.DataFrame
        Long table with the id columns plus ``water_year``,
        ``precip_inches``, ``month``, ``day``, ``water_year_start``,
        ``water_year_end`` and ``clean_date``. ``clean_date`` is the row's
        month and day placed in the calendar year it falls in for that
        water year; combinations that are not real dates (for example
        Feb. 29 in a non-leap year) become ``NaT``.
    """
    # melt dataframe
    melt_df = pd.melt(
        df,
        id_vars=["date", "Average", "hydrologic_region"],
        var_name="water_year",
        value_name="precip_inches"
    )

    # clean date
    melt_df["date"] = pd.to_datetime(melt_df["date"])
    melt_df["month"] = melt_df["date"].dt.month
    melt_df["day"] = melt_df["date"].dt.day

    year_parts = melt_df["water_year"].str.split("-")
    melt_df["water_year_start"] = year_parts.str[0]
    melt_df["water_year_end"] = year_parts.str[1]

    # A water year starts in October: Oct-Dec fall in the calendar year the
    # water year starts in, Jan-Sep in the calendar year it ends in.
    calendar_year = melt_df["water_year_start"].where(
        melt_df["month"] >= 10,
        melt_df["water_year_end"]
    )

    # errors='coerce' turns impossible dates into NaT instead of raising
    melt_df["clean_date"] = pd.to_datetime(
        calendar_year.astype(str)
        + "-" + melt_df["month"].astype(str)
        + "-" + melt_df["day"].astype(str),
        errors='coerce'
    )

    return melt_df

In [142]:
# One cleaned, long-format DataFrame per region.
df_list_clean = []

for region_df in df_list:

    # Forward fill gaps in the series. The date and the current water year are
    # parked in the index while filling so that they are left untouched: the
    # current year's missing values must stay missing.
    filled_df = region_df.set_index(["date", wy_range]).ffill().reset_index()

    # re-shape with melt (also parses `date` and derives month, day, clean_date)
    melt_df = melt(filled_df)

    # sort by water year and date so each year's rows are in date order
    sort_df = melt_df.sort_values(["water_year_start", "date"], ascending=[False, True])

    # calculate daily precip amounts: change in the accumulated total from the
    # previous row of the same water year (the first row of each year is NaN)
    sort_df["precip_daily"] = sort_df.groupby("water_year")["precip_inches"].diff()

    # cut leap days
    is_leap_day = (sort_df["month"] == 2) & (sort_df["day"] == 29)
    bye_leap_day_df = sort_df[~is_leap_day].copy()

    # on Oct. 1, replace the N/A daily value with the accumulated value (`precip_inches`)
    is_oct_first = (bye_leap_day_df["month"] == 10) & (bye_leap_day_df["day"] == 1)
    bye_leap_day_df.loc[is_oct_first, "precip_daily"] = bye_leap_day_df.loc[is_oct_first, "precip_inches"]

    # fill missing data with "M" so that we can differentiate missing and 0 precip. values later on
    bye_leap_day_df["precip_daily"] = bye_leap_day_df["precip_daily"].fillna("M")

    # Clean water year column from (YYYY-YYYY) to (YYYY-YY) format.
    # Assign to the column directly: `.loc["water_year"]` with a single label
    # addresses a *row* called "water_year", which appends an all-NaN row and
    # leaves the column unchanged. `bye_leap_day_df` is an explicit copy, so
    # plain column assignment is safe here.
    bye_leap_day_df["water_year"] = (
        bye_leap_day_df["water_year_start"].astype(str)
        + "-"
        + bye_leap_day_df["water_year_end"].astype(str).str[2:]
    )

    # A series name without a "-" (e.g. "Average") has no end year, so the
    # expression above yields "Average-n" (from "nan"[2:]); restore the plain label.
    bye_leap_day_df.loc[
        bye_leap_day_df.water_year == "Average-n",
        "water_year"
    ] = "Average"

    # drop rows whose date is missing
    drop_weird_rows = bye_leap_day_df.dropna(subset=['date']).copy()

    df_list_clean.append(drop_weird_rows)

In [143]:
# Stack every region's cleaned rows into a single long table.
df = pd.concat(df_list_clean)

In [144]:
# Make sure both date columns are datetime dtype before charting and export.
df["date"] = pd.to_datetime(df["date"])
df["clean_date"] = pd.to_datetime(df["clean_date"])

In [145]:
# Quick visual check of one region: one row of circles per water year,
# sized by that day's precipitation.
chart_region = atlasIDs[0]

# `precip_daily` uses the string "M" for missing days, which makes the column a
# mix of text and numbers. Convert it to numeric (missing -> NaN) for the chart
# only, so the size channel is quantitative; `df` itself is left unchanged.
chart_df = df[df.hydrologic_region == chart_region].assign(
    precip_daily=lambda region_rows: pd.to_numeric(region_rows["precip_daily"], errors="coerce")
)

alt.Chart(chart_df).mark_circle().encode(
    x="date:T",
    y="water_year",
    size="precip_daily:Q"
).properties(
    title=f"{chart_region} precipitation", 
    width=800
)

## 3. Trim and export cumulative and daily CSVs

In [146]:
# Daily table: drop the accumulated totals and let the daily values take over
# the `precip_inches` column name.
diffs_df = (
    df
    .drop(columns="precip_inches")
    .rename(columns={"precip_daily": "precip_inches"})
)

In [147]:
# Write the daily table; index=False leaves the DataFrame index out of the file.
diffs_df.to_csv("../../data/processed/precipitation/daily/all-regions-daily-precip.csv", index=False)

In [148]:
# Accumulated table: same rows without the derived daily column.
cumulative_df = df.drop(columns=["precip_daily"])

In [149]:
# Write the accumulated table; index=False leaves the DataFrame index out of the file.
cumulative_df.to_csv("../../data/processed/precipitation/daily/all-regions-accumulated-precip.csv", index=False)