# HBV calibration

This notebook is used to calibrate the parameters of the HBV model for the Loire river analysis. The calibration period runs from 1990 through 2019 (see `experiment_start_date` and `experiment_end_date` in section 2), as ERA5 only goes until 2019 for this catchment.

### 1. Importing general python modules

In [134]:
# general python
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import geopandas as gpd
import pandas as pd

#niceties
from rich import print

# Needed
from ipywidgets import IntProgress
from IPython.display import display
from scipy.stats import qmc
from sklearn.metrics import mean_squared_error
from scipy.stats import wasserstein_distance

In [135]:
# general eWaterCycle
import ewatercycle
import ewatercycle.models
import ewatercycle.forcing

In [136]:
# import drought analyser function
%run Drought_analyser.ipynb

### 2. Defining experiment data and paths 

In [137]:
# name of the catchment (also the column name of the observed discharge)
basin_name = "FR003882"

# defining dates for calibration (ISO dates, used to slice the observations
# and to build the ERA5 time window)
experiment_start_date = "1990-01-01"
experiment_end_date = "2019-12-31"

# defining path for catchment shape file (used when generating the forcing)
station_shp = Path.home() / "BEP-Loire" / "book" / "model_loire" / "estreams_cb_FR003882.shp"

# defining destination path for ERA5 data
forcing_path_ERA5 = Path.home() / "forcing" / "loire_river" / "ERA5-90-19"
# parents=True: on a fresh machine ~/forcing/loire_river does not exist yet and
# a plain mkdir(exist_ok=True) would raise FileNotFoundError.
forcing_path_ERA5.mkdir(parents=True, exist_ok=True)

# model HBV destination path (passed as cfg_dir when the models are set up)
model_path_HBV = Path.home() / "tmp" / "HBV_model"

# Catchment area in km2: reproject the catchment polygon(s) to EPSG:2154
# (Lambert-93, a metric CRS for France) so that geometry.area is in m2,
# then convert m2 -> km2 and sum over all polygons.
# NOTE(review): the shapefile is read relative to the working directory, not
# from station_shp — confirm both point at the same file.
gdf = gpd.read_file("estreams_cb_FR003882.shp")
gdf = gdf.to_crs(epsg=2154)
gdf["area_km2"] = gdf.geometry.area / 1e6
basin_area = gdf["area_km2"].sum()
# A hard-coded fallback of 40630 was used here previously (presumably km2).

### 3. Generating ERA5 forcings

In [138]:
# ERA5 time window as ISO-8601 UTC timestamps. These are only needed when the
# forcing is (re)generated with the disabled call below.
ERA5_start_date = f"{experiment_start_date}T00:00:00Z"
ERA5_end_date = f"{experiment_end_date}T00:00:00Z"

# Option one (disabled): generate the forcing from the ERA5 dataset.
#ERA5_forcing = ewatercycle.forcing.sources["LumpedMakkinkForcing"].generate(
#   dataset="ERA5",
#   directory= str(forcing_path_ERA5),
#   start_time=ERA5_start_date,
#   end_time=ERA5_end_date,
#   shape=station_shp,
#)

# Option two (active): load the forcing previously stored in forcing_path_ERA5.
era5_forcing_source = ewatercycle.forcing.sources["LumpedMakkinkForcing"]
ERA5_forcing = era5_forcing_source.load(directory=forcing_path_ERA5)
print(ERA5_forcing)

### 4. Defining historical data from eStreams

In [139]:
# Observed streamflow, indexed by the parsed 'date' column
# (units presumably m3/s, going by the file name — confirm).
q_data = pd.read_csv(
    "FR003882_streamflow_m3s.csv",
    index_col="date",
    parse_dates=True,
)

# Keep only the experiment period (label-based slice, both ends inclusive).
Q_obs = q_data.loc[experiment_start_date:experiment_end_date]

### 5. Calibrate the parameters

#### 5.1 Calibration method 1: RMSE
For the first method the Root Mean Squared Error (RMSE) is used. It serves as a sanity check that the rest of the calibration workflow works, since it should yield good parameters for discharge. Because we are only interested in low flows, the intent is to evaluate the RMSE only on discharge values below 150 m³/s; note that this low-flow filter is currently commented out in the code below, so the RMSE is computed over all discharge values. The first year of the modelled discharge is ignored during calibration, because the initial storages (`s_0`) need time to fill. The function also checks whether the mean modelled flow is below 30 m³/s and, if so, rejects the parameter set immediately: such a simulation can never match the observed discharge, so skipping it speeds up the calibration.

In [140]:
def drought_calibration_objective1(modelOutput, observation, start_calibration, end_calibration):
    """Return the RMSE between modelled and observed discharge.

    Parameters
    ----------
    modelOutput : pandas.DataFrame
        Modelled discharge with a date index and a 'model output' column.
    observation : pandas.DataFrame or pandas.Series
        Observed discharge with a date index; must provide a column (or
        Series name) equal to the notebook-level ``basin_name``.
    start_calibration, end_calibration : str
        Start and end date of the experiment. The first year after
        ``start_calibration`` is skipped as spin-up, and both window bounds
        are exclusive.

    Returns
    -------
    float
        The root-mean-square error over the calibration window, or ``np.inf``
        when the mean modelled flow is below 30 or no paired values remain.
    """
    min_mean_flow = 30  # reject parameter sets that produce almost no discharge

    # Align the model output onto the observation dates and put both side by side
    hydro_data = pd.concat([modelOutput.reindex(observation.index, method='ffill'), observation], axis=1)

    # Select calibration period (skip first year). DateOffset is used instead of
    # string arithmetic on the year so a 29 February start date does not fail.
    window_start = pd.Timestamp(pd.Timestamp(start_calibration).date()) + pd.DateOffset(years=1)
    window_end = pd.Timestamp(pd.Timestamp(end_calibration).date())
    in_window = (hydro_data.index > window_start) & (hydro_data.index < window_end)
    hydro_data = hydro_data[in_window].dropna(subset=[basin_name])

    # Check if the discharge is not too low
    mean_flow = hydro_data['model output'].mean()
    if mean_flow < min_mean_flow:
        return np.inf

    # Only rows where both values are present can contribute to the error
    paired = hydro_data[['model output', basin_name]].dropna()
    if paired.empty:
        return np.inf

    # RMSE computed directly; the columns are addressed by name (positional
    # labels 0/1 do not exist) and no scikit-learn `squared` keyword is needed.
    residuals = paired['model output'].to_numpy(dtype=float) - paired[basin_name].to_numpy(dtype=float)
    return float(np.sqrt(np.mean(residuals ** 2)))

#### 5.2 Calibration method 2: Normal distribution

In [141]:
def drought_calibration_objective2(modelOutput, observation, start_calibration, end_calibration):
    """Score a simulation by how closely its drought statistics match the observed ones.

    The modelled and observed series are aligned on the observation dates and
    restricted to the calibration window, with the first year skipped and both
    bounds exclusive. ``drought_analyser`` is then run on each series, and the
    Wasserstein (Earth Mover's) distances between the observed and modelled
    distributions of drought duration and of maximum cumulative deficit are
    averaged with equal weights.

    Returns ``np.inf`` when the mean modelled flow is below 30 or when either
    drought table is empty.
    """
    duration_col = "Duration (days)"
    deficit_col = "Max Cumulative Deficit (m3/s)"

    # Put the modelled series (aligned to observation dates) next to the observations
    aligned_model = modelOutput.reindex(observation.index, method='ffill')
    merged = pd.concat([aligned_model, observation], axis=1)

    # Calibration window: shift the start by one year, bounds are exclusive
    shifted_start = f"{int(start_calibration[:4]) + 1}{start_calibration[4:]}"
    lower_bound = pd.to_datetime(pd.Timestamp(shifted_start).date())
    merged = merged[merged.index > lower_bound]
    upper_bound = pd.to_datetime(pd.Timestamp(end_calibration).date())
    merged = merged[merged.index < upper_bound]
    merged = merged.dropna(subset=[basin_name])

    # Threshold check: discard simulations with too little discharge
    if merged['model output'].mean() < 30:
        return np.inf

    # Drought tables for observation and model
    # (third argument 60 is passed through unchanged; presumably a threshold — confirm)
    observed_droughts = drought_analyser(merged[basin_name], basin_name, 60)
    modelled_droughts = drought_analyser(merged['model output'], 'model output', 60)

    if modelled_droughts.empty or observed_droughts.empty:
        return np.inf

    # Earth Mover's Distance per metric
    duration_emd = wasserstein_distance(observed_droughts[duration_col], modelled_droughts[duration_col])
    deficit_emd = wasserstein_distance(observed_droughts[deficit_col], modelled_droughts[deficit_col])

    # Equal-weight combination of both distances
    return duration_emd * 0.5 + deficit_emd * 0.5

#### 5.3 Calibration method 3: Fitted polynomial

In [153]:
def drought_calibration_objective3(modelOutput, observation, start_calibration, end_calibration):
    """Score a simulation by comparing fitted drought curves and drought distributions.

    Parameters
    ----------
    modelOutput : pandas.DataFrame
        Modelled discharge with a date index and a 'model output' column.
    observation : pandas.DataFrame or pandas.Series
        Observed discharge with a date index; must provide a column (or
        Series name) equal to the notebook-level ``basin_name``.
    start_calibration, end_calibration : str
        Start and end date of the experiment. The first year after
        ``start_calibration`` is skipped, and both window bounds are exclusive.

    Returns
    -------
    float
        Sum of (a) the absolute differences between the coefficients of a
        straight line fitted to deficit-versus-duration for observed and
        modelled droughts and (b) the value returned by
        ``earth_movers_distance``. ``np.inf`` is returned when either drought
        table is empty, when the smallest modelled deficit differs by more
        than 20 % from the smallest observed deficit, or when fewer than two
        droughts are available for a fit.
    """
    duration_col = "Duration (days)"
    deficit_col = "Max Cumulative Deficit (m3/s)"
    min_deficit_tolerance = 0.2  # allowed relative deviation of the smallest deficit

    # Combine modeled and observed data in one DataFrame
    hydro_data = pd.concat([modelOutput.reindex(observation.index, method='ffill'), observation], axis=1)

    # Select calibration period (skip first year). DateOffset is used instead of
    # string arithmetic on the year so a 29 February start date does not fail.
    window_start = pd.Timestamp(pd.Timestamp(start_calibration).date()) + pd.DateOffset(years=1)
    window_end = pd.Timestamp(pd.Timestamp(end_calibration).date())
    in_window = (hydro_data.index > window_start) & (hydro_data.index < window_end)
    hydro_data = hydro_data[in_window].dropna(subset=[basin_name])

    # Run drought analyser on both modeled and observed data
    # (third argument 60 is passed through unchanged; presumably a threshold — confirm)
    drought_obs = drought_analyser(hydro_data[basin_name], basin_name, 60)
    drought_model = drought_analyser(hydro_data['model output'], 'model output', 60)

    # Explicit guard, consistent with objective 2: nothing to compare without droughts
    if drought_obs.empty or drought_model.empty:
        return np.inf

    # The smallest modelled deficit must lie within +/-20 % of the smallest
    # observed one. Written with absolute values so the band also works if the
    # deficits are negative; the original `1.2 * obs < model or 0.8 * obs > model`
    # form rejects every sample in that case.
    obs_min_deficit = drought_obs[deficit_col].min()
    model_min_deficit = drought_model[deficit_col].min()
    if abs(model_min_deficit - obs_min_deficit) > min_deficit_tolerance * abs(obs_min_deficit):
        return np.inf

    x_obs, y_obs = drought_obs[duration_col], drought_obs[deficit_col]
    x_model, y_model = drought_model[duration_col], drought_model[deficit_col]

    # A straight-line fit needs at least two droughts in each table
    if len(x_obs) < 2 or len(x_model) < 2:
        return np.inf

    # Fit first-degree polynomials (slope, intercept) to both datasets
    coeffs_obs = np.polyfit(x_obs, y_obs, 1)
    coeffs_model = np.polyfit(x_model, y_model, 1)

    # Sum of the coefficient differences plus the distribution distance.
    # earth_movers_distance is not defined in this notebook; presumably it is
    # provided by Drought_analyser.ipynb via %run — confirm.
    poly_diff = np.sum(np.abs(coeffs_obs - coeffs_model))
    distribution_diff = earth_movers_distance(x_obs, y_obs, x_model, y_model)

    return poly_diff + distribution_diff

#### 5.4 Start calibrating

In [154]:
# Define initial storage for the model
#               Si,  Su, Sf, Ss, Sp
s_0 = np.array([0,  100,  0,  15, 0])


# Define parameter ranges for the model (lower and upper bound per parameter).
# NOTE(review): the nine entries presumably follow the HBV parameter order of
# the eWaterCycle HBV plugin — confirm against its documentation.
p_min = np.array([0,   0.2,  40,    .5,   .001,   1,     .01,  .0001,  .01])
p_max = np.array([8,    1,  800,   4,    .3,     10,    .1,   .01,  0.5])

# Number of parameter sets to sample
N = 10

# Fixed seed so that Restart & Run All reproduces the same parameter sets
SEED = 42

# Create a Latin Hypercube sampler with one dimension per parameter
sampler = qmc.LatinHypercube(d=len(p_min), seed=SEED)
sample = sampler.random(n=N)

# Scale the unit-cube sample to the parameter ranges.
# Result has shape (N, number of parameters): one row per parameter set.
parameters = qmc.scale(sample, p_min, p_max)

In [155]:
# One HBV model instance per sampled parameter set, each set up and initialized.
ensemble = []

for counter in range(N):
    member = ewatercycle.models.HBVLocal(forcing=ERA5_forcing)
    ensemble.append(member)

    # setup() returns the config file plus a second value that is not used here.
    # NOTE(review): every member is given the same cfg_dir — confirm that the
    # members do not overwrite each other's configuration.
    setup_result = member.setup(
        parameters=parameters[counter],
        initial_storage=s_0,
        cfg_dir=model_path_HBV,
    )
    config_file, _ = setup_result
    member.initialize(config_file)

In [156]:
# Progress bar for visualization
f = IntProgress(min=0, max=N)
display(f)

# Objective value of every ensemble member, in ensemble order
objectives = []

try:
    # Loop over ensemble members
    for ensembleMember in ensemble:
        q_sim = []
        timestamps = []

        # Step the model to its end time, recording discharge and date per step
        while ensembleMember.time < ensembleMember.end_time:
            ensembleMember.update()
            q_sim.append(ensembleMember.get_value("Q")[0])
            timestamps.append(pd.Timestamp(ensembleMember.time_as_datetime.date()))

        # Create DataFrame for model results.
        # convert_Qsim_mmday_to_m3s is not defined in this notebook; presumably
        # it is provided by Drought_analyser.ipynb via %run — confirm.
        q_sim_m3s = convert_Qsim_mmday_to_m3s(np.array(q_sim), basin_area)
        discharge_dataframe = pd.DataFrame({'model output': q_sim_m3s}, index=pd.to_datetime(timestamps))

        # Print the scalar mean of the column, not the repr of a whole Series
        print(f"Mean Discharge : {discharge_dataframe['model output'].mean():.2f} m3/s")

        # Calculate the custom drought-based objective function
        objectives.append(
            drought_calibration_objective3(
                discharge_dataframe,
                Q_obs,
                experiment_start_date,
                experiment_end_date,
            )
        )
        f.value += 1
finally:
    # Always release the models, even when a run or the objective raised
    for ensembleMember in ensemble:
        ensembleMember.finalize()

IntProgress(value=0, max=10)

### 6. Results

In [158]:
# Select the parameter set with the lowest objective value.
objective_values = np.asarray(objectives, dtype=float)

# Only finite objectives are valid candidates: np.inf marks a rejected sample,
# and a NaN would otherwise be picked by np.argmin as the "minimum".
finite_mask = np.isfinite(objective_values)

if not finite_mask.any():
    # Every sample was rejected, so there is no meaningful best set to report.
    parameters_minimum_index = None
    parameters_minimum = None
    print("No real parameter is chosen")
else:
    # Non-finite entries are replaced by +inf so they can never win the argmin
    parameters_minimum_index = int(np.argmin(np.where(finite_mask, objective_values, np.inf)))
    parameters_minimum = parameters[parameters_minimum_index]
    print(list(parameters_minimum))