# Predict Mean Runoff

## Introduction

In the data preprocessing, we computed the entropy of the distribution of each individual streamflow time series in bits per sample.  We'll now use an ensemble decision tree method called gradient boosting using the XGBoost (eXtreme Gradient Boosted decision tree) {cite}`chen2016xgboost` library to see if runoff can be predicted from catchment attributes as was shown in {cite}`addor2018ranking`.  The model input features are added in successive model tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.  

In [1]:
import os
import pandas as pd
import numpy as np

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

import data_processing_functions as dpf

from scipy.stats import linregress
output_notebook()

BASE_DIR = os.getcwd()

## Load Input Data

In [2]:
# load the catchment characteristics
attributes_filename = 'BCUB_watershed_attributes_updated.csv'
df = pd.read_csv(os.path.join('data', attributes_filename))
df.columns = [c.lower() for c in df.columns]
df.sort_values('official_id', inplace=True)

In [3]:
# load camels hydro attributes
cdf = pd.read_csv('data/camels/camels_hydro.txt', sep=';')
cdf['gauge_id'] = cdf['gauge_id'].astype(str)

Compute the mean runoff for each streamflow timeseries.

In [4]:
import re
def remove_leading_zeros(s):
    if re.match(r'^\d+$', s):  # Check if the string contains only digits
        return str(int(s))  # Convert to an integer and back to string to remove leading zeros
    return s  # If the string contains letters, return it unchanged


In [5]:
cm_means, hs_means = [], []
for i, row in df.iterrows():
    mean_runoff = dpf.compute_mean_runoff(row)
    df.loc[i, 'mean_runoff'] = mean_runoff
    # check the values against CAMELS
    camels_oid = remove_leading_zeros(row['official_id'])
    if camels_oid in cdf['gauge_id'].values:
        camels_q_mean = cdf.loc[cdf['gauge_id'] == camels_oid, 'q_mean'].values[0]
        # print(row['official_id'], round(mean_runoff, 2), round(camels_q_mean, 2))
        cm_means.append(camels_q_mean)
        hs_means.append(mean_runoff)

df.to_csv(os.path.join(BASE_DIR, 'data', attributes_filename), index=False)

In [6]:
# verify the conversion -- note some drainage areas are updated in the 
# (updated) HYSETS dataset so they may not match
cm_fig = figure(title='Camels vs. Hysets mean runoff', width=400, height=300)
cm_fig.scatter(cm_means, hs_means, size=3, alpha=0.6)
cm_fig.xaxis.axis_label = r'$$\text{Camels mean runoff } [mm/d]$$'
cm_fig.yaxis.axis_label = r'$$\text{HYSETS mean runoff } [mm/d]$$'
show(cm_fig)
    

Subdivide the attributes into related classes: terrain, land cover, soil, climate.

In [7]:
# list all the attributes in the input dataframe
print(df.columns.tolist())

['region', 'official_id', 'drainage_area_km2', 'centroid_lon_deg_e', 'centroid_lat_deg_n', 'logk_ice_x100', 'porosity_x100', 'land_use_forest_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_crops_frac_2010', 'land_use_urban_frac_2010', 'land_use_water_frac_2010', 'land_use_snow_ice_frac_2010', 'lulc_check_2010', 'land_use_forest_frac_2015', 'land_use_shrubs_frac_2015', 'land_use_grass_frac_2015', 'land_use_wetland_frac_2015', 'land_use_crops_frac_2015', 'land_use_urban_frac_2015', 'land_use_water_frac_2015', 'land_use_snow_ice_frac_2015', 'lulc_check_2015', 'land_use_forest_frac_2020', 'land_use_shrubs_frac_2020', 'land_use_grass_frac_2020', 'land_use_wetland_frac_2020', 'land_use_crops_frac_2020', 'land_use_urban_frac_2020', 'land_use_water_frac_2020', 'land_use_snow_ice_frac_2020', 'lulc_check_2020', 'slope_deg', 'aspect_deg', 'median_el', 'mean_el', 'max_el', 'min_el', 'elevation_m', 'prcp', 'tmin', 'tmax', 'vp', 'swe', 's

## Define attribute groups

In [8]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
soil = ['logk_ice_x100', 'porosity_x100']
all_attributes = terrain + land_cover + soil + climate
len(all_attributes)

24

In [9]:
assert len([c for c in all_attributes if c not in df.columns]) == 0

In [10]:
results_folder = os.path.join(BASE_DIR, 'data', 'runoff_prediction_results')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [11]:
def predict_runoff_from_attributes(df, train_indices, test_indices, group_order, results_folder):
        
    # set the target column
    target_column = f'mean_runoff'
    test_attributes = []

    # add attribute groups successively
    for set_name in group_order:
        attribute_set = attribute_set_dict[set_name]
        print(f' Processing {set_name} attribute set')
        test_attributes += attribute_set
        
        input_data = df[test_attributes + [target_column]].copy()

        # run the XGBoost model with cross validation and test on holdout set
        trial_df, test_df = dpf.run_xgb_CV_trials(
            set_name, test_attributes, target_column, input_data, train_indices, 
            test_indices, n_optimization_rounds, nfolds, n_boost_rounds, results_folder
        )

        test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
        test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

        print(f'  {set_name}')
        print(f'   held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
        print('')
        # store the test set predictions and actuals
        all_test_results[set_name] = {
            'trials': trial_df, 'test_df': test_df,
            'test_mae': test_mae, 'test_rmse': test_rmse,
        } 
    return all_test_results

In [12]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 5000
n_optimization_rounds = 20

all_test_results = {}
attribute_set_dict = {
    'climate': climate, 
    '+land_cover': land_cover,
    '+terrain': terrain, 
    '+soil': soil,
}

## Set Attribute Groupings

In [13]:
group_1 = ['climate', '+terrain', '+land_cover', '+soil']
# for group 2, just reverse group 1
group_2 = group_1[::-1]
group_3 = ['+land_cover', '+terrain', '+soil', 'climate']
group_4 = ['+soil', 'climate', '+land_cover', '+terrain']
attribute_group_orders = [group_1, group_2, group_3, group_4]

## Run XGBoost Models

Separate the test set at the outset so the attribute group ordering is tested on the same hold-out set but necessarily on unique training optimizations.  This ensures that at least the presence of outliers in the hold-out set should at least be constant across the attribute group reordering.

In [14]:
# reset the index to ensure the split is done correctly
df.reset_index(drop=True, inplace=True)
train_indices, test_indices = dpf.train_test_split(df, holdout_pct)

In [15]:
n = 0
group_results = {}
for group in attribute_group_orders:
    print(f'Processing: {group} ordering.')
    n += 1
    test_results_fname = f'Mean_runoff_prediction_results_{n}.npy'
    test_results_fpath = os.path.join(results_folder, test_results_fname)
    if os.path.exists(test_results_fpath):
        all_test_results = np.load(test_results_fpath, allow_pickle=True).item()
    else:
        all_test_results = predict_runoff_from_attributes(df, train_indices, test_indices, group, results_folder)
        np.save(test_results_fpath, all_test_results)
        
    group_results[n] = {'order': group, 'results': all_test_results}
    

Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering.
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering.
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering.
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering.


## View Results

In [16]:
def create_results_plots(results_df, attribute_sets):
    
    plots = []

    test_rmse, test_mae = [], []

    y1 = [results_df[e]['test_rmse'] for e in attribute_sets]
    y2 = [results_df[e]['test_mae'] for e in attribute_sets]

    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})

    title = f'Mean Runoff Prediction'

    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'Error'

    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = results_df[best_rmse_set]['test_df']

    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)

    sfig = figure(title=f'Test: best model {best_rmse_set} (N={len(best_result)})',
                 )
    sfig.scatter(xx, yy, size=3, alpha=0.8)
    x_obs = np.linspace(min(xx), max(xx), 1000)
    ybf = [slope * e + intercept for e in x_obs]
    sfig.line(x_obs, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')    
    sfig.xaxis.axis_label = r'$$\text{Observed Mean} \left[ mm / day \right]$$'
    sfig.yaxis.axis_label = r'$$\text{Predicted Mean} \left[ mm / day \right]$$'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)
    
    # plot a 1:1 line
    sfig.line([0, max(ybf)], [0, max(ybf)], color='black', line_dash='dotted', 
              line_width=2, legend_label='1:1')
    
    return plots

In the sequence of plots below, we change the order that groups of attributes are added to training.  Testing is done on the same held-out set across all 4 ordered tests plus one test where the 

In [17]:
n = 1
grp_1_plots = create_results_plots(group_results[n]['results'], group_results[n]['order'])
layout = gridplot(grp_1_plots, ncols=2, width=350, height=300)
show(layout)

### Test the sensitivity to Order of attribute groups

In [18]:
n = 2
grp_2_plots = create_results_plots(group_results[n]['results'], group_results[n]['order'])
layout = gridplot(grp_2_plots, ncols=2, width=350, height=300)
show(layout)

In [19]:
n = 3
grp_3_plots = create_results_plots(group_results[n]['results'], group_results[n]['order'])
layout = gridplot(grp_3_plots, ncols=2, width=350, height=300)
show(layout)

In [20]:
n = 4
grp_4_plots = create_results_plots(group_results[n]['results'], group_results[n]['order'])
layout = gridplot(grp_4_plots, ncols=2, width=350, height=300)
show(layout)

### Test randomly permuted target values

As a last iteration, randomize the order of the mean_runoff column to test what the algorithm is learning.

The predictive power decreases substantially across all groupings of input attributes.

In [22]:
test_results_fname = f'Mean_runoff_prediction_results_shuffled.npy'
test_results_fpath = os.path.join(results_folder, test_results_fname)

if os.path.exists(test_results_fpath):
    shuffled_test_results = np.load(test_results_fpath, allow_pickle=True).item()
else:
    shuffled_df = df.copy()
    for attr in all_attributes:
        # randomly shuffle the order of attribute values
        attr_values = df[attr].values
        np.random.shuffle(attr_values)
        shuffled_df[attr] = attr_values
    
    shuffled_test_results = predict_runoff_from_attributes(shuffled_df, train_indices, test_indices, group_1, results_folder)
    np.save(test_results_fpath, shuffled_test_results)

group_results['shuffled'] = {'order': group_1, 'results': shuffled_test_results}

 Processing climate attribute set
    2.41 ± 0.230 mean RMSE (of 20 hyperparameter optimization rounds.)
  climate
   held-out test rmse: 3.10, mae: 2.40

 Processing +terrain attribute set
    2.41 ± 0.231 mean RMSE (of 20 hyperparameter optimization rounds.)
  +terrain
   held-out test rmse: 3.15, mae: 2.46

 Processing +land_cover attribute set
    2.40 ± 0.235 mean RMSE (of 20 hyperparameter optimization rounds.)
  +land_cover
   held-out test rmse: 3.14, mae: 2.41

 Processing +soil attribute set
    2.40 ± 0.235 mean RMSE (of 20 hyperparameter optimization rounds.)
  +soil
   held-out test rmse: 3.12, mae: 2.42



### View results of shuffled target variable (mean runoff)

In [23]:
shuffled_results = group_results['shuffled']['results']
group_order = group_results['shuffled']['order']
shuffled_runoff_plots = create_results_plots(shuffled_results, group_order)
layout = gridplot(shuffled_runoff_plots, ncols=2, width=350, height=300)
show(layout)

## Discussion

- Reordering the attribute groupings suggests there are interactions between attributes in model training.  
- Across all orderings, the climate and terrain attributes appear to be the best predictors,
- Soil attributes contribute little or no explanatory power to the model.
- Randomly permuting the order of the target variable, `mean_runoff` erases all predictive power.


## Citations

```{bibliography}
:filter: docname in docnames
```