# Predictability of Mean Runoff

In the data preprocessing, we computed the entropy of the distribution of each individual streamflow time series in bits per sample.  We'll now use an ensemble decision tree method called gradient boosting using the XGBoost (eXtreme Gradient Boosted decision tree) {cite}`chen2016xgboost` library to see if runoff can be predicted from catchment attributes as was shown in {cite}`addor2018ranking`.  The model input features are added in successive model tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.  

In [1]:
import os
import pandas as pd
import numpy as np

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

import data_processing_functions as dpf

from scipy.stats import linregress
output_notebook()

In [2]:
# load the catchment characteristics
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
df = pd.read_csv(os.path.join('data', fname))

Compute the mean runoff for each streamflow timeseries.

In [3]:
for i, row in df.iterrows():
    mean_runoff = dpf.compute_mean_runoff(row)
    df.loc[i, 'mean_runoff'] = mean_runoff

Subdivide the attributes into related classes: terrain, land cover, soil, climate.

In [4]:
print(df.columns.tolist())

['official_id', 'watershed_id', 'name', 'centroid_lat_deg_n', 'centroid_lon_deg_e', 'drainage_area_km2', 'drainage_area_gsim_km2', 'flag_gsim_boundaries', 'flag_artificial_boundaries', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'flag_shape_extraction', 'aspect_deg', 'flag_terrain_extraction', 'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010', 'flag_land_use_extraction', 'logk_ice_x100', 'porosity_x100', 'flag_subsoil_extraction', 'year_from', 'year_to', 'record_length', 'agency', 'status', 'updated_official_basin', 'in_bcub', 'prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration', 'H_4_bits', 'H_6_bits', 'H_8_bits', 'H_9_bits', 'H_10_bits', 'H_11_bits', 'H_12_bits', 'mean_runoff']


In [5]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
soil = ['logk_ice_x100', 'porosity_x100']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
all_attributes = terrain + land_cover + soil + climate

In [6]:
len(all_attributes)

26

In [8]:
def run_trials(set_name, features, target, input_data, train_indices, test_indices, n_optimization_rounds, nfolds, num_boost_rounds):
    
    # randomly select 5% of the stations to leave out for a hold-out test set
    # to ensure none of the data are seen in training    
    X_train, Y_train = input_data.loc[train_indices, features].values, input_data.loc[train_indices, target].values
    X_test, Y_test = input_data.loc[test_indices, features].values, input_data.loc[test_indices, target].values
    
    sample_choices =  np.arange(0.5, 0.9, 0.02) 
    lr_choices = np.arange(0.001, 0.1, 0.0005) 
    learning_rates = np.random.choice(lr_choices, n_optimization_rounds) 
    subsamples = np.random.choice(sample_choices, n_optimization_rounds) 
    colsamples = np.random.choice(sample_choices, n_optimization_rounds) 
       
    all_results = []
    for trial in range(n_optimization_rounds):
        
        lr, ss, cs = learning_rates[trial], subsamples[trial], colsamples[trial]

        params = {
            # "objective": "reg:absoluteerror",
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "eta": lr,
            # "max_depth": 6,  # use default max_depth
            # "min_child_weight": 1, # use colsample and subsample instead of min_child_weight
            "subsample": ss,
            "colsample_bytree": cs,
            "seed": 42,
            "device": "cuda",  # note, change this to 'cpu' if your system doesn't have a CUDA GPU
            "sampling_method": 'gradient_based',
            "tree_method": 'hist',
        }
        
        results_folder = os.path.join('./data/', 'runoff_prediction_results')
        if not os.path.exists(results_folder):
            os.makedirs(results_folder)

        results_fname = f'{set_name}_{lr:.3f}_lr_{ss:.3f}_sub_{cs:.3f}_col.csv'
        results_fpath = os.path.join(results_folder, results_fname)

        model_results = xgb.cv(
            params=params,
            dtrain=xgb.DMatrix(X_train, label=Y_train),
            num_boost_round=num_boost_rounds,
            nfold=nfolds,
            metrics=['mae', 'rmse'],
            early_stopping_rounds=20,
            verbose_eval=False,
        )
        best_rmse_round = model_results['test-rmse-mean'].idxmin()
        best_mae_round = model_results['test-mae-mean'].idxmin()
        # print(lr, best_rmse_round, best_mae_round)
                
        results_dict = {
            'best_rmse_round': best_rmse_round,
            'best_mae_round': best_mae_round,
            'min_test_mae': model_results.loc[best_mae_round, 'test-mae-mean'],
            'min_test_rmse': model_results.loc[best_rmse_round, 'test-rmse-mean'],
            'min_mae_stdev': model_results.loc[best_mae_round, 'test-mae-std'],
            'min_rmse_stdev': model_results.loc[best_rmse_round, 'test-rmse-std'],
            'min_train_mae': model_results.loc[best_mae_round, 'train-mae-mean'],
            'min_train_rmse': model_results.loc[best_rmse_round, 'train-rmse-mean'],
        }
        results_cols = list(results_dict.keys())
        results_dict.update(params)
        
        all_results.append(results_dict)
        if (trial > 0) & (trial % 20 == 0):
            print(f'   completed {trial}/{n_optimization_rounds}')
        
    # save the trial results
    trial_results = pd.DataFrame(all_results)
    trial_results.to_csv(results_fpath)
    trial_mean = trial_results['min_test_mae'].mean()
    trial_stdev = trial_results['min_mae_stdev'].mean()
    
    # print(trial_results.sort_values('min_test_mae'))
    
    print(f'    {trial_mean:.2f} ± {trial_stdev:.3f} RMSE mean on the test set (N={len(trial_results)})')
    
    param_cols = list(params.keys())

    # get the optimal hyperparameters
    optimal_rmse_idx = trial_results['min_test_rmse'].idxmin()
    optimal_mae_idx = trial_results['min_test_mae'].idxmin()
    
    # print(optimal_rmse_idx)
    # print(trial_results.sort_values('min_test_rmse'))
    
    # print(trial_results.loc[[optimal_rmse_idx]])
    # print(trial_results.sort_values('min_test_rmse'))
    
    # if optimal_rmse_idx != optimal_mae_idx:
    #     print('best rmse and mae are from different trials, ', optimal_rmse_idx, optimal_mae_idx)
        
    best_rmse_params = trial_results.loc[optimal_rmse_idx, param_cols]
    best_mae_params = trial_results.loc[optimal_mae_idx, param_cols]
    
    final_model = xgb.XGBRegressor(n_estimators=2*num_boost_rounds, 
                                   **best_rmse_params.to_dict())
    final_model.fit(X_train, Y_train)

    predicted_y = final_model.predict(X_test)
    
    test_results = pd.DataFrame(
        {
            'predicted': predicted_y,
            'actual': Y_test,
        })
    
    return trial_results, test_results

In [9]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 2000
n_optimization_rounds = 20

all_test_results = {}
attribute_set_dict = {
    'climate': climate, 
    '+land_cover': land_cover,
    '+terrain': terrain, 
    '+soil': soil,
}


In [10]:

def predict_runoff_from_attributes(df, holdout_pct, group_order):
    df.reset_index(drop=True, inplace=True)
    train_indices, test_indices = dpf.train_test_split(df, holdout_pct)
        
    # set the target column
    target_column = f'mean_runoff'
    test_attributes = []

    # add attribute groups successively
    for set_name in group_order:
        attribute_set = attribute_set_dict[set_name]
        print(f' Processing {set_name} attribute set')
        test_attributes += attribute_set
        input_data = df[test_attributes + [target_column]].copy()

        # reset the index to ensure the random selection is done properly
        trial_df, test_df = run_trials(set_name, test_attributes, target_column, input_data, train_indices, test_indices, n_optimization_rounds, nfolds, n_boost_rounds)

        test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
        test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

        print(f'  {set_name}')
        print(f'   held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
        print('')
        # store the test set predictions and actuals
        all_test_results[set_name] = {
            'trials': trial_df, 'test_df': test_df,
            'test_mae': test_mae, 'test_rmse': test_rmse,
        } 
    return all_test_results

In [11]:
def create_results_plots(all_test_results, attribute_sets):
    
    plots = []

    test_rmse, test_mae = [], []

    y1 = [all_test_results[e]['test_rmse'] for e in attribute_sets]
    y2 = [all_test_results[e]['test_mae'] for e in attribute_sets]

    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})

    title = f'Runoff Predictability'

    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'RMSE'

    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = all_test_results[best_rmse_set]['test_df']

    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)

    sfig = figure(title=f'Test: best model {best_rmse_set} (N={len(best_result)})',
                 )
    sfig.scatter(xx, yy, size=3, alpha=0.8)
    x_obs = np.linspace(min(xx), max(xx), 1000)
    ybf = [slope * e + intercept for e in x_obs]
    sfig.line(x_obs, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')    
    sfig.xaxis.axis_label = r'$$\text{Observed Mean} \left[ m^3 / s \right]$$'
    sfig.yaxis.axis_label = r'$$\text{Predicted Mean} \left[ m^3 / s \right]$$'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)
    return plots

In [12]:
group_order_1 = ['climate', '+terrain', '+land_cover', '+soil']
group_order_2 = group_order_1[::-1]
group_order_3 = ['+land_cover', '+terrain', '+soil', 'climate']
group_order_4 = ['+soil', 'climate', '+land_cover', '+terrain']

In [13]:
grp_1_results = predict_runoff_from_attributes(df, holdout_pct, group_order_1)

 Processing climate attribute set
    50.86 ± 9.511 RMSE mean on the test set (N=20)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




  climate
   held-out test rmse: 94.55, mae: 43.30

 Processing +terrain attribute set
    21.60 ± 7.149 RMSE mean on the test set (N=20)
  +terrain
   held-out test rmse: 49.34, mae: 9.42

 Processing +land_cover attribute set
    18.87 ± 7.182 RMSE mean on the test set (N=20)
  +land_cover
   held-out test rmse: 44.49, mae: 8.82

 Processing +soil attribute set
    22.48 ± 7.133 RMSE mean on the test set (N=20)
  +soil
   held-out test rmse: 45.87, mae: 9.55



In [14]:
grp_1_plots = create_results_plots(grp_1_results, group_order_1)
layout = gridplot(grp_1_plots, ncols=2, width=350, height=300)
show(layout)

In [15]:
grp_2_results = predict_runoff_from_attributes(df, holdout_pct, group_order_2)

 Processing +soil attribute set
    49.72 ± 4.015 RMSE mean on the test set (N=20)
  +soil
   held-out test rmse: 141.31, mae: 55.00

 Processing +land_cover attribute set
    31.52 ± 3.962 RMSE mean on the test set (N=20)
  +land_cover
   held-out test rmse: 97.09, mae: 34.34

 Processing +terrain attribute set
    16.20 ± 1.874 RMSE mean on the test set (N=20)
  +terrain
   held-out test rmse: 67.45, mae: 19.77

 Processing climate attribute set
    14.76 ± 2.582 RMSE mean on the test set (N=20)
  climate
   held-out test rmse: 62.32, mae: 15.82



In [16]:
grp_2_plots = create_results_plots(grp_2_results, group_order_2)
layout = gridplot(grp_2_plots, ncols=2, width=350, height=300)
show(layout)

In [17]:
grp_3_results = predict_runoff_from_attributes(df, holdout_pct, group_order_3)

 Processing +land_cover attribute set
    31.31 ± 4.011 RMSE mean on the test set (N=20)
  +land_cover
   held-out test rmse: 95.53, mae: 34.30

 Processing +terrain attribute set
    15.82 ± 1.890 RMSE mean on the test set (N=20)
  +terrain
   held-out test rmse: 60.75, mae: 17.97

 Processing +soil attribute set
    15.63 ± 2.194 RMSE mean on the test set (N=20)
  +soil
   held-out test rmse: 62.10, mae: 17.96

 Processing climate attribute set
    14.46 ± 2.925 RMSE mean on the test set (N=20)
  climate
   held-out test rmse: 63.59, mae: 16.30



In [18]:
grp_3_plots = create_results_plots(grp_3_results, group_order_3)
layout = gridplot(grp_3_plots, ncols=2, width=350, height=300)
show(layout)

In [19]:
grp_4_results = predict_runoff_from_attributes(df, holdout_pct, group_order_4)

 Processing +soil attribute set
    49.72 ± 4.015 RMSE mean on the test set (N=20)
  +soil
   held-out test rmse: 141.31, mae: 55.00

 Processing climate attribute set
    43.92 ± 6.267 RMSE mean on the test set (N=20)
  climate
   held-out test rmse: 138.78, mae: 53.03

 Processing +land_cover attribute set
    29.91 ± 3.635 RMSE mean on the test set (N=20)
  +land_cover
   held-out test rmse: 80.82, mae: 26.59

 Processing +terrain attribute set
    14.79 ± 2.820 RMSE mean on the test set (N=20)
  +terrain
   held-out test rmse: 63.37, mae: 16.10



In [20]:
grp_4_plots = create_results_plots(grp_4_results, group_order_4)
layout = gridplot(grp_4_plots, ncols=2, width=350, height=300)
show(layout)

As a last iteration, randomize the order of the mean_runoff column to test what the algorithm is learning.

The predictive power decreases substantially across all groupings of input attributes.

In [33]:
shuffled_df = df.copy()
runoff_values = df['mean_runoff'].values
# randomly shuffle the order of runoff values
np.random.shuffle(runoff_values)
shuffled_df['mean_runoff'] = runoff_values
shuffled_results = predict_runoff_from_attributes(shuffled_df, holdout_pct, group_order_1)

 Processing climate attribute set
    51.61 ± 1.719 RMSE mean on the test set (N=20)
  climate
   held-out test rmse: 268.28, mae: 71.14

 Processing +terrain attribute set
    51.59 ± 1.672 RMSE mean on the test set (N=20)
  +terrain
   held-out test rmse: 289.04, mae: 88.24

 Processing +land_cover attribute set
    51.76 ± 1.465 RMSE mean on the test set (N=20)
  +land_cover
   held-out test rmse: 297.31, mae: 83.51

 Processing +soil attribute set
    51.70 ± 1.522 RMSE mean on the test set (N=20)
  +soil
   held-out test rmse: 293.72, mae: 82.98



In [34]:
shuffled_runoff_plots = create_results_plots(shuffled_results, group_order_1)
layout = gridplot(shuffled_runoff_plots, ncols=2, width=350, height=300)
show(layout)

## Citations

```{bibliography}
:filter: docname in docnames
```