# Predictability of (Shannon) Entropy

## Introduction

In the data preprocessing, we computed the entropy of the distribution of each individual streamflow time series in bits per sample.  We'll now use an ensemble decision tree method called XGBoost (eXtreme Gradient Boosted decision tree) {cite}`chen2016xgboost` to see if the entropy (or uncertainty) of a distribution can be predicted from catchment attributes.  The dictionary size (number of quantization levels) is varied to test if the additional information in the distribution can be exploited by the model.  The model input features are added in successive model tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.  

In [1]:
import os
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

from scipy.stats import linregress

import data_processing_functions as dpf

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7


output_notebook()

## Load Input Data

In [None]:
# load the catchment characteristics
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
df = pd.read_csv(os.path.join('data', fname))

Subdivide the attributes into related classes: terrain, land cover, soil, climate.

In [None]:
print(df.columns.tolist())

## Define Attribute Groups

In [4]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
soil = ['logk_ice_x100', 'porosity_x100']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
all_attributes = terrain + land_cover + soil + climate
len(all_attributes)

In [6]:
results_folder = os.path.join('./data/', 'entropy_prediction_results')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [21]:
def run_trials(bitrate, set_name, features, target, input_data, train_indices, test_indices, n_optimization_rounds, nfolds, num_boost_rounds, results_folder):
   
    X_train, Y_train = input_data.loc[train_indices, features].values, input_data.loc[train_indices, target].values
    X_test, Y_test = input_data.loc[test_indices, features].values, input_data.loc[test_indices, target].values
    
    # reset the index to ensure the random selection is done properly
    sample_choices =  np.arange(0.5, 0.9, 0.02) 
    lr_choices = np.arange(0.001, 0.1, 0.0005) 
    learning_rates = np.random.choice(lr_choices, n_optimization_rounds) 
    subsamples = np.random.choice(sample_choices, n_optimization_rounds) 
    colsamples = np.random.choice(sample_choices, n_optimization_rounds) 
       
    all_results = []
    for trial in range(n_optimization_rounds):
        
        lr, ss, cs = learning_rates[trial], subsamples[trial], colsamples[trial]

        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "eta": lr,
            # "max_depth": 6,  # use default max_depth
            # "min_child_weight": 1, # use colsample and subsample instead of min_child_weight
            "subsample": ss,
            "colsample_bytree": cs,
            "seed": 42,
            "device": "cuda",  # note, change this to 'cpu' if your system doesn't have a CUDA GPU
            "sampling_method": 'gradient_based',
            "tree_method": 'hist',
        }

        results_fname = f'{set_name}_H_{bitrate}_bits_{lr:.3f}_lr_{ss:.3f}_sub_{cs:.3f}_col.csv'
        results_fpath = os.path.join(results_folder, results_fname)

        model_results = xgb.cv(
            params=params,
            dtrain=xgb.DMatrix(X_train, label=Y_train),
            num_boost_round=num_boost_rounds,
            nfold=nfolds,
            metrics=['mae', 'rmse'],
            early_stopping_rounds=20,
            verbose_eval=0,
        )
        best_rmse_round = model_results['test-rmse-mean'].idxmin()
        best_mae_round = model_results['test-mae-mean'].idxmin()
                
        results_dict = {
            'best_rmse_round': best_rmse_round,
            'best_mae_round': best_mae_round,
            'min_test_mae': model_results.loc[best_mae_round, 'test-mae-mean'],
            'min_test_rmse': model_results.loc[best_rmse_round, 'test-rmse-mean'],
            'min_mae_stdev': model_results.loc[best_mae_round, 'test-mae-std'],
            'min_rmse_stdev': model_results.loc[best_rmse_round, 'test-rmse-std'],
            'min_train_mae': model_results.loc[best_mae_round, 'train-mae-mean'],
            'min_train_rmse': model_results.loc[best_rmse_round, 'train-rmse-mean'],
        }
        
        results_cols = list(results_dict.keys())
        results_dict.update(params)
        
        all_results.append(results_dict)
        if (trial > 0) & (trial % 2 == 0):
            temp_results = pd.DataFrame(all_results)
            temp_mean = temp_results['min_test_rmse'].mean()
            temp_stdev = temp_results['min_rmse_stdev'].mean()    
            if temp_stdev / temp_mean < 0.025:
                print(f'   Mean (test) RMSE = {temp_mean:.2f} ± {temp_stdev:.3f} ({len(temp_results)} trials)')
                print(f'   completed {trial}/{n_optimization_rounds}')
                print('Standard deviation of optimization results is < 2.5% of mean, stopping early.')
                break
            if trial % 10 == 0:
                print(f'       completed {trial}/{n_optimization_rounds}')
        
    # save the trial results
    trial_results = pd.DataFrame(all_results)
    trial_results.to_csv(results_fpath)
    trial_mean = trial_results['min_test_rmse'].mean()
    trial_stdev = trial_results['min_rmse_stdev'].mean()
    
    print(f'    Mean (test) RMSE = {trial_mean:.2f} ± {trial_stdev:.3f} ({len(trial_results)} trials)')
    
    param_cols = list(params.keys())

    # get the optimal hyperparameters
    optimal_rmse_idx = trial_results['min_test_rmse'].idxmin()
    optimal_mae_idx = trial_results['min_test_mae'].idxmin()    
        
    best_rmse_params = trial_results.loc[optimal_rmse_idx, param_cols]
    best_mae_params = trial_results.loc[optimal_mae_idx, param_cols]
    
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)

    eval_list = [(dtrain, 'train'), (dtest, 'eval')]

    final_model = xgb.train(
        best_rmse_params.to_dict(), dtrain, 
        2*num_boost_rounds, evals=eval_list,
        verbose_eval=0, early_stopping_rounds=20
    )

    predicted_y = final_model.predict(dtest)
    
    test_results = pd.DataFrame(
        {
            'predicted': predicted_y,
            'actual': Y_test,
        })
    
    return trial_results, test_results

In [23]:
def predict_entropy_from_attributes(df, holdout_pct, results_folder):
    df.reset_index(drop=True, inplace=True)    
    # randomly select holdout_pct of the stations to leave out for a hold-out test set
    # to ensure none of the data are seen in training
    train_indices, test_indices = dpf.train_test_split(df, holdout_pct)
    all_test_results = {}
    for bitrate in [4, 6, 8, 9, 10, 11, 12]:
        all_test_results[bitrate] = {}
        print(f'bitrate = {bitrate}')
        # set the target column
        target_column = f'H_{bitrate}_bits'
        input_attributes = []

        # add attribute groups successively
        for attribute_set, set_name in zip([climate, land_cover, terrain, soil], attribute_set_names):
            print(f'  Processing {set_name} attribute set.')
            input_attributes += attribute_set
            input_data = df[input_attributes + [target_column]].copy()

            trial_df, test_df = run_trials(
                bitrate, set_name, input_attributes, target_column, 
                input_data, train_indices, test_indices, n_optimization_rounds, 
                nfolds, n_boost_rounds, results_folder
            )
            
            test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
            test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

            print(f'    Held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
            print('')
            # store the test set predictions and actuals
            all_test_results[bitrate][set_name] = {
                'trials': trial_df, 'test_df': test_df,
                'test_mae': test_mae, 'test_rmse': test_rmse} 
    return all_test_results

## Set Trial Parameters

In [22]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 5000
n_optimization_rounds = 50

all_test_results = {}
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']


## Run Models

In [None]:
test_results_fname = f'Entropy_prediction_results.npy'
test_results_fpath = os.path.join('data', test_results_fname)
if os.path.exists(test_results_fpath):
    all_test_results = np.load(test_results_fpath, allow_pickle=True).item()
else:
    all_test_results = predict_entropy_from_attributes(df, holdout_pct, results_folder)
    np.save(test_results_fpath, all_test_results)

bitrate = 4
  Processing climate attribute set.
   completed 10/50
   completed 20/50
   completed 30/50
   completed 40/50
    Mean (test) RMSE = 0.67 ± 0.022 (50 trials)
   held-out test rmse: 0.66, mae: 0.53

  Processing +land_cover attribute set.
   completed 10/50
   completed 20/50
   completed 30/50
   completed 40/50
    Mean (test) RMSE = 0.66 ± 0.020 (50 trials)
   held-out test rmse: 0.65, mae: 0.53

  Processing +terrain attribute set.
   completed 10/50
   completed 20/50
   completed 30/50
   completed 40/50
    Mean (test) RMSE = 0.66 ± 0.019 (50 trials)
   held-out test rmse: 0.65, mae: 0.53

  Processing +soil attribute set.
   Mean (test) RMSE = 0.66 ± 0.016 (3 trials)
   completed 2/50
Standard deviation of optimization results is < 2.5% of mean, stopping early.
    Mean (test) RMSE = 0.66 ± 0.016 (3 trials)
   held-out test rmse: 0.66, mae: 0.53

bitrate = 6
  Processing climate attribute set.
   completed 10/50
   completed 20/50
   completed 30/50
   completed 40

## View Results

In [None]:
plots = []
for b, set_dict in all_test_results.items():
    test_rmse, test_mae = [], []
    attribute_sets = list(set_dict.keys())

    y1 = [set_dict[e]['test_rmse'] for e in attribute_sets]
    y2 = [set_dict[e]['test_mae'] for e in attribute_sets]
    
    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})
    
    title = f'{b} bits'
    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'RMSE'
    
    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = set_dict[best_rmse_set]['test_df']
    
    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)
    
    sfig = figure(title=f'Test: {b} bits best model {best_rmse_set} (N={len(best_result)})')
    sfig.scatter(xx, yy, size=3, alpha=0.8)
    xpred = np.linspace(min(xx), max(xx), 100)
    ybf = [slope * e + intercept for e in xpred]
    sfig.line(xpred, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')    
    sfig.xaxis.axis_label = 'Actual H [bits/sample]'
    sfig.yaxis.axis_label = 'Predicted H [bits/sample]'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)

In [None]:
layout = gridplot(plots, ncols=2, width=350, height=300)
show(layout)

## Citations

```{bibliography}
:filter: docname in docnames
```