# Predictability of (Shannon) Entropy

In the data preprocessing, we computed the entropy of the distribution of each individual streamflow time series in bits per sample.  We'll now use an ensemble decision tree method called XGBoost (eXtreme Gradient Boosted decision tree) {cite}`chen2016xgboost` to see if the entropy (or uncertainty) of a distribution can be predicted from catchment attributes.  The dictionary size (number of quantization levels) is varied to test if the additional information in the distribution can be exploited by the model.  The model input features are added in successive model tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.  

In [1]:
import os
import pandas as pd
import numpy as np

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

from scipy.stats import linregress
output_notebook()

In [2]:
# load the catchment characteristics
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
df = pd.read_csv(os.path.join('data', fname))

Subdivide the attributes into related classes: terrain, land cover, soil, climate.

In [3]:
print(df.columns.tolist())

['official_id', 'watershed_id', 'name', 'centroid_lat_deg_n', 'centroid_lon_deg_e', 'drainage_area_km2', 'drainage_area_gsim_km2', 'flag_gsim_boundaries', 'flag_artificial_boundaries', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'flag_shape_extraction', 'aspect_deg', 'flag_terrain_extraction', 'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010', 'flag_land_use_extraction', 'logk_ice_x100', 'porosity_x100', 'flag_subsoil_extraction', 'year_from', 'year_to', 'record_length', 'agency', 'status', 'updated_official_basin', 'in_bcub', 'prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration', 'H_4_bits', 'H_6_bits', 'H_8_bits']


In [4]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
soil = ['logk_ice_x100', 'porosity_x100']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
all_attributes = terrain + land_cover + soil + climate

In [5]:
def train_test_split(df, holdout_pct):
    """
    Split the input data into training and test sets.  
    The proportion of test data is holdout_pct.
    Return the data as arrays.
    """
    n_holdout = int(holdout_pct * len(df))
    test_idxs = np.random.choice(df.index.values, n_holdout, replace=False)
    train_idxs = [i for i in df.index.values if i not in test_idxs]
    
    common_idxs = np.intersect1d(train_idxs, test_idxs)
    assert len(common_idxs) == 0
    
    return train_idxs, test_idxs

In [17]:
def run_trials(set_name, features, target, input_data, train_indices, test_indices, n_optimization_rounds, nfolds, num_boost_rounds):
    # randomly select 5% of the stations to leave out for a hold-out test set
    # to ensure none of the data are seen in training
    
    X_train, Y_train = input_data.loc[train_indices, features].values, input_data.loc[train_indices, target].values
    X_test, Y_test = input_data.loc[test_indices, features].values, input_data.loc[test_indices, target].values
    
    all_results = []
    for trial in range(n_optimization_rounds):
        
        lr, ss, cs = learning_rates[trial], subsamples[trial], colsamples[trial]

        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "eta": lr,
            # "max_depth": 6,  # use default max_depth
            # "min_child_weight": 1, # use colsample and subsample instead of min_child_weight
            "subsample": ss,
            "colsample_bytree": cs,
            "seed": 42,
            "device": "cuda",  # note, change this to 'cpu' if your system doesn't have a CUDA GPU
            "sampling_method": 'gradient_based',
            "tree_method": 'hist',
        }
        
        results_folder = os.path.join('./data/', 'entropy_results')
        if not os.path.exists(results_folder):
            os.makedirs(results_folder)

        results_fname = f'{set_name}_H_{bitrate}_bits_{lr:.3f}_lr_{ss:.3f}_sub_{cs:.3f}_col.csv'
        results_fpath = os.path.join(results_folder, results_fname)

        model_results = xgb.cv(
            params=params,
            dtrain=xgb.DMatrix(X_train, label=Y_train),
            num_boost_round=num_boost_rounds,
            nfold=nfolds,
            metrics=['mae', 'rmse'],
            early_stopping_rounds=20,
            verbose_eval=False,
        )
        best_rmse_round = model_results['test-rmse-mean'].idxmin()
        best_mae_round = model_results['test-mae-mean'].idxmin()
        # print(lr, best_rmse_round, best_mae_round)
                
        results_dict = {
            'best_rmse_round': best_rmse_round,
            'best_mae_round': best_mae_round,
            'min_test_mae': model_results.loc[best_mae_round, 'test-mae-mean'],
            'min_test_rmse': model_results.loc[best_rmse_round, 'test-rmse-mean'],
            'min_mae_stdev': model_results.loc[best_mae_round, 'test-mae-std'],
            'min_rmse_stdev': model_results.loc[best_rmse_round, 'test-rmse-std'],
            'min_train_mae': model_results.loc[best_mae_round, 'train-mae-mean'],
            'min_train_rmse': model_results.loc[best_rmse_round, 'train-rmse-mean'],
        }
        results_cols = list(results_dict.keys())
        results_dict.update(params)
        
        all_results.append(results_dict)
        
    # save the trial results
    trial_results = pd.DataFrame(all_results)
    trial_results.to_csv(results_fpath)
    trial_mean = trial_results['min_test_rmse'].mean()
    trial_stdev = trial_results['min_rmse_stdev'].mean()
    
    print(f'    {trial_mean:.2f} mean RMSE, {trial_stdev:.3f} mean stdev RMSE ({len(trial_results)} trials)')
    
    param_cols = list(params.keys())

    # get the optimal hyperparameters
    optimal_rmse_idx = trial_results['min_test_rmse'].idxmin()
    optimal_mae_idx = trial_results['min_test_mae'].idxmin()
    
    # print(optimal_rmse_idx)
    # print(trial_results.sort_values('min_test_rmse'))
    
    print(trial_results.loc[[optimal_rmse_idx]])
    print(trial_results.sort_values('min_test_rmse'))
    
    if optimal_rmse_idx != optimal_mae_idx:
        print('best rmse and mae are from different trials, ', optimal_rmse_idx, optimal_mae_idx)
        
    best_rmse_params = trial_results.loc[optimal_rmse_idx, param_cols]
    best_mae_params = trial_results.loc[optimal_mae_idx, param_cols]
    
    final_model = xgb.XGBRegressor(n_estimators=2*num_boost_rounds, 
                                   early_stopping_rounds=20,
                                   **best_rmse_params.to_dict())
    final_model.fit(X_train, Y_train)
    print(final_model)
    print(asdf)
    predicted_y = final_model.predict(X_test)
    
    test_results = pd.DataFrame(
        {
            'predicted': predicted_y,
            'actual': Y_test,
        })
    
    return trial_results, test_results

In [18]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 5000
n_optimization_rounds = 50

all_test_results = {}
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']


In [19]:
execute_models = True
if execute_models:
    df.reset_index(drop=True, inplace=True)
    train_indices, test_indices = train_test_split(df, holdout_pct)
    
    for bitrate in [4, 6, 8]:
        all_test_results[bitrate] = {}
        print(f'bitrate = {bitrate}')
        # set the target column
        target_column = f'H_{bitrate}_bits'
        test_attributes = []

        # add attribute groups successively
        for attribute_set, set_name in zip([climate, land_cover, terrain, soil], attribute_set_names):

            test_attributes += attribute_set
            input_data = df[test_attributes + [target_column]].copy()

            # reset the index to ensure the random selection is done properly            

            sample_choices =  np.arange(0.5, 0.9, 0.02) 
            lr_choices = np.arange(0.001, 0.1, 0.0005) 
            learning_rates = np.random.choice(lr_choices, n_optimization_rounds) 
            subsamples = np.random.choice(sample_choices, n_optimization_rounds) 
            colsamples = np.random.choice(sample_choices, n_optimization_rounds) 

            trial_df, test_df = run_trials(set_name, test_attributes, target_column, input_data, train_indices, test_indices, n_optimization_rounds, nfolds, n_boost_rounds)
            
            test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
            test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

            print(f'  {set_name}')
            print(f'        rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
            print('')
            # store the test set predictions and actuals
            all_test_results[bitrate][set_name] = {
                'trials': trial_df, 'test_df': test_df,
                'test_mae': test_mae, 'test_rmse': test_rmse} 

bitrate = 4
    1.59 mean RMSE, 0.044 mean stdev RMSE (50 trials)
    best_rmse_round  best_mae_round  min_test_mae  min_test_rmse  \
18               28              28      1.325195        1.58104   

    min_mae_stdev  min_rmse_stdev  min_train_mae  min_train_rmse  \
18       0.041316        0.050512       1.078275        1.275563   

           objective eval_metric     eta  subsample  colsample_bytree  seed  \
18  reg:squarederror        rmse  0.0525       0.72              0.78    42   

   device sampling_method tree_method  
18   cuda  gradient_based        hist  
    best_rmse_round  best_mae_round  min_test_mae  min_test_rmse  \
18               28              28      1.325195       1.581040   
27               64              64      1.322785       1.583075   
0               840             840      1.321726       1.584535   
41              103             103      1.322578       1.584575   
28              112             112      1.329723       1.584657   
14           

ValueError: Must have at least 1 validation dataset for early stopping.

In [9]:
plots = []
for b, set_dict in all_test_results.items():
    test_rmse, test_mae = [], []
    attribute_sets = list(set_dict.keys())

    y1 = [set_dict[e]['test_rmse'] for e in attribute_sets]
    y2 = [set_dict[e]['test_mae'] for e in attribute_sets]
    
    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})
    
    title = f'{b} bits'
    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'RMSE'
    
    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = set_dict[best_rmse_set]['test_df']
    
    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)
    
    sfig = figure(title=f'Test: {b} bits best model {best_mae_set} (N={len(best_result})')
    sfig.scatter(xx, yy, size=3, alpha=0.8)
    xpred = np.linspace(min(xx), max(xx), 100)
    ybf = [slope * e + intercept for e in xpred]
    sfig.line(xpred, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R² ={r**2:.2f}')    
    sfig.xaxis.axis_label = 'Actual H [bits/sample]'
    sfig.yaxis.axis_label = 'Predicted H [bits/sample]'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)

In [10]:
layout = gridplot(plots, ncols=2, width=350, height=300)
show(layout)

## Citations

```{bibliography}
:filter: docname in docnames
```