# Predictability of Kullback-Leibler (KL) Divergence

## Introduction

In the first steps, we looked at the predictability of a common hydrological signature and an uncommon information measure, both computed on individual distributions.  If the entropy measure is reasonably predictable from attributes, then the uncertainty of unmonitored locations might be predictable enough for some applications.  

Here we compare runoff between **many pairs of locations** and ask whether the **Kullback-Leibler divergence** (KLD) between the distributions of unit area runoff at two locations can be predicted from the attributes of both catchments (and their differences). We will again use the gradient boosted decision tree method.

In [1]:
import os
import pandas as pd
import numpy as np
from time import time

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
# config_context() is a context manager and only takes effect inside a
# `with` block; set_config() applies the verbosity globally instead.
xgb.set_config(verbosity=2)

from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

import data_processing_functions as dpf

from scipy.stats import linregress
# configure bokeh to display figures in the notebook output cells
output_notebook()

# the working directory is the root used to build the pairwise data file paths
BASE_DIR = os.getcwd()

### Load attribute data 

In [2]:
# read the catchment attribute table and collect the station identifiers
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
attributes_fpath = os.path.join('data', fname)
df = pd.read_csv(attributes_fpath)

# one identifier per row of the attribute table
station_ids = df['official_id'].values
print(f'There are {len(station_ids)} monitored basins in the attribute set.')

There are 1616 monitored basins in the attribute set.


### Load pairwise attribute comparisons

Load a few rows from one of the pairwise data files.  These files contain divergence measures computed on concurrent and non-concurrent time series at pairs of monitored locations.

In [3]:
# open an example pairwise results file
pairs_results_folder = os.path.join(
    BASE_DIR, "data", "divergence_test_results",
)
# os.listdir returns entries in arbitrary order and can include non-CSV
# entries, so filter and sort to make the example file deterministic
pairs_files = sorted(
    f for f in os.listdir(pairs_results_folder) if f.endswith('.csv')
)
# only the first 1000 rows are read; this frame is used to inspect the columns
test_df = pd.read_csv(os.path.join(pairs_results_folder, pairs_files[0]), nrows=1000)

In [10]:
# keep only the column names that contain the 'dkl' tag
kld_columns = list(filter(lambda col_name: 'dkl' in col_name, test_df.columns))
kld_columns

['dkl_concurrent_uniform',
 'dkl_concurrent_post_-5R',
 'dkl_concurrent_post_-4R',
 'dkl_concurrent_post_-3R',
 'dkl_concurrent_post_-2R',
 'dkl_concurrent_post_-1R',
 'dkl_concurrent_post_0R',
 'dkl_concurrent_post_1R',
 'dkl_concurrent_post_2R',
 'dkl_concurrent_post_3R',
 'dkl_concurrent_post_4R',
 'dkl_concurrent_post_5R',
 'dkl_concurrent_post_6R',
 'dkl_concurrent_post_7R',
 'dkl_concurrent_post_8R',
 'dkl_concurrent_post_9R',
 'dkl_concurrent_post_10R',
 'dkl_nonconcurrent_uniform',
 'dkl_nonconcurrent_post_-5R',
 'dkl_nonconcurrent_post_-4R',
 'dkl_nonconcurrent_post_-3R',
 'dkl_nonconcurrent_post_-2R',
 'dkl_nonconcurrent_post_-1R',
 'dkl_nonconcurrent_post_0R',
 'dkl_nonconcurrent_post_1R',
 'dkl_nonconcurrent_post_2R',
 'dkl_nonconcurrent_post_3R',
 'dkl_nonconcurrent_post_4R',
 'dkl_nonconcurrent_post_5R',
 'dkl_nonconcurrent_post_6R',
 'dkl_nonconcurrent_post_7R',
 'dkl_nonconcurrent_post_8R',
 'dkl_nonconcurrent_post_9R',
 'dkl_nonconcurrent_post_10R']

### Define attribute groupings

In [7]:
# attribute names grouped by category; the groups are concatenated below
terrain = [
    'drainage_area_km2',
    'elevation_m',
    'slope_deg',
    'gravelius',
    'perimeter',
    'aspect_deg',
]
land_cover = [
    'land_use_forest_frac_2010',
    'land_use_grass_frac_2010',
    'land_use_wetland_frac_2010',
    'land_use_water_frac_2010',
    'land_use_urban_frac_2010',
    'land_use_shrubs_frac_2010',
    'land_use_crops_frac_2010',
    'land_use_snow_ice_frac_2010',
]
soil = [
    'logk_ice_x100',
    'porosity_x100',
]
climate = [
    'prcp',
    'srad',
    'swe',
    'tmax',
    'tmin',
    'vp',
    'high_prcp_freq',
    'high_prcp_duration',
    'low_prcp_freq',
    'low_prcp_duration',
]

# full attribute list in the order terrain, land cover, soil, climate
all_attributes = [*terrain, *land_cover, *soil, *climate]
len(all_attributes)

26

### Set trial parameters

In [8]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
# cross-validation folds and boosting / optimization round counts
# handed to the data_processing_functions helpers
nfolds = 5
n_boost_rounds = 2500
n_optimization_rounds = 10

# define if testing concurrent or nonconcurrent data
concurrent = 'concurrent'

# the input data file has an associated revision date
revision_date = '20240717'

all_test_results = {}
# labels for the successively added attribute groups
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']

results_folder = os.path.join('./data/', 'kld_prediction_results')
# exist_ok avoids the check-then-create race and is a no-op if the folder exists
os.makedirs(results_folder, exist_ok=True)

In [9]:
def predict_KLD_from_attributes(target_col, holdout_pct, stations, nfolds, results_folder, partial_counts=False):
    """Run XGBoost trials predicting a KL divergence column from catchment attributes.

    For each bitrate in (4, 6, 8) the matching pairwise results file is loaded,
    rows with a missing ``target_col`` are dropped, and ``dpf.run_xgb_trials``
    is called once per cumulative attribute group (climate, then adding land
    cover, terrain and soil).

    Relies on notebook-level names: ``BASE_DIR``, ``revision_date``,
    ``n_optimization_rounds``, ``n_boost_rounds``, ``attribute_set_names`` and
    the attribute lists ``climate``, ``land_cover``, ``terrain``, ``soil``.

    Args:
        target_col: name of the column in the results file to predict.
        holdout_pct: forwarded to ``dpf.train_test_split_by_official_id``
            (set in the notebook as the amount of data held out for testing).
        stations: station identifiers forwarded to the split helper.
        nfolds: fold count forwarded to the split helper and to
            ``dpf.run_xgb_trials``.
        results_folder: forwarded to ``dpf.run_xgb_trials`` (presumably where
            trial outputs are written -- confirm in data_processing_functions).
        partial_counts: when True, read the ``_partial_counts.csv`` variant of
            the results file.

    Returns:
        dict: ``{bitrate: {set_name: {...}}}`` where each inner dict holds
        ``'trials'``, ``'test_df'``, ``'test_mae'`` and ``'test_rmse'``.
    """
    training_stn_cv_sets, test_stn_sets = dpf.train_test_split_by_official_id(holdout_pct, stations, nfolds)

    # The group order must match attribute_set_names
    # ('climate', '+land_cover', '+terrain', '+soil'); otherwise results are
    # stored under the wrong label.
    attribute_groups = [climate, land_cover, terrain, soil]

    all_test_results = {}
    for bitrate in [4, 6, 8]:
        t0 = time()
        print(f'bitrate = {bitrate}')
        fname = f"DKL_results_{bitrate}bits_{revision_date}.csv"
        if partial_counts:
            fname = f"DKL_results_{bitrate}bits_{revision_date}_partial_counts.csv"

        fpath = os.path.join(
            BASE_DIR, "data", "divergence_test_results", fname
        )
        df = pd.read_csv(fpath, nrows=None, low_memory=False)
        # rows without a target value cannot be used
        df.dropna(subset=[target_col], inplace=True)
        t1 = time()
        print(f'    {t1-t0:.2f}s to load input data')

        all_test_results[bitrate] = {}
        input_attributes = []

        # add attribute groups successively
        for attribute_set, set_name in zip(attribute_groups, attribute_set_names):
            print(f'  Processing {set_name} attribute set.')
            # build a new list each round so earlier feature lists are never mutated
            input_attributes = input_attributes + attribute_set

            features = dpf.format_features(input_attributes)

            trial_df, test_df = dpf.run_xgb_trials(
                bitrate, set_name, features, target_col, df,
                training_stn_cv_sets, test_stn_sets, n_optimization_rounds,
                nfolds, n_boost_rounds, results_folder
            )

            test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
            test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

            print(f'   held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
            print('')
            # store the test set predictions and actuals
            all_test_results[bitrate][set_name] = {
                'trials': trial_df, 'test_df': test_df,
                'test_mae': test_mae, 'test_rmse': test_rmse}
    return all_test_results

## Run Models

In [None]:
# prior values selecting which 'dkl_{concurrent}_post_{prior}R' column to model
priors_to_test = [-2, -1, 0, 1, 2]

for prior in priors_to_test:
    target_col = f'dkl_{concurrent}_post_{prior}R'

    test_results_fname = f'{target_col}_{prior}_prior_results.npy'
    test_results_fpath = os.path.join('data', test_results_fname)
    if os.path.exists(test_results_fpath):
        # reuse cached results instead of re-running the trials
        # NOTE: allow_pickle=True unpickles the stored dict; only load trusted files
        all_test_results = np.load(test_results_fpath, allow_pickle=True).item()
    else:
        all_test_results = predict_KLD_from_attributes(target_col, holdout_pct, station_ids, nfolds, results_folder)
        # only write the cache when results were newly computed
        np.save(test_results_fpath, all_test_results)

## Citations

```{bibliography}
:filter: docname in docnames
```