# Predictability of Total Variation Distance (TVD)

## Introduction

Here we make a comparison of runoff between two locations and ask if the **Total Variation Distance** (TVD) of the distribution of unit area runoff between two locations can be predicted from the attributes of both catchments (and their differences). We will again use the gradient boosted decision tree method.

In [1]:
import os
import pandas as pd
import numpy as np
from time import time

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
xgb.config_context(verbosity=2)

from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

import data_processing_functions as dpf

from scipy.stats import linregress
output_notebook()

BASE_DIR = os.getcwd()

## Load input data

In [2]:
# load the catchment characteristics
fname = 'BCUB_watershed_attributes_updated.csv'
attr_df = pd.read_csv(os.path.join('data', fname))
attr_df.columns = [c.lower() for c in attr_df.columns]
station_ids = attr_df['official_id'].values
print(f'There are {len(station_ids)} monitored basins in the attribute set.')

There are 1609 monitored basins in the attribute set.


### Load pairwise attribute comparisons

Load a few rows from one of the pairwise data files.  These contain attributes about divergence measures that are computed on concurrent and non-concurrent time series at two monitored locations.

In [3]:
# open an example pairwise results file
input_folder = os.path.join(
    BASE_DIR, "data", "processed_divergence_inputs",
)
pairs_files = os.listdir(input_folder)
test_df = pd.read_csv(os.path.join(input_folder, pairs_files[0]), nrows=1000)

In [4]:
tvd_columns = [c for c in test_df.columns if 'tvd' in c]
tvd_columns

['tvd_concurrent',
 'tvd_concurrent_max',
 'tvd_nonconcurrent',
 'tvd_nonconcurrent_max']

### Define attribute groupings

In [11]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'aspect_deg'] # 'gravelius', 'perimeter',
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
soil = ['logk_ice_x100', 'porosity_x100']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
all_attributes = terrain + land_cover + soil + climate
len(all_attributes)

24

## Set Trial Parameters

In [12]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 2000
n_optimization_rounds = 20

#define if testing concurrent or nonconcurrent data
concurrent = 'concurrent'

# the input data file has an associated revision date
revision_date = '20240812'

all_test_results = {}
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']

results_folder = os.path.join(BASE_DIR, 'data', 'tvd_prediction_results')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [13]:
def predict_TVD_from_attributes(attr_df, target_col, holdout_pct, stations, nfolds, results_folder, partial_counts=False):

    training_stn_cv_sets, test_stn_sets = dpf.train_test_split_by_official_id(holdout_pct, stations, nfolds)
    all_test_results = {}
    for bitrate in [4, 6, 8, 9, 10, 11, 12]:
        t0 = time()
        print(f'bitrate = {bitrate}')
        fname = f"KL_results_{bitrate}bits_{revision_date}.csv"
        if partial_counts:
            fname = f"KL_results_{bitrate}bits_{revision_date}_partial_counts.csv"

        fpath = os.path.join(input_folder, fname)
        df = pd.read_csv(fpath, nrows=None, low_memory=False)
        df.dropna(subset=[target_col], inplace=True)

        # add the attributes into the input dataset
        df = dpf.add_attributes(attr_df, df, all_attributes)
        
        t1 = time()
        print(f'    {t1-t0:.2f}s to load input data')

        input_attributes = []
        all_test_results[bitrate] = {}
        # add attribute groups successively
        for attribute_set, set_name in zip([land_cover, terrain, soil, climate], attribute_set_names):
            print(f'  Processing {set_name} attribute set.')
            input_attributes += attribute_set 
                        
            features = dpf.format_features(input_attributes)

            trial_df, test_df = dpf.run_xgb_trials_custom_CV(
                bitrate, set_name, features, target_col, df, 
                training_stn_cv_sets, test_stn_sets, n_optimization_rounds, 
                nfolds, n_boost_rounds, results_folder
            )
            
            test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
            test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

            print(f'   held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
            print('')
            # store the test set predictions and actuals
            all_test_results[bitrate][set_name] = {
                'trials': trial_df, 'test_df': test_df,
                'test_mae': test_mae, 'test_rmse': test_rmse} 
    return all_test_results

## Run XGBoost Models

In [None]:
target_col = f'tvd_{concurrent}'

test_results_fname = f'{target_col}_results.npy'
test_results_fpath = os.path.join(results_folder, test_results_fname)
if os.path.exists(test_results_fpath):
    all_test_results = np.load(test_results_fpath, allow_pickle=True).item()
else:
    all_test_results = predict_TVD_from_attributes(attr_df, target_col, holdout_pct, station_ids, nfolds, results_folder)
    np.save(test_results_fpath, all_test_results)

bitrate = 4
    15.09s to load input data
  Processing climate attribute set.
    0.26 ± 0.005 RMSE mean on the test set (N=20)
   held-out test rmse: 0.26, mae: 0.21

  Processing +land_cover attribute set.
    0.18 ± 0.004 RMSE mean on the test set (N=20)
   held-out test rmse: 0.17, mae: 0.13

  Processing +terrain attribute set.
    0.18 ± 0.004 RMSE mean on the test set (N=20)
   held-out test rmse: 0.17, mae: 0.14

  Processing +soil attribute set.


## View Results

In [None]:
plots = []
for b, set_dict in all_test_results.items():
    test_rmse, test_mae = [], []
    attribute_sets = list(set_dict.keys())

    y1 = [set_dict[e]['test_rmse'] for e in attribute_sets]
    y2 = [set_dict[e]['test_mae'] for e in attribute_sets]
    
    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})
    
    title = f'{b} bits'
    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'RMSE'
    
    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = set_dict[best_rmse_set]['test_df']
    
    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)
    
    sfig = figure(title=f'Test: {b} bits best model {best_rmse_set} (N={len(best_result)})')
    sfig.scatter(xx, yy, size=1, alpha=0.6)
    xpred = np.linspace(min(xx), max(xx), 100)
    ybf = [slope * e + intercept for e in xpred]
    sfig.line(xpred, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')   
    # plot a 1:1 line
    sfig.line([min(yy), max(yy)], [min(yy), max(yy)], color='black', line_dash='dotted', 
              line_width=2, legend_label='1:1')
    sfig.xaxis.axis_label = 'Actual TVD [bits/sample]'
    sfig.yaxis.axis_label = 'Predicted TVD [bits/sample]'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)

In [None]:
layout = gridplot(plots, ncols=2, width=350, height=300)
show(layout)

## Discussion

In the case of TVD, the target variable occupies the same range regardless of the dictionary size used in quantizing the streamflow data to define probabilities.  Two patterns are evident: 1) the $R^2$ increases as the dictionary size grows, and 2) there are clusters at both 0 and 1 that the model is unable to discern well.  

## Citations

```{bibliography}
:filter: docname in docnames
```