# Predictability of Wasserstein Distance (WD)

Here we make yet another comparison of runoff between two locations and ask if the **Wasserstein Distance** (WD) of the distributions of unit area runoff between two locations can be predicted from the attributes of both catchments (and their differences). We will again use the gradient boosted decision tree method.

In [29]:
import os
import pandas as pd
import numpy as np
from time import time

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7

import xgboost as xgb
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
)

import data_processing_functions as dpf

from scipy.stats import linregress
output_notebook()

BASE_DIR = os.getcwd()

In [30]:
# load the catchment characteristics
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
df = pd.read_csv(os.path.join('data', fname))
station_ids = df['official_id'].values
print(f'There are {len(station_ids)} monitored basins in the attribute set.')

There are 1616 monitored basins in the attribute set.


In [31]:
# open an example pairwise results file
pairs_results_folder = os.path.join(
    BASE_DIR, "data", "divergence_test_results",
)
pairs_files = os.listdir(pairs_results_folder)
test_df = pd.read_csv(os.path.join(pairs_results_folder, pairs_files[0]), nrows=1000)
# for f in pairs_files:
#     test_df = pd.read_csv(os.path.join(pairs_results_folder, f))
#     unnamed_cols = [c for c in test_df.columns if c.startswith('Unnamed')]
#     test_df.drop(unnamed_cols, inplace=True, axis=1)
#     test_df.to_csv(os.path.join(pairs_results_folder, f), index=False)
    # print(test_df.columns.tolist())

In [32]:
wd_columns = [c for c in test_df.columns if 'wasserstein' in c]
wd_columns

['wasserstein_concurrent',
 'wasserstein_concurrent_max',
 'wasserstein_nonconcurrent',
 'wasserstein_nonconcurrent_max']

In [33]:
terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
soil = ['logk_ice_x100', 'porosity_x100']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
all_attributes = terrain + land_cover + soil + climate

In [34]:
len(all_attributes)

26

In [35]:
# define the amount of data to set aside for final testing
holdout_pct = 0.10
nfolds = 5
n_boost_rounds = 100
n_optimization_rounds = 10

#define if testing concurrent or nonconcurrent data
concurrent = 'concurrent'

# the input data file has an associated revision date
revision_date = '20240717'

all_test_results = {}
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']

results_folder = os.path.join('./data/', 'emd_prediction_results')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [36]:
def run_trials(bitrate, set_name, attributes, target, input_data, train_stn_cv_sets, test_stations, n_optimization_rounds, nfolds, num_boost_rounds):
    # randomly select 5% of the stations to leave out for a hold-out test set
    # to ensure none of the data are seen in training
    
    # X_train, Y_train = input_data.loc[train_indices, features].values, input_data.loc[train_indices, target].values
    # X_test, Y_test = input_data.loc[test_indices, features].values, input_data.loc[test_indices, target].values
    
    # reset the index to ensure the random selection is done properly
    sample_choices =  np.arange(0.5, 0.9, 0.02) 
    lr_choices = np.arange(0.001, 0.1, 0.0005) 
    learning_rates = np.random.choice(lr_choices, n_optimization_rounds) 
    subsamples = np.random.choice(sample_choices, n_optimization_rounds) 
    colsamples = np.random.choice(sample_choices, n_optimization_rounds) 
    
    test_data = dpf.filter_input_data_by_official_id(input_data, test_stations)
    
    all_results = []
    for trial in range(n_optimization_rounds):
        
        lr, ss, cs = learning_rates[trial], subsamples[trial], colsamples[trial]

        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "eta": lr,
            "n_estimators": num_boost_rounds,
            # "max_depth": 6,  # use default max_depth
            # "min_child_weight": 1, # use colsample and subsample instead of min_child_weight
            "subsample": ss,
            "colsample_bytree": cs,
            "seed": 42,
            "device": "cuda",  # note, change this to 'cpu' if your system doesn't have a CUDA GPU
            "sampling_method": 'gradient_based',
            "tree_method": 'hist',
        }

        results_fname = f'{set_name}_{bitrate}_bits_{lr:.3f}_lr_{ss:.3f}_sub_{cs:.3f}_col.csv'
        results_fpath = os.path.join(results_folder, results_fname)
        
        # we need to manually do CV because we're separating by stations 
        # to prevent data leakage across training rounds
        cv_mses, cv_rmses, best_mae_rounds, best_rmse_rounds = [], [], None, None
        
        best_rmse = 1e9
        
        all_training_stations = np.array([np.array(e) for e in train_stn_cv_sets]).flatten()
        
        n_cv = 0
        cv_df = pd.DataFrame()
        cv_rmses, cv_maes = [], []
        for cv_test_set in train_stn_cv_sets:
            
            train_stns = [e for e in all_training_stations if e not in cv_test_set]
            
            assert len(np.intersect1d(train_stns, cv_test_set)) == 0
            cv_train_data = dpf.filter_input_data_by_official_id(input_data, train_stns)
            cv_test_data = dpf.filter_input_data_by_official_id(input_data, cv_test_set)
            
            X_train = cv_train_data[attributes].values
            Y_train = cv_train_data[target].values
            X_test = cv_test_data[attributes].values
            Y_test = cv_test_data[target].values
            
            model = xgb.XGBRegressor(
                **params
            )
            
            # Xy = xgb.DMatrix(X_train, label=Y_train)
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            
            rmse = root_mean_squared_error(predictions, Y_test)
            mae = mean_absolute_error(predictions, Y_test)
            cv_rmses.append(rmse)
            cv_maes.append(mae)
        
        cv_mean_rmse, cv_std_rmse = np.mean(cv_rmses), np.std(cv_rmses)
        cv_mean_mae, cv_std_mae = np.mean(cv_maes), np.std(cv_maes)

        results_dict = {
            'test_mae': cv_mean_mae,
            'test_rmse': cv_mean_rmse,
            'mae_stdev': cv_std_mae,
            'rmse_stdev': cv_std_rmse,
        }
        results_cols = list(results_dict.keys())
        results_dict.update(params)

        all_results.append(results_dict)
        if (trial > 0) & (trial % 20 == 0):
            print(f'   completed {trial}/{n_optimization_rounds}')

    # save the trial results
    trial_results = pd.DataFrame(all_results)
    trial_results.to_csv(results_fpath)
    trial_mean = trial_results['test_rmse'].mean()
    trial_stdev = trial_results['rmse_stdev'].mean()
    
    print(f'    {trial_mean:.2f} ± {trial_stdev:.3f} RMSE mean on the test set (N={len(trial_results)})')
    
    param_cols = list(params.keys())

    # get the optimal hyperparameters
    optimal_rmse_idx = trial_results['test_rmse'].idxmin()
    optimal_mae_idx = trial_results['test_mae'].idxmin()
    
    best_rmse_params = trial_results.loc[optimal_rmse_idx, param_cols]
    best_mae_params = trial_results.loc[optimal_mae_idx, param_cols]
    
    final_model = xgb.XGBRegressor(**best_rmse_params.to_dict())
    final_model.fit(X_train, Y_train)

    predicted_y = final_model.predict(X_test)
    
    test_results = pd.DataFrame(
        {
            'predicted': predicted_y,
            'actual': Y_test,
        })
    
    return trial_results, test_results

In [37]:
def predict_EMD_from_attributes(target_col, holdout_pct, stations, nfolds, partial_counts=False):

    training_stn_cv_sets, test_stn_sets = dpf.train_test_split_by_official_id(holdout_pct, stations, nfolds)
    all_test_results = {}
    for bitrate in [4, 6, 8]:
        t0 = time()
        print(f'bitrate = {bitrate}')
        fname = f"DKL_results_{bitrate}bits_{revision_date}.csv"
        if partial_counts:
            fname = f"DKL_results_{bitrate}bits_{revision_date}_partial_counts.csv"

        fpath = os.path.join(
            BASE_DIR, "data", "divergence_test_results", fname
        )
        df = pd.read_csv(fpath, nrows=None, low_memory=False)
        df.dropna(subset=[target_col], inplace=True)
        t1 = time()
        print(f'    {t1-t0:.2f}s to load input data')
        
        all_test_results[bitrate] = {}
        input_attributes = []

        # add attribute groups successively
        for attribute_set, set_name in zip([land_cover, terrain, soil, climate], attribute_set_names):

            input_attributes += attribute_set 
            
            features = dpf.format_features(input_attributes)
            
            # input_data = df[test_attributes + [target_column]].copy()

            trial_df, test_df = run_trials(bitrate, set_name, features, target_col, df, training_stn_cv_sets, test_stn_sets, n_optimization_rounds, nfolds, n_boost_rounds)
            
            test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
            test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

            print(f'  {set_name}')
            print(f'   held-out test rmse: {test_rmse:.2f}, mae: {test_mae:.2f}')
            print('')
            # store the test set predictions and actuals
            all_test_results[bitrate][set_name] = {
                'trials': trial_df, 'test_df': test_df,
                'test_mae': test_mae, 'test_rmse': test_rmse} 
    return all_test_results

In [38]:
target_col = f'wasserstein_{concurrent}'
all_test_results = predict_EMD_from_attributes(target_col, holdout_pct, station_ids, nfolds)

bitrate = 4
    19.88s to load input data


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




    93.10 ± 31.976 RMSE mean on the test set (N=10)
  climate
   held-out test rmse: 62.66, mae: 20.71

    97.46 ± 29.984 RMSE mean on the test set (N=10)
  +land_cover
   held-out test rmse: 59.66, mae: 16.86

    95.66 ± 29.964 RMSE mean on the test set (N=10)
  +terrain
   held-out test rmse: 60.66, mae: 15.64

    85.10 ± 35.335 RMSE mean on the test set (N=10)
  +soil
   held-out test rmse: 48.42, mae: 11.63

bitrate = 6
    23.31s to load input data
    92.96 ± 31.829 RMSE mean on the test set (N=10)
  climate
   held-out test rmse: 62.97, mae: 20.26

    97.31 ± 30.297 RMSE mean on the test set (N=10)
  +land_cover
   held-out test rmse: 62.29, mae: 22.41

    96.93 ± 29.868 RMSE mean on the test set (N=10)
  +terrain
   held-out test rmse: 60.15, mae: 16.97

    85.93 ± 35.156 RMSE mean on the test set (N=10)
  +soil
   held-out test rmse: 49.25, mae: 11.69

bitrate = 8
    23.27s to load input data
    93.39 ± 31.588 RMSE mean on the test set (N=10)
  climate
   held-out test

In [41]:
plots = []
for b, set_dict in all_test_results.items():
    test_rmse, test_mae = [], []
    attribute_sets = list(set_dict.keys())

    y1 = [set_dict[e]['test_rmse'] for e in attribute_sets]
    y2 = [set_dict[e]['test_mae'] for e in attribute_sets]
    
    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})
    
    title = f'{b} bits'
    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = 'RMSE'
    
    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_idx = result_df['rmse'].idxmin()
    best_mae_idx = result_df['mae'].idxmin()
    best_rmse_set = result_df.loc[best_rmse_idx, 'set']
    best_mae_set = result_df.loc[best_mae_idx, 'set']
    best_result = set_dict[best_rmse_set]['test_df']
    
    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)
    
    sfig = figure(title=f'Test: {b} bits best model {best_rmse_set} (N={len(best_result)})')
    sfig.scatter(xx, yy, size=1, alpha=0.6)
    xpred = np.linspace(min(xx), max(xx), 100)
    ybf = [slope * e + intercept for e in xpred]
    sfig.line(xpred, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')    
    sfig.xaxis.axis_label = 'Actual EMD [bits/sample]'
    sfig.yaxis.axis_label = 'Predicted EMD [bits/sample]'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)

In [42]:
layout = gridplot(plots, ncols=2, width=350, height=300)
show(layout)

## Citations

```{bibliography}
:filter: docname in docnames
```