# Predict Earth Mover's (Wasserstein) Distance (EMD)

## Introduction

Here we make yet another comparison of runoff between two locations and ask whether the **Earth Mover's Distance** (EMD), also known as the Wasserstein distance, between the unit area runoff (UAR) distributions can be predicted from the attributes of the two catchments (and their differences). We will again use the gradient boosted decision tree method.

In [1]:
import os
import pandas as pd
import numpy as np
from time import time

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.palettes import Sunset10, Vibrant7, Category20

import xgboost as xgb
# config_context() only builds a context manager and has no effect unless used
# in a `with` statement; set_config() applies the verbosity globally
xgb.set_config(verbosity=2)

from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    roc_auc_score,
    accuracy_score,
    confusion_matrix,
)

import data_processing_functions as dpf

from scipy.stats import linregress
# send bokeh output to the notebook
output_notebook()

# base directory used to build the data paths below (the current working directory)
BASE_DIR = os.getcwd()

## Load data

In [2]:
# read the catchment attribute table and lower-case its column names
fname = 'BCUB_watershed_attributes_updated.csv'
attr_df = pd.read_csv(os.path.join('data', fname)).rename(columns=str.lower)

# one identifier per row of the attribute table
station_ids = attr_df['official_id'].values
print(f'There are {len(station_ids)} monitored basins in the attribute set.')

There are 1609 monitored basins in the attribute set.


### Load pairwise attribute comparisons

Load a few rows from one of the pairwise data files. These files contain attributes describing divergence measures computed on concurrent and non-concurrent time series at two monitored locations.

In [3]:
# open an example pairwise results file
input_folder = os.path.join(BASE_DIR, "data", "processed_divergence_inputs")
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries, so filter and sort to make the example file deterministic
pairs_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
# a small sample of rows is enough to inspect the available columns
test_df = pd.read_csv(os.path.join(input_folder, pairs_files[0]), nrows=1000)


In [4]:
# list the columns of the example file whose name contains 'wasserstein'
wd_columns = [c for c in test_df.columns if 'wasserstein' in c]
wd_columns

['wasserstein_concurrent',
 'wasserstein_concurrent_max',
 'wasserstein_nonconcurrent',
 'wasserstein_nonconcurrent_max']

### Define attribute groupings

In [5]:
# terrain descriptors ('gravelius' and 'perimeter' are currently left out)
terrain = [
    'drainage_area_km2',
    'elevation_m',
    'slope_deg',
    'aspect_deg',
]

# land cover fractions share a common naming pattern
land_cover = [
    f'land_use_{cover}_frac_2010'
    for cover in ('forest', 'grass', 'wetland', 'water', 'urban', 'shrubs', 'crops', 'snow_ice')
]

soil = ['logk_ice_x100', 'porosity_x100']

climate = [
    'prcp',
    'srad',
    'swe',
    'tmax',
    'tmin',
    'vp',
    'high_prcp_freq',
    'high_prcp_duration',
    'low_prcp_freq',
    'low_prcp_duration',
]

# full attribute list, in group order
all_attributes = [*terrain, *land_cover, *soil, *climate]
len(all_attributes)

24

## Set trial parameters

In [6]:
# --- data partitioning ---
# fraction of the data set aside for the final test set
holdout_pct = 0.10
# number of folds used for cross validation
nfolds = 5

# --- xgboost settings ---
n_boost_rounds = 2500
# number of rounds used for xgboost hyperparameter optimization
n_optimization_rounds = 20

# --- input data selection ---
# selects whether concurrent or nonconcurrent data are tested
concurrent = 'concurrent'

# "partial counts" is the test where observations were assigned a uniform
# distribution to approximate error, allowing fractional observations
# in state space
partial_counts = False

# revision date associated with the input data file
revision_date = '20240812'

# --- result containers and labels ---
all_test_results = {}
attribute_set_names = ['climate', '+land_cover', '+terrain', '+soil']

## Run Models

In [7]:
def predict_EMD_from_attributes(attr_df, target_col_base, holdout_pct, stations, nfolds, results_folder, loss_function, partial_counts=False, binary_test=False):
    """Fit and evaluate XGBoost models that predict EMD from catchment attributes.

    For each bitrate the pairwise results file is loaded, the catchment
    attributes are added, and one model is trained per cumulative attribute
    group (climate, +land_cover, +terrain, +soil).  The held-out test
    predictions and summary metrics of each model are collected.

    Relies on notebook-level names: ``input_folder``, ``revision_date``,
    ``all_attributes``, ``climate``, ``land_cover``, ``terrain``, ``soil``,
    ``attribute_set_names``, ``n_optimization_rounds``, ``n_boost_rounds``.

    Parameters
    ----------
    attr_df : DataFrame
        Catchment attribute table, passed to ``dpf.add_attributes``.
    target_col_base : str
        Name of the EMD column to predict.  Rows where it is missing are
        dropped.  In binary mode it is compared against the column
        ``f'{target_col_base}_max'``.
    holdout_pct, stations, nfolds
        Forwarded to ``dpf.train_test_split_by_official_id``.
    results_folder : str
        Forwarded to the ``dpf`` trial runners (presumably where trial
        outputs are written -- confirm against ``data_processing_functions``).
    loss_function : str
        XGBoost objective, forwarded as ``loss``.
    partial_counts : bool
        If True, read the ``*_partial_counts.csv`` variant of the input file.
    binary_test : bool
        If True, classify ``target < target_max`` instead of regressing on
        the target value.

    Returns
    -------
    dict
        ``results[bitrate][set_name]`` holding ``'trials'``, ``'test_df'`` and
        either ``'test_accuracy'`` (binary) or ``'test_mae'``/``'test_rmse'``.
    """
    # split out the test set at the outset so it's constant across bitrate tests
    training_stn_cv_sets, test_stn_sets = dpf.train_test_split_by_official_id(holdout_pct, stations, nfolds)

    # the group order must match attribute_set_names so that each result is
    # stored under the label of the attribute groups actually used
    attribute_sets = [climate, land_cover, terrain, soil]
    fname_suffix = '_partial_counts' if partial_counts else ''

    all_test_results = {}
    for bitrate in [4, 6, 8, 9, 10, 11, 12]:
        t0 = time()
        print(f'bitrate = {bitrate}')
        fname = f"KL_results_{bitrate}bits_{revision_date}{fname_suffix}.csv"
        fpath = os.path.join(input_folder, fname)
        df = pd.read_csv(fpath, nrows=None, low_memory=False)
        df.dropna(subset=[target_col_base], inplace=True)

        # add the attributes into the input dataset
        df = dpf.add_attributes(attr_df, df, all_attributes)

        if binary_test:
            # True when the EMD between the pair is smaller than the matching
            # "_max" reference value of the same target column
            df['binary_target'] = df[target_col_base] < df[f'{target_col_base}_max']
            # the mean of a boolean column is the fraction of True values; this
            # stays correct when only one class is present
            pct_false = 1.0 - df['binary_target'].mean()
            # change target_col to the new binary target
            target_col = 'binary_target'
            print(f'The binary target variable balance is {100*pct_false:.0f}% False and {100*(1-pct_false):.0f}% True')
        else:
            target_col = target_col_base

        t1 = time()
        print(f'    {t1-t0:.2f}s to load input data')

        all_test_results[bitrate] = {}
        input_attributes = []

        # add attribute groups successively
        for attribute_set, set_name in zip(attribute_sets, attribute_set_names):
            print(f'  Processing {set_name} attribute set.')
            input_attributes += attribute_set

            features = dpf.format_features(input_attributes)

            if binary_test:
                trial_df, test_df = dpf.run_binary_xgb_trials_custom_CV(
                    bitrate, set_name, features, target_col, df,
                    training_stn_cv_sets, test_stn_sets, n_optimization_rounds,
                    nfolds, n_boost_rounds, results_folder, loss=loss_function, eval_metric='error'
                )

                obs, pred = test_df['actual'].values, test_df['predicted'].values
                if len(np.unique(obs)) == 1 and (obs == pred).all():
                    print('    All observations have the same class')
                # fraction of correct predictions, i.e. (tp + tn) / total; also
                # defined when only a single class is present
                test_accuracy = accuracy_score(obs, pred)

                print(f'   held-out test accuracy: {test_accuracy:.2f}')
                # store the test set predictions and actuals
                all_test_results[bitrate][set_name] = {
                    'trials': trial_df, 'test_df': test_df,
                    'test_accuracy': test_accuracy}
            else:
                trial_df, test_df = dpf.run_xgb_trials_custom_CV(
                    bitrate, set_name, features, target_col, df,
                    training_stn_cv_sets, test_stn_sets, n_optimization_rounds,
                    nfolds, n_boost_rounds, results_folder, loss=loss_function,
                )

                test_rmse = root_mean_squared_error(test_df['actual'], test_df['predicted'])
                test_mae = mean_absolute_error(test_df['actual'], test_df['predicted'])

                print(f'   Held-out test RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}')
                print('')
                # store the test set predictions and actuals
                all_test_results[bitrate][set_name] = {
                    'trials': trial_df, 'test_df': test_df,
                    'test_mae': test_mae, 'test_rmse': test_rmse}
    return all_test_results

### Binary Model

In [8]:
# folder holding the outputs of the binary classification trials
binary_results_folder = os.path.join(BASE_DIR, 'data', 'emd_prediction_results_binary')
if not os.path.exists(binary_results_folder):
    os.makedirs(binary_results_folder)

target_col = f'wasserstein_{concurrent}'
loss_function = 'binary:hinge'
test_results_fname = f'{target_col}_results.npy'
test_results_fpath = os.path.join(binary_results_folder, test_results_fname)

# only run the trials when no saved results file exists; otherwise reuse it
if not os.path.exists(test_results_fpath):
    all_test_results = predict_EMD_from_attributes(
        attr_df,
        target_col,
        holdout_pct,
        station_ids,
        nfolds,
        binary_results_folder,
        loss_function,
        binary_test=True,
    )
    np.save(test_results_fpath, all_test_results)
else:
    # the results were saved with np.save, so unpickle and unwrap with .item()
    all_test_results = np.load(test_results_fpath, allow_pickle=True).item()

### Regression Model

In [14]:
# folder holding the outputs of the regression trials
results_folder = os.path.join(BASE_DIR, 'data', 'emd_prediction_results_L1')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

target_col = f'wasserstein_{concurrent}'
loss_function = 'reg:absoluteerror'
test_results_fname = f'{target_col}_results_mae.npy'
test_results_fpath = os.path.join(results_folder, test_results_fname)

# only run the trials when no saved results file exists; otherwise reuse it
if not os.path.exists(test_results_fpath):
    all_test_results = predict_EMD_from_attributes(
        attr_df,
        target_col,
        holdout_pct,
        station_ids,
        nfolds,
        results_folder,
        loss_function,
        binary_test=False,
    )
    np.save(test_results_fpath, all_test_results)
else:
    # the results were saved with np.save, so unpickle and unwrap with .item()
    all_test_results = np.load(test_results_fpath, allow_pickle=True).item()

## View Results

### Binary Model

Look at the effect of varying the bitrate on the classification balance in the binary problem.

In [10]:
# fraction of True binary targets at each bitrate
balances, bitrates = [], []
target_col = f'wasserstein_{concurrent}'
fname_suffix = '_partial_counts' if partial_counts else ''

# same bitrates as used by predict_EMD_from_attributes
for bitrate in [4, 6, 8, 9, 10, 11, 12]:
    fname = f"KL_results_{bitrate}bits_{revision_date}{fname_suffix}.csv"
    input_data_fpath = os.path.join(input_folder, fname)

    df = pd.read_csv(input_data_fpath, low_memory=False)

    # True when the EMD between the pair is smaller than the matching
    # "_max" reference value
    df['binary_target'] = df[target_col] < df[f'{target_col}_max']
    # the mean of a boolean column is the fraction of True values; this stays
    # correct when only one class is present
    balances.append(df['binary_target'].mean())
    bitrates.append(bitrate)

In [11]:
# convert to the quantities named on the axis labels: dictionary size is
# 2^bits and the balance is shown in percent
dictionary_sizes = [2 ** b for b in bitrates]
pct_true = [100 * p for p in balances]

bal_fig = figure(width=600, height=400, x_axis_type='log')
bal_fig.line(dictionary_sizes, pct_true, color='dodgerblue',
             line_width=2)
bal_fig.xaxis.axis_label = r'$$\text{Dictionary size} (2^{\text{bits}}) $$'
bal_fig.yaxis.axis_label = r'$$\text{P(True) } [ \% ]$$'
show(bal_fig)

From above, the dictionary size has very little effect on the class balance of the binary target variable.  The binary target is true when $D_{EMD}(P||Q) < D_{EMD}(P||\mathbb{U})$.

### Regression Model

In [15]:
# for each bitrate build two figures: test error by attribute set, and
# predicted vs. actual EMD for the attribute set with the lowest test RMSE
plots = []
for b, set_dict in all_test_results.items():
    attribute_sets = list(set_dict.keys())

    y1 = [set_dict[e]['test_rmse'] for e in attribute_sets]
    y2 = [set_dict[e]['test_mae'] for e in attribute_sets]

    source = ColumnDataSource({'x': attribute_sets, 'y1': y1, 'y2': y2})

    title = f'{b} bits'
    if len(plots) == 0:
        fig = figure(title=title, x_range=attribute_sets)
    else:
        # share the y-range of the first figure so the panels are comparable
        fig = figure(title=title, x_range=attribute_sets, y_range=plots[0].y_range)
    fig.line('x', 'y1', legend_label='rmse', color='green', source=source, line_width=3)
    fig.line('x', 'y2', legend_label='mae', color='dodgerblue', source=source, line_width=3)
    fig.legend.background_fill_alpha = 0.6
    fig.yaxis.axis_label = r'$$\text{Error } [L/s/\text{km}^{2}]$$'

    # pick the attribute set with the lowest test RMSE
    result_df = pd.DataFrame({'set': attribute_sets, 'rmse': y1, 'mae': y2})
    best_rmse_set = result_df.loc[result_df['rmse'].idxmin(), 'set']
    best_result = set_dict[best_rmse_set]['test_df']

    xx, yy = best_result['actual'], best_result['predicted']
    slope, intercept, r, p, se = linregress(xx, yy)

    sfig = figure(title=f'Test: {b} bits best model {best_rmse_set} (N={len(best_result)})')
    sfig.scatter(xx, yy, size=1, alpha=0.6)
    # best-fit line over the range of the actual values
    xpred = np.linspace(np.min(xx), np.max(xx), 100)
    ybf = slope * xpred + intercept
    sfig.line(xpred, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f}')
    # plot a 1:1 line spanning both the actual and the predicted values so the
    # reference covers the full extent of the scatter
    lo = min(np.min(xx), np.min(yy))
    hi = max(np.max(xx), np.max(yy))
    sfig.line([lo, hi], [lo, hi], color='black', line_dash='dotted',
              line_width=2, legend_label='1:1')
    sfig.xaxis.axis_label = r'$$\text{Actual EMD } [L/s/\text{km}^{2}]$$'
    sfig.yaxis.axis_label = r'$$\text{Predicted EMD } [L/s/\text{km}^{2}]$$'
    sfig.legend.location = 'top_left'
    plots.append(fig)
    plots.append(sfig)

In [16]:
# arrange the figures two per row: error plot on the left, scatter on the right
layout = gridplot(plots, ncols=2, width=350, height=300)
show(layout)

## Discussion

The EMD is not sensitive to the bitrate.  While the coefficient of determination (COD, $R^2$) suggests the model predicts EMD fairly well, the distribution of values is heavily skewed, making the L2 loss function sensitive to outliers.  In addition, a) the model predicts many negative EMD values, and b) the model … (TODO: this point is unfinished).

- Discussion of the analogy between EMD and volume error, i.e., frequency × distance moved.

## Citations

```{bibliography}
:filter: docname in docnames
```