In [2]:
import os
import pandas as pd
import numpy as np

import xgboost as xgb
# import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

from dask import array as da
from dask.distributed import Client, LocalCluster

from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource
from bokeh.models import LinearAxis, Range1d
from bokeh.plotting import figure, curdoc, show
from bokeh.io import output_notebook
output_notebook()

## XGBoost

We want to predict the KL divergence between a target with unit area runoff (UAR) distribution P, and a proxy with UAR distribution Q. The proxy is the model, and the KL divergence measures the average number of additional bits needed to fully describe the target series given the model.  

Using the XKBoost library, find a set of hyperparameters that yields low generalization error (error/variance on out-of-sample test data).  

Gradient boosting is a machine learning algorithm where a large number of "weak" (decision tree) models are combined to yield a stronger collective model.  

Here we'll test various hyperparameters for their sensitivity to bias-variance.

* **number of estimators**: the number of trees added in the training sequence
* **learning rate**: the strength of subsequent trees gets scaled by the learning rate which affects convergence

In [3]:
BASE_DIR = os.getcwd()

## Feature importance

At the end of this notebook there is an analysis of feature importance.  For a large sample of simulations, the feature importance score is collected and aggregated to understand the attributes that commonly score high in importance. This is an iterative process, so to begin you can use all features.

In [4]:
features_to_keep = ['proxy_prcp', 'target_prcp', 'target_Slope_deg', 'proxy_Slope_deg',
       'proxy_Drainage_Area_km2', 'target_Centroid_Lon_deg_E',
       'target_Land_Use_Snow_Ice_frac', 'target_low_prcp_freq',
       'proxy_Elevation_m', 'proxy_Centroid_Lat_deg_N',
       'proxy_Land_Use_Snow_Ice_frac', 'proxy_Centroid_Lon_deg_E',
       'target_Drainage_Area_km2', 'proxy_Gravelius',
       'proxy_low_prcp_freq', 'target_Elevation_m',
       'target_low_prcp_duration', 'proxy_Aspect_deg',
       'proxy_Permeability_logk_m2', 'proxy_vp']

features_to_keep = [
    'proxy_prcp', 'target_prcp', 'target_Slope_deg', 'proxy_Slope_deg',
    'target_Centroid_Lon_deg_E', 'proxy_Drainage_Area_km2',
    'target_low_prcp_duration', 'proxy_Elevation_m',
    'target_Drainage_Area_km2', 'target_low_prcp_freq',
    'proxy_Porosity_frac', 'proxy_low_prcp_freq',
    'proxy_Land_Use_Snow_Ice_frac', 'target_srad',
    'proxy_Land_Use_Forest_frac', 'proxy_srad',
    'proxy_low_prcp_duration', 'target_Elevation_m',
    'target_Land_Use_Snow_Ice_frac', 'proxy_Centroid_Lat_deg_N',
    'proxy_Centroid_Lon_deg_E', 'target_Porosity_frac',
]


In [5]:
class ObjectiveFunction:
    def __init__(
        self,
        train_df,
        test_df,
        bitrate,
        feature_columns,
        objective,
        model_params,
        target_column="dkl_post_0R",
    ):
        self.train_df = train_df
        self.test_df = test_df
        self.bitrate = bitrate
        self.target_column = target_column
        
        self.feature_columns = feature_columns
        self.feature_columns = features_to_keep
        
        self.training_frac = 1
        self.objective = objective
        self.model_params = model_params
        self.create_model()

    def prepare_input_data(self):
        # keep just the first rows corresponding to the training size
        # dataframe = self.input_df.loc[:self.training_size, :].copy()
        # Split the data into features and target
        X = self.train_df[self.feature_columns].values
        y = self.train_df[self.target_column].values
        
        X_test = self.test_df[self.feature_columns].values
        y_test = self.test_df[self.target_column].values
        # Standardize the features
        # scaler = QuantileTransformer(output_distribution='normal')
        self.scaler = StandardScaler()
        # use the scaling from training data on the test
        X_train_scaled = self.scaler.fit_transform(X)
        X_test_scaled = self.scaler.transform(X_test)
        return X_train_scaled, y, X_test_scaled, y_test

    def create_model(self):
        # objective = "squarederror"
        # objective = "absoluteerror"
        # Define hyperparameter space
        # formulate a unique model id using the model parameters
        model_id = f"xgb_hist_{self.bitrate}_bits_{self.target_column}"
        for k, v in self.model_params.items():
            if v == "absoluteerror":
                v = "mae"
            elif v == "squarederror":
                v = "mse"
            if isinstance(v, float):
                v = f"{v:1.3e}"
            model_id += f"_{k}_{v}"
        self.model_id = model_id
        
        self.model = xgb.XGBRegressor(
            **self.model_params,
        )

    def train_model(self):
        # X_train, Y_train, X_test, Y_test = self.prepare_input_data()
        self.X_train, self.Y_train, self.X_test, self.Y_test = self.prepare_input_data()
        # train the model
        # self.dtrain = xgb.DMatrix(self.X_train, label=self.Y_train, feature_names=self.feature_columns)
        self.model.fit(self.X_train, self.Y_train)
        # get the model fit mae
        if self.objective == 'squarederror':
            self.training_error = root_mean_squared_error(
                self.Y_train, self.model.predict(self.X_train)
            )
        elif self.objective == 'absoluteerror':
            self.training_error = mean_absolute_error(
                self.Y_train, self.model.predict(self.X_train)
            )
        else:
            raise Exception('self.objective is not recognized')
        return self.training_error
    
    def evaluate_model(self):
        # evaluate the model
        predictions = self.model.predict(self.X_test)
        res_df = pd.DataFrame(
            {"actual": self.Y_test, "predicted": predictions.flatten()}
        )
        # expected_mae = mean_absolute_error(self.Y_test, predictions)
        expected_mse = root_mean_squared_error(self.Y_test, predictions)

        result_fpath = os.path.join(
            BASE_DIR,
            "processed_data",
            "xgb_results",
            f"{self.model_id}_{expected_mse:1.2f}Emse_{self.training_error:1.2f}_Tmse.csv",
        )
        res_df.to_csv(result_fpath)
        return expected_mse

In [6]:
attributes = [
    "Centroid_Lat_deg_N",
    "Centroid_Lon_deg_E",
    "Drainage_Area_km2",
    "Elevation_m",
    "Slope_deg",
    "Aspect_deg",
    "Gravelius",
    "Perimeter",
    "Land_Use_Forest_frac",
    "Land_Use_Grass_frac",
    "Land_Use_Wetland_frac",
    "Land_Use_Snow_Ice_frac",
    "Land_Use_Urban_frac",
    "Land_Use_Shrubs_frac",
    "Land_Use_Crops_frac",
    "Land_Use_Water_frac",
    "Permeability_logk_m2",
    "Porosity_frac",
    "tmax",
    "tmin",
    "prcp",
    "srad",
    "swe",
    "vp",
    "high_prcp_freq",
    "high_prcp_duration",
    "low_prcp_freq",
    "low_prcp_duration",
]

features = []
for c in attributes:
    features.append(f"proxy_{c}")
    features.append(f"target_{c}")
features.append("centroid_distance")

In [7]:
def load_data(b):
    # fname = f'compression_test_results_{b}bits_20240212.csv'
    fname = f"DKL_results_{b}bits_{revision_date}.csv"
    fpath = os.path.join(BASE_DIR, "processed_data", "dkl_test_results", fname)
    return pd.read_csv(fpath, low_memory=False)


def check_for_duplicates(df, col1, col2, keep='first'):
    """
    Check for duplicated rows in a pandas DataFrame based on two columns.

    Parameters:
    - df: The pandas DataFrame to check for duplicates.
    - col1, col2: The names of the two columns to check for duplicates.
    - keep: Determines which duplicates (if any) to mark.
            - 'first': Mark duplicates as True except for the first occurrence.
            - 'last': Mark duplicates as True except for the last occurrence.
            - False: Mark all duplicates as True.

    Returns:
    - A pandas DataFrame with the duplicated rows based on the specified columns.
    """
    # Find duplicates based on the specified columns
    duplicates = df.duplicated(subset=[col1, col2], keep=keep)
    if duplicates.sum()  > 0:
        print(f"    Found {duplicates.sum()} duplicate rows in the dataset.")
        df = df[~duplicates]
        
    return df


def leave_out_at_random(df, unique_stations, K=10):
    excluded_stations = np.random.choice(unique_stations, K, replace=False)
    training_df = df[
        ~df["proxy"].isin(excluded_stations) & ~df["target"].isin(excluded_stations)
    ].copy()
    test_df = df[
        df["proxy"].isin(excluded_stations) & df["target"].isin(excluded_stations)
    ].copy()
    return training_df, test_df

In [8]:
def create_plot(data, n_simulations, param):
    source = ColumnDataSource(data)
    p = figure(title=f"{param} ({n_simulations} simulations)", width=600, height=300)
    p.line(param, 'train_mse_mean', color='dodgerblue',
            legend_label='Train MSE', source=source, line_width=3)
    p.line(param, 'test_mse_mean', color='dodgerblue',
            legend_label='Test MSE', source=source, line_width=3, line_dash='dashed')
    p.yaxis.axis_label = 'MSE'
    std_data = data[[c for c in data.columns if c.endswith('_std')]]
    y1, y2 = std_data.min().min(), std_data.max().max()
    p.extra_y_ranges = {"secondary": Range1d(start=y1-0.05, end=y2+0.05)}
    p.add_layout(LinearAxis(y_range_name="secondary", axis_label='Stdev MSE'), 'right')
    p.line(param, 'train_mse_std', color='green', y_range_name='secondary',
           legend_label='Train stdev', source=source, line_width=3)
    p.line(param, 'test_mse_std', color='green', y_range_name='secondary',
           legend_label='Test stdev', source=source, line_width=3, line_dash='dashed')
    p.xaxis.axis_label = param
    p.legend.location = 'bottom_left'
    p.legend.background_fill_alpha = 0.6
    return p

In [15]:
bitrate = 8
revision_date = "20240409"
n_simulations = 10
K = 250
objective = 'squarederror'
objective = 'absoluteerror'
all_data = load_data(bitrate)
all_data = check_for_duplicates(all_data, 'proxy', 'target')

In [16]:
# get the unique stations
stations = pd.unique(all_data[["proxy", "target"]].values.ravel("K"))

In [17]:
all_data[features] = all_data[features].astype('float32')

First we test the effect of increasing the number of estimators, or the number of "boosting rounds", where a new decision tree is added and initialized based on compensating for the error of the previous tree.

In [18]:
def format_results(sim_results, param):
    result_df = pd.DataFrame(sim_results, columns=[param, 'train_mse', 'test_mse'])
    estimators = result_df.groupby(param)
    mean_mse = estimators.mean()
    std_mse = estimators.std()
    mean_mse.columns = [f'{c}_mean' for c in mean_mse.columns]
    std_mse.columns = [f'{c}_std' for c in std_mse.columns]
    df = pd.concat([mean_mse, std_mse], axis=1)
    df.reset_index(inplace=True)
    return df

In [20]:
param = 'n_estimators'
n_estimators_range = [1, 2, 5, 10, 20, 50, 100]
sim_results = []
n_simulations = 10
target_col = 'tvd'
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for n_est in n_estimators_range:
        model_params = {
            "objective": f"reg:{objective}",
            "n_estimators": n_est,
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column=target_col)
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((n_est, training_error, test_error))
        sim_results.append((n_est, training_error, test_error))
        model_results.append((n_est, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=['n_estimators', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} train/test error")

   Simulation 1/10 complete: 0.14/0.02 train/test error
   Simulation 2/10 complete: 0.14/0.02 train/test error
   Simulation 3/10 complete: 0.14/0.02 train/test error
   Simulation 4/10 complete: 0.14/0.02 train/test error
   Simulation 5/10 complete: 0.14/0.02 train/test error
   Simulation 6/10 complete: 0.13/0.02 train/test error
   Simulation 7/10 complete: 0.15/0.02 train/test error
   Simulation 8/10 complete: 0.14/0.02 train/test error
   Simulation 9/10 complete: 0.14/0.02 train/test error
   Simulation 10/10 complete: 0.14/0.02 train/test error


In [21]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test Max Depth

In [22]:
# max_depth
param = 'max_depth'
param_range = [1, 2, 5, 10, 20, 50]
sim_results = []
n_simulations = 10
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "max_depth": p,
            "n_estimators": 20,
            "random_state": 42,
            "tree_method": 'hist',
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(
        model_results, 
        columns=['learning_rate', 'train_mse', 'test_mse']
    )
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

   Simulation 1/10 complete: 0.58/0.02 mean/stdev error



KeyboardInterrupt



In [None]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the learning rate (aka eta)

Step size shrinkage, default is 0.3.

Looks like as n_estimators increases, optimal value **decreases** (as far as out-of-sample test data is concerned), i.e. for n_estimators = 100, optimal learning rate decreases to 0.1.

In [140]:
param = 'learning_rate'
# learning_rate_range = list(np.logspace(-4, -1, 5)) + list(np.linspace(0.125, 0.3, 5))
param_range = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 2e-1, 3e-1, 4e-1, 5e-1, 6e-1]
sim_results = []
n_simulations = 10
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "learning_rate": p,
            'n_estimators': 100,
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=[param, 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

   Simulation 1/10 complete: 1.37/0.34 mean/stdev error
   Simulation 2/10 complete: 1.25/0.31 mean/stdev error
   Simulation 3/10 complete: 1.46/0.31 mean/stdev error
   Simulation 4/10 complete: 1.42/0.35 mean/stdev error
   Simulation 5/10 complete: 1.35/0.35 mean/stdev error
   Simulation 6/10 complete: 1.34/0.34 mean/stdev error
   Simulation 7/10 complete: 1.27/0.27 mean/stdev error
   Simulation 8/10 complete: 1.32/0.32 mean/stdev error
   Simulation 9/10 complete: 1.45/0.36 mean/stdev error
   Simulation 10/10 complete: 1.33/0.25 mean/stdev error


In [141]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the (L2) regularization parameter

Default is 1, range is $[0, \infty]$.

In [142]:
param = 'lambda'
param_range = [1, 10, 20, 50, 100, 200, 500, 1000]
sim_results = []
n_simulations = 10
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "lambda": p,
            'n_estimators': 100,
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=[param, 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev test error")
      

   Simulation 1/10 complete: 1.05/0.01 mean/stdev test error
   Simulation 2/10 complete: 1.10/0.02 mean/stdev test error
   Simulation 3/10 complete: 1.12/0.03 mean/stdev test error
   Simulation 4/10 complete: 1.13/0.03 mean/stdev test error
   Simulation 5/10 complete: 1.18/0.01 mean/stdev test error
   Simulation 6/10 complete: 1.12/0.02 mean/stdev test error
   Simulation 7/10 complete: 1.17/0.02 mean/stdev test error
   Simulation 8/10 complete: 1.04/0.01 mean/stdev test error
   Simulation 9/10 complete: 1.09/0.03 mean/stdev test error
   Simulation 10/10 complete: 1.12/0.01 mean/stdev test error


In [143]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the L1 regularization parameter (alpha)

In [144]:
# learning_rate_range = list(np.logspace(-4, -1, 5)) + list(np.linspace(0.125, 0.3, 5))
param = 'alpha'
param_range = [0, 1, 10, 100, 100, 200, 500]
sim_results = []
K = 250
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "alpha": p,
            'n_estimators': 100,
            "random_state": 42,
            "tree_method": 'hist',
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=[param, 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev test error")
      

   Simulation 1/10 complete: 1.18/0.01 mean/stdev test error
   Simulation 2/10 complete: 1.14/0.01 mean/stdev test error
   Simulation 3/10 complete: 1.16/0.01 mean/stdev test error
   Simulation 4/10 complete: 1.11/0.02 mean/stdev test error
   Simulation 5/10 complete: 0.99/0.01 mean/stdev test error
   Simulation 6/10 complete: 1.04/0.01 mean/stdev test error
   Simulation 7/10 complete: 1.13/0.01 mean/stdev test error
   Simulation 8/10 complete: 1.13/0.01 mean/stdev test error
   Simulation 9/10 complete: 1.10/0.03 mean/stdev test error
   Simulation 10/10 complete: 1.17/0.01 mean/stdev test error


In [145]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the subsample rate

In [146]:
# learning_rate_range = list(np.logspace(-4, -1, 5)) + list(np.linspace(0.125, 0.3, 5))
param = 'subsample'
param_range = np.linspace(0.4, 1.0, 10)
sim_results = []
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "subsample": p,
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=[param, 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev test error")
      

   Simulation 1/10 complete: 1.12/0.02 mean/stdev test error
   Simulation 2/10 complete: 1.25/0.01 mean/stdev test error
   Simulation 3/10 complete: 1.09/0.01 mean/stdev test error
   Simulation 4/10 complete: 1.15/0.01 mean/stdev test error
   Simulation 5/10 complete: 1.10/0.01 mean/stdev test error
   Simulation 6/10 complete: 1.21/0.01 mean/stdev test error
   Simulation 7/10 complete: 1.19/0.02 mean/stdev test error
   Simulation 8/10 complete: 1.01/0.02 mean/stdev test error
   Simulation 9/10 complete: 1.12/0.02 mean/stdev test error
   Simulation 10/10 complete: 1.13/0.01 mean/stdev test error


In [147]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the colsample

Looks like slight optimum at 0.6 (at least for n_estimators = 100).  Less pronounced when n_estimators is small.

In [148]:
# colsample
param = 'colsample'
param_range = np.linspace(0.4, 1, 10)
sim_results = []
for n in range(n_simulations):
    train_df, test_df = leave_out_at_random(all_data, stations, K)
    model_results = []
    for p in param_range:
        model_params = {
            "objective": f"reg:{objective}",
            "colsample_bytree": p,
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, training_error, test_error))
        sim_results.append((p, training_error, test_error))
        model_results.append((p, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=['learning_rate', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

   Simulation 1/10 complete: 1.18/0.02 mean/stdev error
   Simulation 2/10 complete: 1.09/0.01 mean/stdev error
   Simulation 3/10 complete: 1.22/0.02 mean/stdev error
   Simulation 4/10 complete: 1.15/0.02 mean/stdev error
   Simulation 5/10 complete: 1.20/0.02 mean/stdev error
   Simulation 6/10 complete: 1.11/0.02 mean/stdev error
   Simulation 7/10 complete: 1.26/0.02 mean/stdev error
   Simulation 8/10 complete: 1.14/0.02 mean/stdev error
   Simulation 9/10 complete: 1.11/0.02 mean/stdev error
   Simulation 10/10 complete: 1.17/0.03 mean/stdev error


In [149]:
result_df = format_results(sim_results, param)
f1 = create_plot(result_df, n_simulations, param)
show(f1)

## Test the size of left out sample

In [157]:
# max_depth
param = 'leave_out_size'

param_range = range(10, int(len(stations) * 0.8), 20)
sim_results = []
n_simulations = 25
target_col = 'tvd'
for n in range(n_simulations):
    model_results = []
    for p in param_range:
        train_df, test_df = leave_out_at_random(all_data, stations, p)
        train_size, test_size = len(train_df), len(test_df)
        model_params = {
            "objective": f"reg:{objective}",
            "max_depth": 2,
            "n_estimators": 10,
            "lambda": 0,
            "colsample_bytree": 0.6,
            "learning_rate": 0.3,
            "random_state": 42,
            "tree_method": 'hist',
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column=target_col)
        training_error = model.train_model()
        test_error = model.evaluate_model()
        model_results.append((p, train_size, test_size, training_error, test_error))
        sim_results.append((p, train_size, test_size, training_error, test_error))
        model_results.append((p, train_size, test_size, training_error, test_error))
    model_df = pd.DataFrame(model_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

   Simulation 1/25 complete: 0.18/0.00 mean/stdev error
   Simulation 2/25 complete: 0.18/0.01 mean/stdev error
   Simulation 3/25 complete: 0.18/0.01 mean/stdev error
   Simulation 4/25 complete: 0.18/0.01 mean/stdev error
   Simulation 5/25 complete: 0.18/0.00 mean/stdev error
   Simulation 6/25 complete: 0.18/0.01 mean/stdev error
   Simulation 7/25 complete: 0.18/0.01 mean/stdev error
   Simulation 8/25 complete: 0.18/0.01 mean/stdev error
   Simulation 9/25 complete: 0.18/0.00 mean/stdev error
   Simulation 10/25 complete: 0.18/0.00 mean/stdev error
   Simulation 11/25 complete: 0.18/0.01 mean/stdev error
   Simulation 12/25 complete: 0.18/0.01 mean/stdev error
   Simulation 13/25 complete: 0.18/0.00 mean/stdev error
   Simulation 14/25 complete: 0.18/0.01 mean/stdev error
   Simulation 15/25 complete: 0.18/0.01 mean/stdev error
   Simulation 16/25 complete: 0.18/0.01 mean/stdev error
   Simulation 17/25 complete: 0.18/0.01 mean/stdev error
   Simulation 18/25 complete: 0.18/0.01 

In [158]:
param = 'leave_out_size'
result_df = pd.DataFrame(sim_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
result_df
estimators = result_df.groupby(param)
mean_mse = estimators.mean()
std_mse = estimators.std()
mean_mse.columns = [f'{c}_mean' for c in mean_mse.columns]
std_mse.columns = [f'{c}_std' for c in std_mse.columns]
rdf = pd.concat([mean_mse, std_mse], axis=1)
rdf.reset_index(inplace=True)
rdf.head()

Unnamed: 0,leave_out_size,train_size_mean,test_size_mean,train_mse_mean,test_mse_mean,train_size_std,test_size_std,train_mse_std,test_mse_std
0,10,333078.88,19.36,0.1423,0.17782,774.805746,6.421838,0.00058,0.024028
1,30,321661.88,206.8,0.142416,0.179498,1426.109402,36.987611,0.000492,0.015954
2,50,310655.56,584.08,0.142464,0.17854,1948.03076,88.384067,0.000746,0.009401
3,70,299961.4,1142.48,0.142349,0.174154,1605.22366,100.551446,0.001348,0.009209
4,90,289101.4,1907.0,0.142012,0.177658,2098.354574,168.855708,0.00087,0.007886


In [159]:
source = ColumnDataSource(rdf)
param = 'train_size_mean'
p = figure(title=f"{param} ({n_simulations} simulations)", width=600, height=300)
p.line(param, 'train_mse_mean', color='dodgerblue',
        legend_label='Train MSE', source=source, line_width=3)
p.line(param, 'test_mse_mean', color='dodgerblue',
        legend_label='Test MSE', source=source, line_width=3, line_dash='dashed')

p.line(param, 'train_mse_std', color='green', y_range_name='secondary',
       legend_label='Train stdev', source=source, line_width=3)
p.line(param, 'test_mse_std', color='green', y_range_name='secondary',
       legend_label='Test stdev', source=source, line_width=3, line_dash='dashed')

p.yaxis.axis_label = 'MSE'
p.xaxis.axis_label = 'Training Size'

# plot second x axis
x1, x2 = rdf['leave_out_size'].min(), rdf['leave_out_size'].max()

p.extra_x_ranges = {"secondary": Range1d(start=x1-1, end=x2+1)}
p.add_layout(LinearAxis(x_range_name="secondary", axis_label='N stations held out'), 'above')

# # plot second y axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]].copy()
y1, y2 = std_data.min().min(), std_data.max().max()
p.extra_y_ranges = {"secondary": Range1d(start=y1-0.05, end=y2+0.05)}
p.add_layout(LinearAxis(y_range_name="secondary", axis_label='MSE Stdev.'), 'right')

p.legend.location = 'center_left'
p.legend.background_fill_alpha = 0.6
show(p)

In [152]:
source = ColumnDataSource(rdf)
param = 'train_size_mean'
p = figure(title=f"{param} ({n_simulations} simulations)", width=600, height=300)
p.line(param, 'train_mse_mean', color='dodgerblue',
        legend_label='Train MSE', source=source, line_width=3)
p.line(param, 'test_mse_mean', color='dodgerblue',
        legend_label='Test MSE', source=source, line_width=3, line_dash='dashed')

p.line(param, 'train_mse_std', color='green', y_range_name='secondary',
       legend_label='Train stdev', source=source, line_width=3)
p.line(param, 'test_mse_std', color='green', y_range_name='secondary',
       legend_label='Test stdev', source=source, line_width=3, line_dash='dashed')

p.yaxis.axis_label = 'MSE'
p.xaxis.axis_label = 'Training Size'

# plot second x axis
x1, x2 = rdf['leave_out_size'].min(), rdf['leave_out_size'].max()

p.extra_x_ranges = {"secondary": Range1d(start=x1-1, end=x2+1)}
p.add_layout(LinearAxis(x_range_name="secondary", axis_label='N stations held out'), 'above')

# # plot second y axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]].copy()
y1, y2 = std_data.min().min(), std_data.max().max()
p.extra_y_ranges = {"secondary": Range1d(start=y1-0.05, end=y2+0.05)}
p.add_layout(LinearAxis(y_range_name="secondary", axis_label='MSE Stdev.'), 'right')

p.legend.location = 'center_left'
p.legend.background_fill_alpha = 0.6
show(p)

## Geographic Sample Bias

* Test proximity bias 1 (vary **max** distance)
* Test proximity bias 2 (vary **min** distance)
* Test North-South selection bias


In [153]:
# max_depth
param = 'max_distance'
param_range = np.arange(50, 1100, 100)
sim_results = []
n_simulations = 10
K = 100
for n in range(n_simulations):
    model_results = []
    for p in param_range:
        within_distance = all_data[all_data['centroid_distance'] <= p].copy()
        train_df, test_df = leave_out_at_random(within_distance, stations, K)
        n_train, n_test = len(train_df), len(test_df)
        model_params = {
            "objective": f"reg:{objective}",
            "colsample_bytree": 0.6,
            "lambda": 0,
            "tree_method": 'hist',
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        result = (p, n_train, n_test, training_error, test_error)
        model_results.append(result)
        sim_results.append(result)
    model_df = pd.DataFrame(model_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

   Simulation 1/10 complete: 1.04/0.13 mean/stdev error
   Simulation 2/10 complete: 1.13/0.12 mean/stdev error
   Simulation 3/10 complete: 1.13/0.11 mean/stdev error
   Simulation 4/10 complete: 1.07/0.16 mean/stdev error
   Simulation 5/10 complete: 1.11/0.21 mean/stdev error
   Simulation 6/10 complete: 1.16/0.12 mean/stdev error
   Simulation 7/10 complete: 1.08/0.16 mean/stdev error
   Simulation 8/10 complete: 1.18/0.20 mean/stdev error
   Simulation 9/10 complete: 1.01/0.16 mean/stdev error
   Simulation 10/10 complete: 1.13/0.23 mean/stdev error


In [154]:
param = 'max_distance'
result_df = pd.DataFrame(sim_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
estimators = result_df.groupby(param)
mean_mse = estimators.mean()
std_mse = estimators.std()
mean_mse.columns = [f'{c}_mean' for c in mean_mse.columns]
std_mse.columns = [f'{c}_std' for c in std_mse.columns]
rdf = pd.concat([mean_mse, std_mse], axis=1)
rdf.reset_index(inplace=True)
rdf.head()

Unnamed: 0,max_distance,train_size_mean,test_size_mean,train_mse_mean,test_mse_mean,train_size_std,test_size_std,train_mse_std,test_mse_std
0,50,6828.0,50.8,0.27201,0.889049,155.775479,13.579396,0.006317,0.226978
1,150,36332.3,274.6,0.365325,1.078407,575.30167,56.352462,0.00488,0.255435
2,250,72172.6,560.9,0.416536,1.130103,1023.905291,84.066706,0.006496,0.18509
3,350,105105.1,878.5,0.456671,1.122569,920.838561,84.5212,0.00607,0.155988
4,450,138405.8,1263.8,0.474007,1.134226,1060.035199,129.767998,0.004415,0.162681


In [155]:
source = ColumnDataSource(rdf)
param = 'max_distance'
p = figure(title=f"{param} ({n_simulations} simulations)", width=600, height=300)
p.line(param, 'train_mse_mean', color='dodgerblue',
        legend_label='Train MSE', source=source, line_width=3)
p.line(param, 'test_mse_mean', color='dodgerblue',
        legend_label='Test MSE', source=source, line_width=3, line_dash='dashed')
p.yaxis.axis_label = 'MSE'
p.xaxis.axis_label = 'Training Size'
p.xaxis.axis_label = param

# plot second x axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]]
x1, x2 = rdf['train_size_mean'].min(), rdf['max_distance'].max()

p.extra_x_ranges = {"secondary": Range1d(start=x1-1, end=x2+1)}
p.add_layout(LinearAxis(x_range_name="secondary", axis_label='Mean Training Sample Size'), 'above')

# plot second y axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]]
y1, y2 = std_data.min().min(), std_data.max().max()
p.extra_y_ranges = {"secondary": Range1d(start=y1-0.05, end=y2+0.05)}
p.add_layout(LinearAxis(y_range_name="secondary", axis_label='Stdev MSE'), 'right')

p.line(param, 'train_mse_std', color='green', y_range_name='secondary',
       legend_label='Train stdev', source=source, line_width=3)
p.line(param, 'test_mse_std', color='green', y_range_name='secondary',
       legend_label='Test stdev', source=source, line_width=3, line_dash='dashed')

p.legend.location = 'bottom_right'
p.legend.background_fill_alpha = 0.6
show(p)

In [156]:
# max_depth
param = 'min_distance'
param_range = np.arange(0, 800, 100)
sim_results = []
n_simulations = 10
K = 100
for n in range(n_simulations):
    model_results = []
    for p in param_range:
        outside_distance = all_df[all_df['centroid_distance'] > p].copy()
        train_df, test_df = leave_out_at_random(outside_distance, stations, K)
        n_train, n_test = len(train_df), len(test_df)
        model_params = {
            "objective": f"reg:{objective}",
            "random_state": 42,
        }
        model = ObjectiveFunction(train_df, test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        result = (p, n_train, n_test, training_error, test_error)
        model_results.append(result)
        sim_results.append(result)
    model_df = pd.DataFrame(model_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
      

NameError: name 'all_df' is not defined

In [None]:
param = 'min_distance'
result_df = pd.DataFrame(sim_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
print(result_df)
estimators = result_df.groupby(param)
mean_mse = estimators.mean()
std_mse = estimators.std()
mean_mse.columns = [f'{c}_mean' for c in mean_mse.columns]
std_mse.columns = [f'{c}_std' for c in std_mse.columns]
rdf = pd.concat([mean_mse, std_mse], axis=1)
rdf.reset_index(inplace=True)
rdf.head()

In [None]:
source = ColumnDataSource(rdf)
param = 'min_distance'
p = figure(title=f"{param} ({n_simulations} simulations)", width=600, height=300)
p.line(param, 'train_mse_mean', color='dodgerblue',
        legend_label='Train MSE', source=source, line_width=3)
p.line(param, 'test_mse_mean', color='dodgerblue',
        legend_label='Test MSE', source=source, line_width=3, line_dash='dashed')
p.yaxis.axis_label = 'MSE'
p.xaxis.axis_label = 'Training Size'
p.xaxis.axis_label = param

# plot second x axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]]
x1, x2 = rdf['train_size_mean'].min(), rdf['train_size_mean'].max()

p.extra_x_ranges = {"secondary": Range1d(start=x1-1, end=x2+1)}
p.add_layout(LinearAxis(x_range_name="secondary", axis_label='Mean Training Sample Size'), 'above')

# plot second y axis
std_data = rdf[[c for c in rdf.columns if c.endswith('mse_std')]]
y1, y2 = std_data.min().min(), std_data.max().max()
p.extra_y_ranges = {"secondary": Range1d(start=y1-0.05, end=y2+0.05)}
p.add_layout(LinearAxis(y_range_name="secondary", axis_label='Stdev MSE'), 'right')

p.line(param, 'train_mse_std', color='green', y_range_name='secondary',
       legend_label='Train stdev', source=source, line_width=3)
p.line(param, 'test_mse_std', color='green', y_range_name='secondary',
       legend_label='Test stdev', source=source, line_width=3, line_dash='dashed')

p.legend.location = 'bottom_right'
p.legend.background_fill_alpha = 0.6
show(p)

### Sort the values by geographic region

In [None]:
# max_depth
param = 'min_distance'
param_range = np.arange(0, 800, 100)
sim_results = []
n_simulations = 10
K = 100
for n in range(n_simulations):
    model_results = []
    # get a dataframe of all station coordinates
    # filter by degrees from the northernmost
    # leave the southernmost 100 stations at the end
    for p in param_range:
        
        
        train_df, test_df = leave_out_at_random(all_data, stations, K)
        n_train, n_test = len(filtered_train_df), len(filtered_test_df)
        model_params = {
            "objective": f"reg:{objective}",
            "random_state": 42,
        }
        model = ObjectiveFunction(filtered_train_df, filtered_test_df, bitrate, features, objective, model_params, target_column="dkl_post_0R")
        training_error = model.train_model()
        test_error = model.evaluate_model()
        result = (p, n_train, n_test, training_error, test_error)
        model_results.append(result)
        sim_results.append(result)
    model_df = pd.DataFrame(model_results, columns=[param, 'train_size', 'test_size', 'train_mse', 'test_mse'])
    mean_test_err = model_df['test_mse'].mean()
    stdev_test_err = model_df['test_mse'].std()
    print(f"   Simulation {n+1}/{n_simulations} complete: {mean_test_err:.2f}/{stdev_test_err:.2f} mean/stdev error")
    

# Get rankings of feature importance

For a large set of experiments, retrieve the feature importance scores to generate an overall idea of what features are most influential.  Reducing dimensionality of the problem may help with overfitting?



In [93]:
feature_folder = 'processed_data/feature_importance'
feature_files = os.listdir(feature_folder)
dfs = [pd.read_csv(os.path.join(feature_folder, f)) for f in feature_files]

In [94]:
df = pd.concat(dfs, axis=0)
df.drop(columns=[e for e in df.columns if e.startswith('Unnamed')], inplace=True)
df.reset_index(inplace=True, drop=True)
print(len(df))
df.head()

79030


Unnamed: 0,proxy_prcp,target_prcp,target_Slope_deg,proxy_Slope_deg,target_low_prcp_duration,proxy_Drainage_Area_km2,proxy_Centroid_Lon_deg_E,proxy_Elevation_m,target_low_prcp_freq,target_Centroid_Lon_deg_E,...,target_high_prcp_duration,centroid_distance,target_Land_Use_Water_frac,proxy_Land_Use_Wetland_frac,target_Land_Use_Wetland_frac,proxy_Land_Use_Crops_frac,proxy_high_prcp_duration,target_Land_Use_Crops_frac,proxy_high_prcp_freq,target_high_prcp_freq
0,144.0,129.0,115.0,93.0,72.0,68.0,67.0,66.0,66.0,62.0,...,27.0,25.0,24.0,20.0,20.0,18.0,18.0,13.0,13.0,11.0
1,130.0,127.0,98.0,99.0,49.0,71.0,64.0,57.0,54.0,61.0,...,27.0,31.0,23.0,41.0,21.0,16.0,27.0,10.0,18.0,16.0
2,164.0,129.0,96.0,113.0,59.0,73.0,56.0,57.0,59.0,77.0,...,28.0,24.0,32.0,28.0,26.0,19.0,21.0,11.0,21.0,9.0
3,144.0,118.0,108.0,86.0,57.0,83.0,58.0,65.0,72.0,73.0,...,23.0,37.0,35.0,26.0,32.0,20.0,20.0,18.0,27.0,6.0
4,152.0,137.0,109.0,98.0,71.0,70.0,66.0,60.0,70.0,68.0,...,30.0,33.0,32.0,32.0,25.0,30.0,13.0,15.0,25.0,15.0


In [96]:
mdf = pd.DataFrame()
mdf['mean'] = df.mean(0).sort_values(ascending=False)
mdf['std'] = df.std(0).sort_values(ascending=False)
mdf.index.values[:30]

array(['proxy_prcp', 'target_prcp', 'target_Slope_deg', 'proxy_Slope_deg',
       'target_Centroid_Lon_deg_E', 'proxy_Drainage_Area_km2',
       'target_low_prcp_duration', 'proxy_Elevation_m',
       'target_Drainage_Area_km2', 'target_low_prcp_freq',
       'proxy_Porosity_frac', 'proxy_low_prcp_freq',
       'proxy_Land_Use_Snow_Ice_frac', 'target_srad',
       'proxy_Land_Use_Forest_frac', 'proxy_srad',
       'proxy_low_prcp_duration', 'target_Elevation_m',
       'target_Land_Use_Snow_Ice_frac', 'proxy_Centroid_Lat_deg_N',
       'proxy_Centroid_Lon_deg_E', 'target_Porosity_frac',
       'proxy_Gravelius', 'proxy_Aspect_deg',
       'proxy_Permeability_logk_m2', 'proxy_vp', 'target_Aspect_deg',
       'target_Centroid_Lat_deg_N', 'proxy_Perimeter', 'target_tmax'],
      dtype=object)

### Reference Priors

maximize the divergence bdtween the prior and posterior.  the distance between prior and posterior is (in some sense, in what sense?) the likelihood. if you maximize the distance between prior and posterior you make the likelihood do most of the work and minimizing the work of the prior.  Protects you from having overinfluential prior.  (Jeffreys priors? -- improper prior)