# Ensemble Machine Learning for Void Filling in Glacier Elevation Change Maps
*By Cameron Markovsky*

## 03 - XGB Model Tuning

### Imports

In [33]:
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import pickle

### Define DataPrepper Class

In [3]:
class DataPrepper:

    def __init__(self, filename):
        self.filename = filename
        self.data = self.load_data()
        self.train, self.test = self.preprocess_data()
        # self.X = pd.concat([self.X_train, self.X_test])
        # self.y = pd.concat([self.y_train, self.y_test])
        
        # self.train = self.train.drop(columns = ['RGIId_Full', 'void_mask'])
        # self.test = self.test.drop(columns = ['RGIId_Full', 'void_mask'])
        # self.full = self.data.drop(columns = ['RGIId_Full', 'void_mask'])
        # self.X = self.X.drop(columns = ['RGIId_Full', 'target', 'void_mask'])


    def load_data(self):
        try:
            data = pd.read_csv(self.filename)
        except:
            data = pd.read_feather(self.filename)
        try:
            data = data.drop(columns=['Unnamed: 0'])
        except:
            pass
        data = data.rename(columns={'dh1': 'target', 'elevation': 'z'})
        data = data.dropna()
        return data

    def preprocess_data(self):
        train = self.data[self.data['void_mask'] == False]
        # y_train = x_train['target']
        
        test = self.data[self.data['void_mask'] == True]
        # y_test = x_test['target']
        return train, test

### Define global parameters and Config Class

In [39]:
filename = '../data/ts1.csv' # name of file to tune XGB on
n_trials = 10 # number of Optuna trials to run
save_cfg = True # save the model configuration?

class config:

    filename = filename
    # filename = 'data/ts1.csv'
    if filename == '../data/ts1.csv':
        savename = 'ts'
    elif filename == 'data/ehim_full.csv':
        savename = 'ehim'
    elif filename == 'data/whim_full.csv':
        savename = 'whim'
    else:
        savename = 'other'

    dp = DataPrepper(filename)
    train, test = dp.train, dp.test
    full = pd.concat([train, test])
    # tune_train, tune_test = train_test_split(full, test_size=0.37, random_state=42)
    features = ['x', 'y', 'z', 'Area', 'Zmin', 'Zmed', 'Zmax', 'Slope', 'dc_ratio', 'HI', 'sin_Aspect', 'cos_Aspect']
    target = 'target'
    n_rounds = 1
    n_trials = n_trials
    save_cfg = save_cfg

    def save_cfg(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
        return

### Define the XGB Objective function

In [27]:
def objective_xgb(trial):
    n = trial.number
    validation, test_valid = train_test_split(config.test, test_size=0.50)

    # Suggest values of the hyperparameters using a trial object.
    params = {
        "objective": 'reg:squarederror',
        'tree_method': "hist",
        # "device": "cuda",
        'lambda': trial.suggest_float('lambda', 1e-3, 100.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 100.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-3, 100, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 200),
    }

    # X_train, y_train = CFG.tune_train[CFG.features], CFG.tune_train[CFG.target] # Random split of 63% train, 37% test
    # X_test, y_test = CFG.tune_test[CFG.features], CFG.tune_test[CFG.target] # Random split of 63% train, 37% test

    train = config.train
    # test = CFG.test_valid

    X_train, y_train = train[config.features], train[config.target]
    X_valid, y_valid = validation[config.features], validation[config.target]
    # X_test, y_test = test[CFG.features], test[CFG.target]

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dvalidation = xgb.DMatrix(data=X_valid, label=y_valid)
    # dtest = xgb.DMatrix(data=X_test, label=y_test)
    
    model_xgb = xgb.train(
            params,
            dtrain,
            evals=[(dvalidation, 'eval')],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=False
        )
    # model = xgb.XGBRegressor(**params)
    # model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    y_preds_xgb = model_xgb.predict(dvalidation)

    rmse = root_mean_squared_error(y_valid, y_preds_xgb)
    return rmse

### Define methods to run Optuna optimization and retrieve the best parameters

In [30]:
def run_xgb_optuna(n_trials=10):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective_xgb, n_trials=n_trials)
    return study, study.best_params, study.best_value

def get_xgb_params(best_params):
    xgb_params = {
        "objective": 'reg:squarederror',
        # "tree_method": "hist",
        # "device": "cuda",
        # "num_boosted_round": 1000, #trial.suggest_int("n_estimators", 1, 2000),
        # "early_stopping_rounds": 50, #100
        'lambda': best_params['lambda'], 
        'alpha': best_params['alpha'], 
        'learning_rate': best_params['learning_rate'], 
        'gamma': best_params['gamma'], 
        'max_depth': best_params['max_depth'], 
        'subsample': best_params['subsample'], 
        'colsample_bytree': best_params['colsample_bytree'], 
        'min_child_weight': best_params['min_child_weight']
    }
    return xgb_params

# Run the Optuna optimiziation and save the configuration

In [38]:
cfg = config()


print(f'Optimizing XGBoost hyperparameters on {filename.split('/')[2]} using {n_trials} trials')
print(f'Train size: {len(cfg.train)}; Test size: {len(cfg.test)}')
study, best_params, best_rmse = run_xgb_optuna(n_trials)
xgb_params = get_xgb_params(best_params)
config.xgb_params = xgb_params
print(f'Best XGB Parameters: {best_params}')
print('-----------------------------------')
print('Finished optimizing XGBoost hyperparameters.\n')
if save_cfg:
    cfg.save_cfg(f'../models/{cfg.savename}_cfg.pkl')
print(f'Saved model configuration to ../models/{cfg.savename}_cfg.pkl')

[I 2025-06-10 11:01:51,953] A new study created in memory with name: no-name-28e07a06-4ac2-487c-add0-8102d1d95b0c


Optimizing XGBoost hyperparameters on ts1.csv using 10 trials
Train size: 1786; Test size: 1042


[I 2025-06-10 11:01:52,497] Trial 0 finished with value: 0.47485789707060083 and parameters: {'lambda': 1.556981142447684, 'alpha': 0.012530083882374227, 'learning_rate': 0.03926153881790991, 'gamma': 0.16497599858860146, 'max_depth': 6, 'subsample': 0.5230790731182884, 'colsample_bytree': 0.8212602748454507, 'min_child_weight': 64}. Best is trial 0 with value: 0.47485789707060083.
[I 2025-06-10 11:01:52,746] Trial 1 finished with value: 0.5054362065940325 and parameters: {'lambda': 0.026033885354085944, 'alpha': 0.5819617905285948, 'learning_rate': 0.06684033567529597, 'gamma': 0.7219480423536395, 'max_depth': 13, 'subsample': 0.471313027491404, 'colsample_bytree': 0.8323267100189169, 'min_child_weight': 72}. Best is trial 0 with value: 0.47485789707060083.
[I 2025-06-10 11:01:53,414] Trial 2 finished with value: 0.5197453618356351 and parameters: {'lambda': 0.029692497962988052, 'alpha': 0.23394400008976124, 'learning_rate': 0.00508807735372225, 'gamma': 0.3295702032078926, 'max_dept

Best XGB Parameters: {'lambda': 1.556981142447684, 'alpha': 0.012530083882374227, 'learning_rate': 0.03926153881790991, 'gamma': 0.16497599858860146, 'max_depth': 6, 'subsample': 0.5230790731182884, 'colsample_bytree': 0.8212602748454507, 'min_child_weight': 64}
-----------------------------------
Finished optimizing XGBoost hyperparameters.

Saved model configuration to ../models/ts_cfg.pkl
