# 2: Train XGBoost Model

Author: Daniel Lusk

## Imports and configuration

In [None]:
import numpy as np
import spacv

from spacv.visualisation import plot_autocorrelation_ranges
from TrainModelConfig import TrainModelConfig
from utils.geodata import drop_XY_NAs
from utils.visualize import plot_splits
import utils.datasets as datasets
from utils.datasets import DataCollection, Dataset, MLCollection, CollectionName
from utils.dataset_tools import Unit, FileExt

# Flag guarding the IPython-only setup below.
NOTEBOOK = True

if NOTEBOOK:
    # IPython magics (not plain Python): enable the autoreload extension in
    # mode 2, so imported modules are reloaded before each cell is executed.
    %load_ext autoreload
    %autoreload 2

# Configuration object whose attributes (collection names, flags, lags, ...)
# are read by the cells below.
config = TrainModelConfig()

## Load data

Initialize the datasets

In [None]:
# Every dataset in this cell is loaded at the same resolution, and the
# transformed iNaturalist collections share one transform. Both values are
# defined once here (as the debugging cell at the end of the notebook does)
# instead of being repeated in every constructor call.
res = 0.5
y_transform = "exp_ln"

# --- iNaturalist / GBIF / sPlot datasets ---
# Only `inat_gbif` is used as the response when the training collection is
# assembled in the next cell.
inat_orig = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=config.iNat_name,
    transform=y_transform,
)

inat_dgvm = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=CollectionName.INAT_DGVM,
    transform=y_transform,
)

inat_gbif = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=CollectionName.INAT_GBIF,
    # filter_outliers=config.training_config.filter_y_outliers,
)

gbif = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=CollectionName.GBIF,
    band=datasets.GBIFBand.MEAN,
    file_ext=FileExt.GRID,
)

# NOTE(review): the sPlot dataset selects its band through `GBIFBand` —
# confirm that reusing the GBIF enum here is intentional.
splot = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=CollectionName.SPLOT,
    band=datasets.GBIFBand.MEAN,
    file_ext=FileExt.GRID,
)

# --- Predictor datasets (combined into `X` in the next cell) ---
wc = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=config.WC_name,
)

modis = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=config.MODIS_name,
)

soil = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=config.soil_name,
)

vodca = Dataset(
    res=res,
    unit=Unit.DEGREE,
    collection_name=CollectionName.VODCA,
    file_ext=FileExt.NETCDF4,
)


Organize the datasets for training

In [None]:
# Group the datasets by role: predictors go into X, the response into Y
predictor_datasets = [wc, modis, soil, vodca]
response_datasets = [inat_gbif]

X = DataCollection(predictor_datasets)
Y = DataCollection(response_datasets)

# Pair predictors and responses in an MLCollection for training, then drop
# NAs (verbose=1 is forwarded to drop_NAs)
XY = MLCollection(X, Y)
XY.drop_NAs(verbose=1)

## XGBoost

<div class="alert alert-block alert-info">
To-Dos:

1) ~~Create a data frame where you have all response variables and predictors.~~
2) ~~Remove cells that have no value for ANY predictor/response variable (some columns may still contain NAs afterwards).~~
3) ~~Train the models and do the evaluation~~
4) Repeat step 3, but remove rows where you have at least one NA
5) Compare the accuracies from steps 3 and 4 and see which approach performs best.
</div>

### Calculate autocorrelation range of predictors and generate spatial folds for spatial cross-validation

In [None]:
if config.SAVE_AUTOCORRELATION_RANGES:
    # Geometries and predictor columns of the combined data frame
    coords, data = XY.df["geometry"], XY.df[XY.X.cols]

    # Only the third return value (the ranges) is kept
    _, _, ranges = plot_autocorrelation_ranges(
        coords,
        data,
        config.LAGS,
        config.BW,
        distance_metric="haversine",
        workers=10,
    )

    # Store the ranges as an array in the current working directory
    np.save("ranges.npy", np.asarray(ranges))

#### Explore splits for a single response variable

In [None]:
if config.EXPLORE_SPLITS:
    # Response column inspected in this cell.
    # NOTE(review): the name is hard-coded — confirm that this column exists in
    # XY.df for the response collection that was loaded.
    y_col = "iNat_Stem.conduit.density_05deg_ln"

    # Restrict the frame to geometry, predictors and the chosen response, and
    # drop full-NAs; the surviving column names are returned with the frame.
    sample_Xy, sample_X_cols, sample_y_col = drop_XY_NAs(
        XY.df[["geometry", *XY.X.cols, y_col]], XY.X.cols, y_col, True
    )

    # Locations to plot, and the X data on which split dissimilarity will be
    # measured
    sample_locs = sample_Xy["geometry"]
    sample_data = sample_Xy[sample_X_cols]

    # Grid settings: number of tiles of one autocorrelation range that fit
    # into 360 degrees (x) and 180 degrees (y)
    tile_size = config.AUTOCORRELATION_RANGE
    tiles_x, tiles_y = (
        int(np.round(extent / tile_size)) for extent in (360, 180)
    )

    # Spatial blocking
    blocking_options = dict(
        shape="hex",
        method="optimized_random",
        buffer_radius=0.01,
        n_groups=10,
        data=sample_data,
        n_sims=50,
        distance_metric="haversine",
        random_state=config.RNG_STATE,
    )
    hblock = spacv.HBLOCK(tiles_x, tiles_y, **blocking_options)

    # Report the tile size and plot the resulting splits
    print(f"Tile size: {tile_size:.2f} degrees")
    plot_splits(hblock, sample_locs)


### Train models for each response variable (the `2-TrainModel.py` script is now the preferred way to do this)

In [None]:
################### TRAINING  ####################
# Runs only when training is enabled in the configuration.
if config.TRAIN_MODE:
    # A fresh configuration object is created right before training.
    # NOTE(review): presumably so that autoreloaded edits to TrainModelConfig
    # are picked up — confirm, otherwise this re-instantiation is redundant.
    config = TrainModelConfig()
    # Train the response models in XY.Y with the training sub-configuration.
    # resume=True is forwarded to train_Y_models — presumably it continues a
    # previous run instead of starting over; verify against its definition.
    XY.train_Y_models(config.training_config, resume=True)
##################################################


## Tighten hyperparameter ranges based on results

In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the training results and parse the best hyperparameters of each model.
results = pd.read_csv("results/training_results.csv")

# Each non-missing entry of "Best parameters" is parsed with ast.literal_eval.
# Missing entries are removed with dropna() rather than an identity comparison
# against np.nan: NaN values are not guaranteed to be the `np.nan` object
# itself, so `p is not np.nan` can let a NaN through to literal_eval.
params = [ast.literal_eval(p) for p in results["Best parameters"].dropna()]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# Draw one box-and-whisker plot (with quartile annotation and a density curve)
# per hyperparameter found in `params`.

# Parameter names are taken from the first entry; every entry of `params` is
# indexed with these names below.
param_names = list(params[0].keys())

# Fixed-width grid. Ceiling division counts a partially filled last row exactly
# once (the previous `len // 3 + len % 3` produced a surplus row whenever the
# remainder was 2).
n_cols = 3
n_rows = (len(param_names) + n_cols - 1) // n_cols

# squeeze=False keeps `axs` two-dimensional even when there is a single row, so
# the [row, col] indexing below is always valid.
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False
)

for i, name in enumerate(param_names):
    # Row and column of this parameter's subplot
    row, col = divmod(i, n_cols)
    ax = axs[row, col]

    # All values observed for this parameter, and their quartiles
    param_values = [d[name] for d in params]
    q1, median, q3 = np.percentile(param_values, [25, 50, 75])

    ax.boxplot(param_values)

    # Informative x-axis labels and title
    ax.set_xticklabels([name])
    ax.set_xlabel("Parameter")
    ax.set_ylabel("Value")
    ax.set_title(f"Distribution of {name}")

    # Quartile values in the top-right corner of the subplot
    ax.text(
        0.95,
        0.95,
        f"Q1: {q1:.2f}\nMed: {median:.2f}\nQ3: {q3:.2f}",
        transform=ax.transAxes,
        ha="right",
        va="top",
    )

    # Sample density along the right axis. gaussian_kde cannot be fitted when
    # all values are identical, and identical y-limits are degenerate, so the
    # density overlay is skipped for constant parameters.
    lo, hi = min(param_values), max(param_values)
    if lo < hi:
        kde = gaussian_kde(param_values)
        x_vals = np.linspace(lo, hi, 100)
        density_ax = ax.twinx()
        density_ax.plot(kde(x_vals), x_vals, color="red")
        # The twin axis has its own y-scale; give both axes the same limits so
        # the density curve lines up with the boxplot values.
        ax.set_ylim(lo, hi)
        density_ax.set_ylim(lo, hi)

# Remove any unused subplots (grid cells after the last parameter, row-major)
for unused_ax in axs.flat[len(param_names):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Debugging

In [None]:
from datetime import datetime
from sklearn.model_selection import train_test_split

from utils.training import train_model_full
from utils.training import block_cv_splits, optimize_params
from utils.datasets import Dataset, DataCollection, MLCollection
from utils.geodata import drop_XY_NAs
from utils.dataset_tools import Unit
from TrainModelConfig import TrainModelConfig


config = TrainModelConfig(debug=True)

if config.DEBUG:
    %load_ext autoreload
    %autoreload 2
    
    res = 0.5
    y_transform = "exp_ln"

    inat_orig = Dataset(
        res=res,
        unit=Unit.DEGREE,
        collection_name=config.iNat_name,
        transform=y_transform,
    )

    wc = Dataset(
        res=res,
        unit=Unit.DEGREE,
        collection_name=config.WC_name,
    )

    X = DataCollection([wc])
    Y = DataCollection([inat_orig])

    print("\nPreparing data...")
    print("X:")
    for dataset in X.datasets:
        print("    ", dataset.collection_name.short)

    print("Y:")
    for dataset in Y.datasets:
        print("    ", dataset.collection_name.short)

    # Convert to MLCollection for training
    XY = MLCollection(X, Y)
    XY.drop_NAs(verbose=1)

    y_col = XY.Y.cols[0]

    Xy = XY.df[["geometry", *XY.X.cols, y_col]]
    Xy, X_cols, y_cols = drop_XY_NAs(Xy, XY.X.cols, y_col, True)

    X = Xy[X_cols].to_numpy()
    y = Xy[y_col].to_numpy()
    coords = Xy["geometry"]

    X_train, X_test, y_train, y_test, coords_train, coords_test = train_test_split(
        X, y, coords, test_size=0.2, random_state=config.training_config.random_state
    )

    cv = block_cv_splits(
        X=X_train,
        coords=coords_train,
        grid_size=config.training_config.cv_grid_size,
        n_groups=10,
        random_state=config.training_config.random_state,
        verbose=1,
    )
    id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    save_dir = config.training_config.results_dir / "ray-results" / id
    save_dir.mkdir(parents=True, exist_ok=True)
    
    reg = optimize_params(
        X=X_train,
        y=y_train,
        col_name=y_col,
        cv=cv,
        save_dir=save_dir,
        n_trials=200,
        random_state=config.training_config.random_state,
        max_iters=13,
        verbose=1,
    )

    model, r2 = train_model_full(
        model_params=reg.best_params_,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        verbose=1,
    )