# 2: Train XGBoost Model

Author: Daniel Lusk

## Imports and configuration

In [None]:
import numpy as np
import spacv
import pandas as pd

from spacv.visualisation import plot_autocorrelation_ranges
from TrainModelConfig import TrainModelConfig
from utils.geodata import drop_XY_NAs
from utils.visualize import plot_splits
from utils.datasets import DataCollection, Dataset, MLCollection, CollectionName
from utils.dataset_tools import Unit, FileExt
from pathlib import Path

# Flag guarding the notebook-only setup below. The guarded lines are IPython
# magics (not plain Python syntax), so this cell is meant to run in a notebook.
NOTEBOOK = True

if NOTEBOOK:
    # Load IPython's autoreload extension and use mode 2, which re-imports
    # modules before executing code so edits to project modules are picked up.
    %load_ext autoreload
    %autoreload 2

# Configuration object that the following cells read directories, collection
# names, feature flags and training settings from.
config = TrainModelConfig()

## Load data

Initialize the datasets

In [None]:
# Every dataset below is created with the same resolution and unit, so those
# two arguments are declared once and unpacked into each constructor call.
_HALF_DEGREE_GRID = {"res": 0.5, "unit": Unit.DEGREE}

# --- iNaturalist-derived datasets (ln-transformed where requested) ---
inat_orig = Dataset(
    parent_dir=config.iNat_dir,
    collection_name=config.iNat_name,
    transform="ln",
    **_HALF_DEGREE_GRID,
)

inat_dgvm = Dataset(
    parent_dir=Path("./iNaturalist_traits/maps_iNaturalist/DGVM/continuous_traits/"),
    collection_name=CollectionName.INAT_DGVM,
    transform="ln",
    **_HALF_DEGREE_GRID,
)

# No transform is passed for this dataset.
# Outlier filtering is currently disabled:
#   filter_outliers=config.training_config.filter_y_outliers
inat_gbif = Dataset(
    parent_dir=Path("./iNaturalist_traits/maps_GBIF/traitmaps/TRY_gap_filled/"),
    collection_name=CollectionName.INAT_GBIF,
    **_HALF_DEGREE_GRID,
)

# --- Datasets located through the config object ---
wc = Dataset(
    parent_dir=config.WC_dir,
    collection_name=config.WC_name,
    bio_ids=config.WC_bio_ids,
    **_HALF_DEGREE_GRID,
)

modis = Dataset(
    parent_dir=config.MODIS_dir,
    collection_name=config.MODIS_name,
    **_HALF_DEGREE_GRID,
)

soil = Dataset(
    parent_dir=config.soil_dir,
    collection_name=config.soil_name,
    **_HALF_DEGREE_GRID,
)

# VODCA is the only dataset that specifies a file extension (NetCDF4).
vodca = Dataset(
    parent_dir=Path("./data/vodca/"),
    collection_name=CollectionName.VODCA,
    file_ext=FileExt.NETCDF4,
    **_HALF_DEGREE_GRID,
)


Organize the datasets for training

In [None]:
# Group the four predictor datasets into X and the single response dataset
# (inat_gbif) into Y.
X, Y = DataCollection([wc, modis, soil, vodca]), DataCollection([inat_gbif])

# Pair both collections in one MLCollection for training, then drop NAs.
XY = MLCollection(X, Y)
XY.drop_NAs(verbose=1)

## XGBoost

<div class="alert alert-block alert-info">
To-Dos:

1) ~~Create a data frame where you have all response variables and predictors.~~
2) ~~Remove cells where you do not have a value for ANY predictor/response variable (you still may have NA for some columns then).~~
3) ~~Train the models and do the evaluation~~
4) Repeat step 3, but remove rows where you have at least one NA
5) Compare accuracies of steps 3 and 4 and see what's best.
</div>

### Calculate autocorrelation range of predictors and generate spatial folds for spatial cross-validation

In [None]:
# Optional step: runs only when config.SAVE_AUTOCORRELATION_RANGES is set.
if config.SAVE_AUTOCORRELATION_RANGES:
    # Geometry column and the predictor (X) columns of the combined frame
    coords, data = XY.df["geometry"], XY.df[XY.X.cols]

    # Only the third returned value (the ranges) is kept
    _, _, ranges = plot_autocorrelation_ranges(
        coords,
        data,
        config.LAGS,
        config.BW,
        distance_metric="haversine",
        workers=10,
    )

    # Write the ranges as an array to ranges.npy (relative path)
    np.save("ranges.npy", np.asarray(ranges))

#### Explore splits for a single response variable

In [None]:
# Optional step: runs only when config.EXPLORE_SPLITS is set.
if config.EXPLORE_SPLITS:
    # The single response column inspected in this cell
    y_col = "iNat_Stem.conduit.density_05deg_ln"

    # Geometry + every predictor + the chosen response, passed through
    # drop_XY_NAs (the original comment describes this as dropping full-NAs).
    sample_Xy, sample_X_cols, sample_y_col = drop_XY_NAs(
        XY.df[["geometry", *XY.X.cols, y_col]], XY.X.cols, y_col, True
    )

    # Locations of the samples, and the X data on which split dissimilarity
    # will be measured
    sample_locs = sample_Xy["geometry"]
    sample_data = sample_Xy[sample_X_cols]

    # Grid settings: number of tiles spanning 360 (x) and 180 (y) degrees
    tiles_x, tiles_y = (
        int(np.round(extent / config.AUTOCORRELATION_RANGE)) for extent in (360, 180)
    )

    # Spatial blocking
    hblock = spacv.HBLOCK(
        tiles_x,
        tiles_y,
        shape="hex",
        method="optimized_random",
        buffer_radius=0.01,
        n_groups=10,
        data=sample_data,
        n_sims=50,
        distance_metric="haversine",
        random_state=config.RNG_STATE,
    )

    # Report the tile size, then plot the resulting splits
    print(f"Tile size: {config.AUTOCORRELATION_RANGE:.2f} degrees")
    plot_splits(hblock, sample_locs)


### Train models for each response variable

In [None]:
################### TRAINING  ####################
# Runs only when config.TRAIN_MODE is set.
if config.TRAIN_MODE:
    # Build a fresh TrainModelConfig before reading the training settings.
    # NOTE(review): presumably so that edits reloaded via autoreload take
    # effect without re-running the setup cell — confirm.
    config = TrainModelConfig()
    # Train models for the response (Y) columns with the training settings.
    # NOTE(review): resume=True presumably continues an earlier run instead of
    # starting over — confirm against MLCollection.train_Y_models.
    XY.train_Y_models(config.training_config, resume=True)
##################################################


## Debugging

In [None]:
from sklearn.model_selection import train_test_split

from utils.training import train_model_full


# Debug path: runs the whole pipeline (hold-out split -> spatial CV folds ->
# hyper-parameter search -> final fit) for a single response variable.
# Runs only when config.DEBUG is set.
if config.DEBUG:
    from utils.training import block_cv_splits, optimize_params

    # Read the training settings once instead of on every call below
    training_config = config.training_config

    # Debug against the first response column only
    y_col = XY.Y.cols[0]

    # Geometry + all predictors + the chosen response, then drop NAs. The
    # returned response column name is not needed here, so it is discarded.
    Xy = XY.df[["geometry", *XY.X.cols, y_col]]
    Xy, X_cols, _ = drop_XY_NAs(Xy, XY.X.cols, y_col, True)

    # NOTE: this rebinds the notebook-level name X (the predictor
    # DataCollection) to a NumPy array, exactly as the original cell did.
    X = Xy[X_cols].to_numpy()
    y = Xy[y_col].to_numpy()
    coords = Xy["geometry"]

    # 80/20 hold-out split. It is seeded with the same random_state as the CV
    # and parameter search so repeated debug runs see the same train/test rows;
    # previously the split was unseeded and changed on every run.
    X_train, X_test, y_train, y_test, coords_train, coords_test = train_test_split(
        X,
        y,
        coords,
        test_size=0.2,
        random_state=training_config.random_state,
    )

    # Spatially blocked cross-validation folds built on the training rows only
    cv = block_cv_splits(
        X=X_train,
        coords=coords_train,
        grid_size=training_config.cv_grid_size,
        n_groups=training_config.cv_n_groups,
        random_state=training_config.random_state,
        verbose=1,
    )

    # Hyper-parameter search over those folds
    reg = optimize_params(
        X=X_train,
        y=y_train,
        col_name=y_col,
        cv=cv,
        save_dir=training_config.results_dir,
        n_trials=training_config.search_n_trials,
        random_state=training_config.random_state,
        verbose=1,
    )

    # Fit with the best parameters found; X_test / y_test are the held-out rows
    model, r2 = train_model_full(
        model_params=reg.best_params_,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        verbose=1,
    )