# 2: Train XGBoost Model

Author: Daniel Lusk

## Imports and configuration

In [None]:
import datetime
import os

import numpy as np
import spacv
import pandas as pd

from spacv.visualisation import plot_autocorrelation_ranges
from TrainModelConfig import TrainModelConfig
from utils.data_retrieval import gdf_from_list
from utils.geodata import drop_XY_NAs, merge_gdfs
from utils.training import run_training
from utils.visualize import plot_splits

# IPython autoreload extension: in mode 2, all imported modules are reloaded
# before each cell executes, so edits to the project modules imported in this
# notebook take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project configuration object; later cells read dataset directories/names
# and training constants from its attributes.
config = TrainModelConfig()

## Load data

In [None]:
from utils.datasets import Dataset, DatasetInfo, FileExt, Unit
from utils.datasets import DataCollection

# Keyword arguments shared by every dataset descriptor below: all four
# collections use a resolution of 0.5 in degree units and the TIF extension.
_shared_info = {"res": 0.5, "unit": Unit.DEGREE, "file_ext": FileExt.TIF}

# Response data (iNaturalist); the only descriptor that requests the "ln"
# transform.
inat_info = DatasetInfo(
    parent_dir=config.iNat_dir,
    collection_name=config.iNat_name,
    transform="ln",
    **_shared_info,
)

# Predictor data: WorldClim (restricted to the configured bio ids), MODIS
# and soil.
wc_info = DatasetInfo(
    parent_dir=config.WC_dir,
    collection_name=config.WC_name,
    bio_ids=config.WC_bio_ids,
    **_shared_info,
)

modis_info = DatasetInfo(
    parent_dir=config.MODIS_dir,
    collection_name=config.MODIS_name,
    **_shared_info,
)

soil_info = DatasetInfo(
    parent_dir=config.soil_dir,
    collection_name=config.soil_name,
    **_shared_info,
)

# One Dataset per descriptor.
inat = Dataset(info=inat_info)
wc = Dataset(info=wc_info)
modis = Dataset(info=modis_info)
soil = Dataset(info=soil_info)

# X groups the predictor datasets, Y the single response dataset.
X = DataCollection([wc, modis, soil])
Y = DataCollection([inat])


In [None]:
from utils.datasets import MLCollection


# Pair the predictor collection with the response collection.
XY = MLCollection(X, Y)

# Report the size of the combined table before any rows are removed.
print(f"XY shape: {XY.df.shape}")

# Remove NA entries, asking the collection to report what it does.
# NOTE(review): the exact NA rule lives in MLCollection.drop_NAs — confirm there.
XY.drop_NAs(verbose=1)

## XGBoost

<div class="alert alert-block alert-info">
To-Dos:

1) ~~Create a data frame where you have all response variables and predictors.~~
2) ~~Remove cells where you do not have a value for ANY predictor/response variable (you still may have NA for some columns then).~~
3) ~~Train the models and do the evaluation~~
4) Repeat step 3, but remove rows where you have at least one NA
5) Compare the accuracies from steps 3 and 4 and see which is best.
</div>

### Calculate autocorrelation range of predictors and generate spatial folds for spatial cross-validation

In [None]:
# Optionally estimate the autocorrelation ranges of the predictors and save
# them to "ranges.npy" in the current working directory. Skipped entirely
# unless the config flag is set.
if config.SAVE_AUTOCORRELATION_RANGES:
    # Read from the combined table via `XY.df` and take the predictor columns
    # from `XY.X.cols`, the same access pattern the other cells use, instead
    # of indexing the MLCollection object itself.
    coords = XY.df["geometry"]
    data = XY.df[XY.X.cols]

    # Only the third return value (the ranges) is needed here.
    _, _, ranges = plot_autocorrelation_ranges(
        coords, data, config.LAGS, config.BW, distance_metric="haversine", workers=10
    )

    np.save("ranges.npy", np.asarray(ranges))

#### Explore splits for a single response variable

In [None]:
# Response variable whose spatial splits are inspected in this cell.
y_col = "iNat_Stem.conduit.density_05deg_ln"

# Geometry, every predictor column, and the single response column.
sample_Xy = XY.df[["geometry", *XY.X.cols, y_col]]

# Drop NAs for this predictor/response subset; the helper also returns the
# predictor and response column names to use afterwards.
# NOTE(review): the meaning of the positional `True` flag is defined in
# utils.geodata.drop_XY_NAs — confirm there.
sample_Xy, sample_X_cols, sample_y_col = drop_XY_NAs(
    sample_Xy, XY.X.cols, y_col, True
)

# Locations of the samples, and the predictor values handed to the blocking
# routine as `data`.
sample_locs = sample_Xy["geometry"]
sample_data = sample_Xy[sample_X_cols]

# Tile edge length: configured autocorrelation range divided by config.DEGREE.
tile = config.AUTOCORRELATION_RANGE / config.DEGREE

# Number of tiles needed to span 360 units horizontally and 180 vertically.
tiles_x, tiles_y = (int(np.round(extent / tile)) for extent in (360, 180))

# Hexagonal spatial blocking into 10 groups with a fixed random state.
hblock = spacv.HBLOCK(
    tiles_x,
    tiles_y,
    shape="hex",
    method="optimized_random",
    buffer_radius=0.01,
    n_groups=10,
    data=sample_data,
    n_sims=50,
    distance_metric="haversine",
    random_state=config.RNG_STATE,
)

# Report the tile size and visualise the resulting splits.
print(f"Tile size: {tile:.2f} degrees")
plot_splits(hblock, sample_locs)

### Train models for each response variable

In [None]:
# Train one model per response variable and collect the scores in a CSV.

# Timestamp identifying this run; used in directory, file and run names.
# Only the `datetime` module is imported, so the class is reached through it
# (calling `now()` on the module itself raises AttributeError).
run_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

param_opt_run_dir = os.path.join(config.PARAM_OPT_RESULTS_DIR, run_time)
results_dir = os.path.join(config.MODEL_DIR, run_time)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(param_opt_run_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Full path of the results CSV (already rooted at config.MODEL_DIR).
results_fn = os.path.join(results_dir, f"{run_time}_results.csv")
data_cols = ["model", "params", "mean rmse", "std", "r-squared"]
results_rows = []
results_df = pd.DataFrame(columns=data_cols)

# NOTE(review): assumes `Y.cols` lists the response columns exactly as they
# are named in `XY.df` — confirm against DataCollection/MLCollection.
for y_col in Y.cols:
    print(f"Processing {y_col}...\n")
    run_name = f"{run_time}_{y_col}"

    # Geometry, all predictors and the current response, read from the
    # combined table the same way the exploration cell does.
    Xy = XY.df[["geometry", *XY.X.cols, y_col]]

    # Start from the full predictor list on every iteration so that columns
    # dropped for one response are not silently missing for the next one.
    Xy, X_cols, y_col = drop_XY_NAs(Xy, XY.X.cols, y_col, True)

    model_fn, params, rmse, std, r2 = run_training(
        Xy=Xy,
        X_cols=X_cols,
        y_col=y_col,
        autocorr_range=config.AUTOCORRELATION_RANGE,
        search_n_trials=100,
        n_jobs=-1,
        random_state=config.RNG_STATE,
        param_opt_save_dir=param_opt_run_dir,
        final_save_dir=results_dir,
        run_name=run_name,
    )

    # Rebuild the frame from the accumulated rows; this keeps a clean 0..n-1
    # index and avoids concatenating onto an empty DataFrame.
    results_rows.append([model_fn, params, rmse, std, r2])
    results_df = pd.DataFrame(results_rows, columns=data_cols)

    # Written after every model so finished results survive an interrupted
    # run. `results_fn` already contains config.MODEL_DIR, so it is used as-is
    # rather than being joined onto that directory a second time.
    results_df.to_csv(results_fn)

results_df