# 2: Train XGBoost Model

Author: Daniel Lusk

## Imports and configuration

In [None]:
import geopandas as gpd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from TrainModelConfig import TrainModelConfig
from utils.data_retrieval import all_gdfs
from utils.visualize import plot_splits

# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project configuration object; later cells read the predictor/response
# file-name attributes (WC_fns, MODIS_fns, soil_fns, iNat_fns) from it.
config = TrainModelConfig()

## Load data

In [None]:
# Predictors: concatenation of the WC, MODIS and soil file-name collections
# from the config, loaded into a single frame by all_gdfs.
X_fns = config.WC_fns + config.MODIS_fns + config.soil_fns
X = all_gdfs(X_fns)

# Response variables: the iNat file names from the config, loaded the same way.
Y_fns = config.iNat_fns
Y = all_gdfs(Y_fns)

Compute Precipitation Annual Range by subtracting BIO14 (precipitation of the driest month) from BIO13 (precipitation of the wettest month).

In [None]:
# Boolean column masks selecting every column whose name contains the
# BIO13 / BIO14 identifier.
is_bio_13 = ["bio_13" in column for column in X.columns]
is_bio_14 = ["bio_14" in column for column in X.columns]

# .values yields 2-D arrays (rows x matching columns); the assignment below
# only works when each mask matches exactly one column.
bio_13 = X.loc[:, is_bio_13].values
bio_14 = X.loc[:, is_bio_14].values

# NOTE(review): the new column is named with "10m" while a later cell reads
# "wc2.1_0.5_deg_bio_1" — confirm the intended resolution tag.
X["wc2.1_10m_bio_13-14"] = bio_13 - bio_14

Drop the unnecessary `x`, `y`, `band` and `spatial_ref` columns.

In [None]:
# Columns removed from both the predictor and the response frames.
# Note: re-running this cell raises KeyError because the columns are already gone.
unneeded_columns = ["x", "y", "band", "spatial_ref"]

X = X.drop(columns=unneeded_columns)
Y = Y.drop(columns=unneeded_columns)

## XGBoost

To-Dos:

1. Match predictors with response variable(s) (just use one variable at first to test all the following steps)
2. Remove all-null predictors/response vars
3. ~~Standardize the data~~ *Not actually necessary for tree-based models!*
4. Divide into spatial CV splits
5. Exclude location columns
6. Test out training
7. Identify optimal hyperparams with grid search + spatial CV
8. Repeat training, but remove ANY rows with NA

Use just one response variable while developing the methodology. In this case, use specific leaf area (SLA).

In [None]:
# Keep only the geometry and the single response column under development,
# then discard every row that contains an NA.
Y = Y[["geometry", "iNat_SLA_05deg_ln"]].dropna()

Y.head(5)

<p>1. Match predictors with response variable</p>

In [None]:
# Keep only the predictor rows whose geometry also occurs in the response frame.
in_response = X["geometry"].isin(Y["geometry"])
X = X.loc[in_response]

# NOTE(review): print_shapes is neither imported nor defined in any cell
# visible in this notebook section — confirm it is in scope on a fresh kernel.
print_shapes(X, Y)

<p>2. Remove all-NA predictors and match response variable with new predictors</p>

In [None]:
# Drop rows in which every predictor column (everything except geometry) is NA.
predictor_columns = X.columns.difference(["geometry"])
X = X.dropna(subset=predictor_columns, how="all")
print_shapes(X, Y)

# Re-synchronise the response with the predictor rows that remain.
has_predictors = Y["geometry"].isin(X["geometry"])
Y = Y.loc[has_predictors]
print_shapes(X, Y)

<p>3. Standardize the data by centering to the mean and scaling to the STD (skipped because not actually necessary for tree-based models)</p>

<p>4. Divide into geographic splits for spatial K-fold cross-validation</p>

4.1 Generate variograms to determine block size

In [None]:
import skgstat as skg

In [None]:
# Build the inputs for the variogram: an (x, y) coordinate pair for each
# geometry (the geometries are accessed via .x / .y, i.e. treated as points)
# and the values of one predictor column.
# Requires numpy (`np`), which must be imported in the notebook's top import
# cell so this cell also runs on a fresh kernel.
XYs = X["geometry"]
coords = np.asarray([(point.x, point.y) for point in XYs])
values = X["wc2.1_0.5_deg_bio_1"].values

In [None]:
# Remove entries whose predictor value is NaN, keeping coords and values aligned.
is_valid = ~np.isnan(values)
coords = coords[is_valid]
values = values[is_valid]

In [None]:
# Build a scikit-gstat Variogram from the NaN-free coordinates and values.
V = skg.Variogram(coordinates=coords, values=values)

In [None]:
# Show the variogram's text representation.
print(V)

In [None]:
# Plot the variogram; used to pick a block size for the spatial CV splits.
V.plot()

In [None]:
import spacv
from spacv.visualisation import plot_autocorrelation_ranges
import numpy as np

# df = gpd.read_file('/opt/conda/lib/python3.7/site-packages/libpysal/examples/baltim/baltim.shp')

# XYs = df['geometry']
# cols = ['NROOM', 'PRICE', 'AGE', 'SQFT']
# X = df[cols]
# y = df['PATIO']

XYs = X["geometry"]

# NOTE(review): `x` is assigned here but not passed to the call below, which
# uses every non-geometry column instead — confirm whether it is still needed.
# (An earlier, commented-out variant filtered XYs and x on NaNs in x.)
x = X[["wc2.1_0.5_deg_bio_1"]]

# Lag distances 0, 50, ..., 350 and the bandwidth handed to spacv.
lags = np.arange(0, 400, 50)
bw = 5

# All predictor columns, i.e. everything except the geometry column.
predictors = X[X.columns.difference(["geometry"])]
plot_autocorrelation_ranges(XYs, predictors, lags, bw)


In [None]:
import matplotlib.pyplot as plt
import spacv

# Spatial K-fold cross-validator with 10 splits and a buffer radius of 10.
skcv = spacv.SKCV(n_splits=10, buffer_radius=10)
XYs = X["geometry"]

# Visualise the resulting splits over the sample geometries.
plot_splits(skcv, XYs)

In [None]:
from spacv.grid_builder import construct_blocks

fig, ax = plt.subplots()

# Partition the data extent into a 10 x 10 tile grid, assigned at random to
# 3 groups, and colour each tile by its grid_id.
blocks = construct_blocks(X, method="random", tiles_x=10, tiles_y=10, n_groups=3)
blocks.plot(column="grid_id", ax=ax, edgecolor="black", cmap="viridis")

## Old

Drop NaNs from labels and convert dataframes to numpy arrays

In [None]:
# Convert predictors and labels to plain numpy arrays.
# The geometry column is dropped first: it is a location column holding
# geometry objects, not a numeric feature/label, and leaving it in produces
# object-dtype arrays that xgb.DMatrix cannot convert to floats.
X_np = X.drop(columns=["geometry"]).to_numpy()

# NOTE(review): dropping NA rows from Y only can desynchronise it from X;
# Y was already NA-filtered and matched to X by geometry in earlier cells,
# so this is expected to be a no-op — confirm row order/length agree.
Y_np = Y.dropna().drop(columns=["geometry"]).to_numpy()

Split into train and test and convert data into DMatrices for XGBoost

In [None]:
# Hold out 20% of the rows for testing. A float test_size must lie strictly
# between 0 and 1; the previous value of 2.0 makes scikit-learn raise a
# ValueError (0.2 is presumably what was intended).
# random_state is fixed so the split is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_np, Y_np, test_size=0.2, random_state=42
)

# Wrap the splits in XGBoost's DMatrix container, attaching the labels.
D_train = xgb.DMatrix(X_train, label=Y_train)
D_test = xgb.DMatrix(X_test, label=Y_test)