# Model Building

## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os
from glob import glob
import argparse
from tqdm import tqdm
from sklearn.neighbors import BallTree
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import optuna
from optuna.samplers import TPESampler
import logging

import warnings
warnings.filterwarnings('ignore')

## Define Files and Directory Paths

In [3]:
# Root folder holding the raw raster and vector datasets
MAIN_DATA_DIR = '/beegfs/halder/DATA'

# Data folder of the GitHub-linked project (processed data and results)
PROJECT_DATA_DIR = '/beegfs/halder/GITHUB/RESEARCH/Landscape-Analysis/data'

# Sub-folders of the project data folder:
#   TEMP_DIR -> intermediate files, OUT_DIR -> outputs
TEMP_DIR, OUT_DIR = (
    os.path.join(PROJECT_DATA_DIR, subfolder) for subfolder in ('TEMP', 'OUTPUT')
)

## Load Hexagonal Grid for Germany

In [4]:
# Run configuration: grid size, projection (EPSG code), crop label and crop code
DISTANCE, EPSG = 2.5, 25832
CROP, CROP_CODE = 'WW', 1110

# Shapefile holding the hexagonal grid for the chosen size and projection
GRID_PATH = os.path.join(
    PROJECT_DATA_DIR,
    'VECTOR',
    f'DE_Hexbins_{DISTANCE}sqkm_EPSG_{EPSG}.shp',
)

# Read the grid, keep only identifier + geometry, and store ids as integers
grids_gdf = gpd.read_file(GRID_PATH)[['id', 'geometry']]
grids_gdf['id'] = grids_gdf['id'].astype(int)

print('Grids Shape:', grids_gdf.shape)
print('Successfully read the grids!')

Grids Shape: (67545, 2)
Successfully read the grids!


## Load the Data

In [5]:
# Read the model-training table and discard its geometry column in a single
# chain, so no in-place mutation of the loaded frame is needed
data = (
    pd.read_csv(os.path.join(OUT_DIR, f'Landscape_Data_{DISTANCE}KM.csv'))
    .drop(columns=['geometry'])
)

print(data.shape)
data.head()

(1151795, 52)


Unnamed: 0,id,NUTS_ID,year,landscape_shannon_diversity_index,landscape_patch_density,landscape_euclidean_nearest_neighbor_mn,productive_prop_landscape,productive_patch_density,productive_edge_density,productive_shape_index,...,tmin_mean,tmin_std,tmax_max,tmax_mean,tmax_std,rad_min,rad_max,rad_mean,rad_std,distributed_yield
0,177,DEA29,2001,1.275164,50.633356,35.429559,63.120532,17.000981,103.266872,11.399003,...,6.474413,5.301266,31.35,13.94203,7.33597,1.76025,30.2035,9.949048,7.644765,9.660467
1,177,DEA29,2002,1.275164,50.633356,35.429559,63.120532,17.000981,103.266872,11.399003,...,6.452189,5.814843,34.225,14.195707,7.24716,1.75925,29.8535,10.030397,7.199292,8.783925
2,177,DEA29,2003,1.275164,50.633356,35.429559,63.120532,17.000981,103.266872,11.399003,...,5.692172,6.386867,35.55,14.497307,8.406264,1.75925,28.2605,11.004956,8.078055,8.787525
3,177,DEA29,2004,1.275164,50.633356,35.429559,63.120532,17.000981,103.266872,11.399003,...,5.100421,5.369863,30.75,13.31229,7.010546,1.75925,28.94,10.089957,7.322121,8.694622
4,177,DEA29,2005,1.275164,50.633356,35.429559,63.120532,17.000981,103.266872,11.399003,...,5.644631,6.005391,33.75,13.636912,8.237541,1.76325,29.40525,10.146389,7.63338,9.065824


In [15]:
# Aggregate to one row per (NUTS_ID, year) by averaging every remaining column.
# The grid-cell 'id' is dropped first so it is not averaged along with the
# features. `errors='ignore'` keeps this cell re-runnable: `data` is
# overwritten here, so on a second execution 'id' is already gone and a plain
# drop would raise KeyError. Re-aggregating an already aggregated frame yields
# the same rows, so the cell is idempotent.
data = (
    data
    .drop(columns=['id'], errors='ignore')
    .groupby(by=['NUTS_ID', 'year'])
    .mean()
    .reset_index()
)
print(data.shape)
data.head()

(7355, 51)


Unnamed: 0,NUTS_ID,year,landscape_shannon_diversity_index,landscape_patch_density,landscape_euclidean_nearest_neighbor_mn,productive_prop_landscape,productive_patch_density,productive_edge_density,productive_shape_index,productive_enn_mn,...,tmin_mean,tmin_std,tmax_max,tmax_mean,tmax_std,rad_min,rad_max,rad_mean,rad_std,distributed_yield
0,DE111,2001,1.133976,91.081012,40.694999,58.959762,32.306092,123.813543,14.933838,36.269269,...,5.462172,5.62652,32.204076,13.642001,7.579564,2.123588,30.559248,10.904269,8.337542,6.05
1,DE111,2002,1.133976,91.081012,40.694999,58.959762,32.306092,123.813543,14.933838,36.269269,...,4.649203,7.078542,35.685532,13.398965,8.574135,2.124509,30.031833,11.370202,7.933198,6.79
2,DE111,2003,1.133976,91.081012,40.694999,58.959762,32.306092,123.813543,14.933838,36.269269,...,5.066559,7.391742,35.123403,14.172588,9.497394,2.123687,30.656636,11.779034,8.297264,5.87
3,DE111,2004,1.133976,91.081012,40.694999,58.959762,32.306092,123.813543,14.933838,36.269269,...,3.812845,5.956027,31.172983,12.510505,8.002659,2.124509,30.555884,11.297833,7.824148,7.68
4,DE111,2005,1.133976,91.081012,40.694999,58.959762,32.306092,123.813543,14.933838,36.269269,...,3.947408,7.160355,34.336317,12.500369,9.350449,2.123588,31.302025,10.924573,8.076493,7.44


## Leave-Location-and-Time-Out with Moving Window (LLTO-MW) Split

In [16]:
def leave_location_and_time_out_moving_window(
    data,
    year_col,
    space_col,
    train_years=10,
    test_years=1,
    test_frac=0.3,
    random_state=42
):
    """
    Leave-Location-and-Time-Out with Moving Window (LLTO-MW)

    Builds cross-validation folds that hold out both time and space:
    - The window slides one step at a time over the sorted unique values of
      `year_col`.
    - Each fold trains on `train_years` consecutive observed years and tests
      on the `test_years` observed years that immediately follow.
    - For every fold the locations are freshly shuffled; the first
      `int((1 - test_frac) * n_locations)` go to training and the rest to
      testing, so train and test locations never overlap within a fold.

    All window bounds are taken by position in the sorted list of observed
    years (not by adding integers to a year value), so gaps in the year
    sequence and non-integer year types are handled consistently.

    A one-line summary of every retained fold is printed.

    Parameters:
    ----------
    data : pd.DataFrame
        Input dataframe containing temporal and spatial columns.
    year_col : str
        Column name representing the year (int or datetime).
    space_col : str
        Column name representing the spatial unit (e.g., district, grid).
    train_years : int, default=10
        Number of consecutive observed years to use for training.
    test_years : int, default=1
        Number of consecutive observed years to use for testing
        (directly after the training window).
    test_frac : float, default=0.3
        Fraction of locations to leave out for testing.
    random_state : int, default=42
        Seed for reproducibility of location sampling.

    Returns:
    -------
    splits : list of (train_indices, test_indices)
        Index-label lists (taken from `data.index`) for training and testing
        data in each LLTO-MW fold. Folds where either side is empty are
        skipped.
    """
    splits = []

    all_years = sorted(data[year_col].unique())
    all_locations = np.array(data[space_col].unique())
    rng = np.random.default_rng(seed=random_state)

    print("\nLeave-Location-and-Time-Out with Moving Window (LLTO-MW)\n" + "-" * 60)

    for start_idx in range(len(all_years) - train_years - test_years + 1):
        test_start_idx = start_idx + train_years

        train_year_start = all_years[start_idx]
        train_year_end = all_years[test_start_idx - 1]
        test_year_start = all_years[test_start_idx]
        # Positional lookup (instead of `test_year_start + test_years - 1`)
        # so the test window always spans exactly `test_years` observed years.
        test_year_end = all_years[test_start_idx + test_years - 1]

        # Random location split, redrawn for every fold
        shuffled_locations = rng.permutation(all_locations)
        split_idx = int((1 - test_frac) * len(shuffled_locations))
        train_locs = shuffled_locations[:split_idx]
        test_locs = shuffled_locations[split_idx:]

        train_mask = (
            data[year_col].between(train_year_start, train_year_end) &
            data[space_col].isin(train_locs)
        )
        test_mask = (
            data[year_col].between(test_year_start, test_year_end) &
            data[space_col].isin(test_locs)
        )

        train_idx = data[train_mask].index
        test_idx = data[test_mask].index

        # Keep only folds that have data on both sides
        if len(train_idx) > 0 and len(test_idx) > 0:
            splits.append((list(train_idx), list(test_idx)))
            print(f"Train: {train_year_start}-{train_year_end} | Test: {test_year_start}-{test_year_end} | "
                  f"Train Locs: {len(train_locs)} | Test Locs: {len(test_locs)} "
                  f"({len(train_idx)} train, {len(test_idx)} test)")

    return splits

In [17]:
def robust_scale_train_test(train_df, test_df):
    """
    Applies RobustScaler to train and test DataFrames based on train data statistics.

    The scaler is fitted on the training data only and then applied to both
    frames, so no information from the test set enters the scaling.

    NaN handling is delegated to RobustScaler: missing values are disregarded
    column by column while fitting and are kept as NaN in the transformed
    output. Rows with a missing value in one column therefore still
    contribute their other columns to the fitted statistics (previously such
    rows were dropped entirely before fitting, which also failed when every
    row had at least one NaN).

    Parameters:
    - train_df (pd.DataFrame): Training dataset (numeric columns)
    - test_df (pd.DataFrame): Testing dataset with the same columns

    Returns:
    - scaled_train (pd.DataFrame): Robust-scaled training data
    - scaled_test (pd.DataFrame): Robust-scaled test data
    Both keep the index and column names of their respective inputs.
    """

    # Fit on the training data only; NaNs are ignored per column by the scaler
    scaler = RobustScaler().fit(train_df)

    # Transform while preserving original index and column names.
    # NaNs in the inputs stay NaN in the outputs, so no explicit restore
    # step is needed.
    scaled_train = pd.DataFrame(
        scaler.transform(train_df),
        index=train_df.index,
        columns=train_df.columns
    )

    scaled_test = pd.DataFrame(
        scaler.transform(test_df),
        index=test_df.index,
        columns=test_df.columns
    )

    return scaled_train, scaled_test

In [19]:
# Columns to be dropped from 'X': the spatial key, the year and the target.
# ('id' is not listed here; only these three names are removed.)
cols_to_be_dropped = ['NUTS_ID', 'year', 'distributed_yield']

# Build the LLTO-MW folds as (train index, test index) pairs:
# 10 training years, 1 test year, 30% of NUTS regions held out per fold
group_index = leave_location_and_time_out_moving_window(
    data=data,
    year_col='year', space_col='NUTS_ID',
    train_years=10, test_years=1,
    test_frac=0.3, random_state=42,
)


Leave-Location-and-Time-Out with Moving Window (LLTO-MW)
------------------------------------------------------------
Train: 2001-2010 | Test: 2011-2011 | Train Locs: 250 | Test Locs: 108 (2400 train, 103 test)
Train: 2002-2011 | Test: 2012-2012 | Train Locs: 250 | Test Locs: 108 (2406 train, 100 test)
Train: 2003-2012 | Test: 2013-2013 | Train Locs: 250 | Test Locs: 108 (2399 train, 98 test)
Train: 2004-2013 | Test: 2014-2014 | Train Locs: 250 | Test Locs: 108 (2391 train, 101 test)
Train: 2005-2014 | Test: 2015-2015 | Train Locs: 250 | Test Locs: 108 (2401 train, 95 test)
Train: 2006-2015 | Test: 2016-2016 | Train Locs: 250 | Test Locs: 108 (2344 train, 97 test)
Train: 2007-2016 | Test: 2017-2017 | Train Locs: 250 | Test Locs: 108 (2309 train, 90 test)
Train: 2008-2017 | Test: 2018-2018 | Train Locs: 250 | Test Locs: 108 (2275 train, 88 test)
Train: 2009-2018 | Test: 2019-2019 | Train Locs: 250 | Test Locs: 108 (2250 train, 83 test)
Train: 2010-2019 | Test: 2020-2020 | Train Locs: 2

## XGBoost Model Implementation

In [20]:
# Define the Optuna objective function
def objective(trial):
    """
    Optuna objective: mean test RMSE of an XGBoost regressor over all LLTO-MW folds.

    For the hyperparameters suggested by `trial`, the model is re-fitted on
    every (train, test) pair in the module-level `group_index`. Features are
    the columns of `data` minus `cols_to_be_dropped`; the target is
    'distributed_yield'. Features are robust-scaled per fold using training
    statistics only. Parameters and per-fold results are written to the
    module-level `logger`.

    Parameters:
    - trial (optuna.trial.Trial): Trial used to sample hyperparameters.

    Returns:
    - float: RMSE averaged over all folds (lower is better).
    """
    # `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform`; the sampled distribution is the same.
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.001, 5, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0001, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0001, 10, log=True)
    }

    # Log trial parameters
    logger.info(f"\n\n\nTrial {trial.number} - Params: {params}")

    # NOTE(review): `predictor` and `device` look like they target different
    # XGBoost generations - confirm against the installed version and drop
    # whichever one is reported as unused.
    model = XGBRegressor(**params, tree_method="hist", predictor="gpu_predictor", device="cuda")

    losses = []
    for fold, (train_index, test_index) in enumerate(group_index):
        # Look each fold up once, then split into features and target
        train_fold = data.loc[train_index]
        test_fold = data.loc[test_index]

        X_train = train_fold.drop(cols_to_be_dropped, axis=1)
        y_train = train_fold['distributed_yield']

        X_test = test_fold.drop(cols_to_be_dropped, axis=1)
        y_test = test_fold['distributed_yield']

        # Robust-scale the features using training-fold statistics only
        X_train_scaled, X_test_scaled = robust_scale_train_test(X_train, X_test)

        # Fit the model on this fold's training data
        model.fit(X_train_scaled, y_train)

        # Predict on the test set
        test_preds = model.predict(X_test_scaled)

        # Calculate RMSE loss on the current test set
        loss = np.sqrt(mean_squared_error(y_test, test_preds))
        losses.append(loss)

        # Log fold results (the value is an RMSE, so label it as such)
        logger.info(f"\tFold {fold}, Test RMSE: {loss:.4f}")

    # Calculate average loss over all folds
    average_loss = np.mean(losses)

    # Log final trial result
    logger.info(f"\tTrial {trial.number} - Average Test RMSE: {average_loss:.4f}\n")

    # Return the average score
    return average_loss

In [21]:
# Configure logging to save logs with timestamps.
# `force=True` removes handlers already attached to the root logger before
# applying this configuration. Without it, `basicConfig` is a no-op when the
# root logger is already configured (e.g. when this cell is re-run in the
# same kernel), so the log file would neither be re-opened nor overwritten.
logging.basicConfig(
    filename=os.path.join(OUT_DIR, f"xgboost_{DISTANCE}KM_NUTS.log"),  # Log file name
    level=logging.INFO,  # Record INFO and above
    format="%(asctime)s - %(levelname)s - %(message)s",  # Includes timestamp
    datefmt="%Y-%m-%d %H:%M:%S",  # Custom timestamp format
    filemode="w",  # "w" = overwrite each run, use "a" to append
    force=True
)
logger = logging.getLogger()
optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

# Seeded sampler so the sequence of suggested hyperparameters is reproducible
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="xgboost", sampler=sampler, direction="minimize")

logger.info("Start optimization.")
study.optimize(objective, n_trials=10, show_progress_bar=True)

  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-06-24 17:41:07,118] Trial 0 finished with value: 25864887570.14415 and parameters: {'max_depth': 10, 'learning_rate': 0.7969454818643931, 'n_estimators': 746, 'min_child_weight': 6, 'gamma': 0.003776663327107336, 'subsample': 0.14321698289111517, 'colsample_bytree': 0.1143098387631322, 'reg_alpha': 2.1423021757741068, 'reg_lambda': 0.10129197956845731}. Best is trial 0 with value: 25864887570.14415.
[I 2025-06-24 17:41:33,357] Trial 1 finished with value: 0.9797308275507576 and parameters: {'max_depth': 16, 'learning_rate': 0.010994335574766204, 'n_estimators': 972, 'min_child_weight': 9, 'gamma': 0.00610149136730271, 'subsample': 0.1519934830130981, 'colsample_bytree': 0.1525472945805261, 'reg_alpha': 0.0033205591037519565, 'reg_lambda': 0.042051564509138675}. Best is trial 1 with value: 0.9797308275507576.
[I 2025-06-24 17:41:56,235] Trial 2 finished with value: 0.9901563523829773 and parameters: {'max_depth': 11, 'learning_rate': 0.038234752246751866, 'n_estimators': 631, 'm