Notebook to train the XGBRegressor using the selected features and calibrated hyperparameters. An ensemble model of the best predictors and parameters for each fold is built.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
import pickle

Helper classes and functions used in the training pipeline.

In [48]:
class BGRBinaryEncoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        self
    def fit(self, X , y=None):
        return self
    def transform(self, X):
        df = X.copy()

        df["bgr_tuple"] = [binary_encoding(t) for t in df.bgr]

        # df_bin = df['bgr_tuple'].apply(pd.Series, index=['bgr1','bgr2','bgr3'])
        # print(df_bin)
        df_bin=pd.DataFrame(df['bgr_tuple'].tolist())
        # print(df_bin)
        df_bin.columns=['bgr1', 'bgr2', 'bgr3']
        # print(df_bin)

        df = pd.concat([df,df_bin],axis='columns')

        # print("blablalbla")
        print(df)

        # df = df.dropna()

        df = df.drop(["bgr_tuple","bgr"],axis="columns")

        return df
    
# custom transformer for sklearn pipeline
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        return X[self.cols]

    def fit(self, X, y=None):
        return self    
    
class LogarithmizeWaterObservables(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X , y=None):
        return self
    def transform(self, X):
        df = X.copy()
        log_preds = log_predictors(self.cols)
        df[log_preds] = np.log(df[log_preds]+1)     
        return df    

def keep_predictor(plist,pname,pval):
    if pval>0.5:
        plist.append(pname)
    return plist


def build_predictor_list(predictor_dict):
    plist = []
    for predictor in predictor_dict:
        plist = keep_predictor(plist,predictor,predictor_dict[predictor])
    return plist    

def func(x):
    return np.log(x+1)

def inverse_func(x):
    return np.exp(x)-1

def binary_encoding(bgr):
    match bgr:
        case "Palearctic":
            return (0,0,0)
        case "Indomalayan":
            return (0,1,0)
        case "Australasia":
            return (0,0,1)
        case "Nearctic":
            return (0,1,1)
        case "Afrotropic":
            return (1,0,0)
        case "Neotropic":
            return (1,1,0)                    

log_transformer = FunctionTransformer(func=func,inverse_func=inverse_func)

def log_predictors(predictor_list):
    all = ["yp","pwm","pet","ps","pcq","pdq","pwaq","pweq"]
    return [f for f in all if f in predictor_list]


def predictor_list(predictor_string):
    pred = predictor_string.split("-")
    return pred 

Load the optimal hyperparameters file and get the predictor list.

In [16]:

hp_file = "/home/dibepa/git/global.agb.ml/data/training/predictor_selection_onlybioclim/best_predictors_hp_absolute.csv"

hp = pd.read_csv(hp_file)

hp["combination"] = hp.combination.apply(lambda x: predictor_list(x))

Build the training pipeline

In [49]:
pipe_list = []

 # Iterate over each set of predictors and hyper-parameters
for index, row in hp.iterrows():

    bst = XGBRegressor(
        n_estimators=1000,
        learning_rate=float(row.e),
        max_depth = int(row.md),
        min_child_weight=float(row.mcw),
        subsample=float(row.subsample),
        min_split_loss = float(row.g),
        max_delta_step = float(row.mds),
        eval_metric = "rmse",
        objective='reg:squarederror',
    )

    regr = TransformedTargetRegressor(regressor=bst,func=func,inverse_func=inverse_func)

    plist = row.combination
    log_preds = log_predictors(plist)
    
    if len(log_preds)>0:
        if "bgr" in plist:
            estimator = Pipeline([
                ("col_extract", ColumnExtractor(plist)),
                ("bgr_binary", BGRBinaryEncoding()),
                ("log_water", LogarithmizeWaterObservables(plist)),
                ("regressor",regr)
            ])
        else:   
            estimator = Pipeline([
                ("col_extract", ColumnExtractor(plist)),
                ("log_water", LogarithmizeWaterObservables(plist)),
                ("regressor",regr)
            ])
    else:
        if "bgr" in plist:
            estimator = Pipeline([
                ("col_extract", ColumnExtractor(plist)),
                ("bgr_binary", BGRBinaryEncoding()),
                ("regressor",regr)
            ])
        else:
            estimator = Pipeline([
                ("col_extract", ColumnExtractor(plist)),
                ("regressor",regr)
            ])

    pipe_list.append(estimator)

estimators = [("pipe_{}".format(i),p) for i,p in enumerate(pipe_list)]

Build the ensemble regressor, train it and save the serialized model.

In [50]:
ensemble_regressor = VotingRegressor(
    estimators = estimators
)    

data_file = "/home/dibepa/git/global.agb.ml/data/training/detailed.allometries.model/ABD_training_dataset.csv"
data = pd.read_csv(data_file)

# Removing extremely large ABD values
data = data[data['abd']<5000].reset_index(drop=True)

y = np.array(data["abd"])
X = data.drop("abd",axis="columns")

print(X)

ensemble_regressor.fit(X,y)

pickle.dump(ensemble_regressor, open("/home/dibepa/git/global.agb.ml/data/training/detailed.allometries.model/abd_model_bioclim_notrees_noweights.pkl", "wb"))

      Unnamed: 0        iso       mtwm        mdr       mtwq       mtcm  \
0              0  55.911704  28.133846  10.004359  21.545770   1.074615   
1              1  66.978914  29.298148  10.506366  23.391744   7.881944   
2              2  64.616033  32.378330  12.503499  25.653198  10.787133   
3              3  42.409183  30.386528  12.644559  22.194214  -6.249741   
4              4  33.307705  26.276936   7.826691  20.531338   2.682968   
...          ...        ...        ...        ...        ...        ...   
3717        3735  23.608137  31.799999  11.025000  23.533333 -14.900000   
3718        3736  82.401665  37.799999  13.266666  30.400000  21.700001   
3719        3737  66.261810  20.700001  16.366667  12.750000  -4.000000   
3720        3738  64.270386  19.100000  14.975000  11.766666  -4.200000   
3721        3739  28.732849  31.000000  11.866667  25.316668 -10.300000   

             pcq        pdm         pdq        pwaq  ...        tdq  \
0     571.969231  20.376923 

  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


              pet         bgr           ts       mtcm        tar  bgr_tuple  \
0     1256.692308   Neotropic   635.379238   1.074615  27.059230  (1, 1, 0)   
1     1366.476852   Neotropic   413.874934   7.881944  21.416204  (1, 1, 0)   
2     1610.582393   Neotropic   337.989261  10.787133  21.591196  (1, 1, 0)   
3     1894.269430   Neotropic   911.284273  -6.249741  36.636270  (1, 1, 0)   
4     1206.600323   Neotropic   588.654297   2.682968  23.593968  (1, 1, 0)   
...           ...         ...          ...        ...        ...        ...   
3717  1330.000000  Palearctic  1331.256226 -14.900000  46.699997  (0, 0, 0)   
3718  2470.000000  Afrotropic    74.919144  21.700001  16.099998  (1, 0, 0)   
3719  2216.000000   Neotropic   312.212280  -4.000000  24.700001  (1, 1, 0)   
3720  2175.000000   Neotropic   313.579956  -4.200000  23.299999  (1, 1, 0)   
3721  1458.000000  Palearctic  1110.430908 -10.300000  41.299999  (0, 0, 0)   

      bgr1  bgr2  bgr3  
0        1     1     0  
1

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


              pet        iso         bgr           ts       mtcm        tar  \
0     1256.692308  55.911704   Neotropic   635.379238   1.074615  27.059230   
1     1366.476852  66.978914   Neotropic   413.874934   7.881944  21.416204   
2     1610.582393  64.616033   Neotropic   337.989261  10.787133  21.591196   
3     1894.269430  42.409183   Neotropic   911.284273  -6.249741  36.636270   
4     1206.600323  33.307705   Neotropic   588.654297   2.682968  23.593968   
...           ...        ...         ...          ...        ...        ...   
3717  1330.000000  23.608137  Palearctic  1331.256226 -14.900000  46.699997   
3718  2470.000000  82.401665  Afrotropic    74.919144  21.700001  16.099998   
3719  2216.000000  66.261810   Neotropic   312.212280  -4.000000  24.700001   
3720  2175.000000  64.270386   Neotropic   313.579956  -4.200000  23.299999   
3721  1458.000000  28.732849  Palearctic  1110.430908 -10.300000  41.299999   

      bgr_tuple  bgr1  bgr2  bgr3  
0     (1, 1, 0)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


            tdq          pet        iso         bgr          ps           ts  \
0      7.757693  1256.692308  55.911704   Neotropic   60.131920   635.379238   
1     15.709645  1366.476852  66.978914   Neotropic   58.150249   413.874934   
2     20.202652  1610.582393  64.616033   Neotropic  100.731586   337.989261   
3      6.171243  1894.269430  42.409183   Neotropic   43.595186   911.284273   
4     20.391435  1206.600323  33.307705   Neotropic   30.780032   588.654297   
...         ...          ...        ...         ...         ...          ...   
3717  -1.433333  1330.000000  23.608137  Palearctic   17.837023  1331.256226   
3718  28.783333  2470.000000  82.401665  Afrotropic   66.814926    74.919144   
3719   7.133334  2216.000000  66.261810   Neotropic  117.479156   312.212280   
3720   5.966667  2175.000000  64.270386   Neotropic  115.432632   313.579956   
3721  -2.033333  1458.000000  28.732849  Palearctic  144.130203  1110.430908   

           mtcm         pcq        tar 

  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


              pet        iso         bgr           ts       mtcm  bgr_tuple  \
0     1256.692308  55.911704   Neotropic   635.379238   1.074615  (1, 1, 0)   
1     1366.476852  66.978914   Neotropic   413.874934   7.881944  (1, 1, 0)   
2     1610.582393  64.616033   Neotropic   337.989261  10.787133  (1, 1, 0)   
3     1894.269430  42.409183   Neotropic   911.284273  -6.249741  (1, 1, 0)   
4     1206.600323  33.307705   Neotropic   588.654297   2.682968  (1, 1, 0)   
...           ...        ...         ...          ...        ...        ...   
3717  1330.000000  23.608137  Palearctic  1331.256226 -14.900000  (0, 0, 0)   
3718  2470.000000  82.401665  Afrotropic    74.919144  21.700001  (1, 0, 0)   
3719  2216.000000  66.261810   Neotropic   312.212280  -4.000000  (1, 1, 0)   
3720  2175.000000  64.270386   Neotropic   313.579956  -4.200000  (1, 1, 0)   
3721  1458.000000  28.732849  Palearctic  1110.430908 -10.300000  (0, 0, 0)   

      bgr1  bgr2  bgr3  
0        1     1     0  
1

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


             pwm          pet        iso         bgr           ts       mtcm  \
0     266.846154  1256.692308  55.911704   Neotropic   635.379238   1.074615   
1     216.194444  1366.476852  66.978914   Neotropic   413.874934   7.881944   
2     220.164786  1610.582393  64.616033   Neotropic   337.989261  10.787133   
3      51.751295  1894.269430  42.409183   Neotropic   911.284273  -6.249741   
4      93.528710  1206.600323  33.307705   Neotropic   588.654297   2.682968   
...          ...          ...        ...         ...          ...        ...   
3717   24.000000  1330.000000  23.608137  Palearctic  1331.256226 -14.900000   
3718   71.000000  2470.000000  82.401665  Afrotropic    74.919144  21.700001   
3719   15.000000  2216.000000  66.261810   Neotropic   312.212280  -4.000000   
3720   18.000000  2175.000000  64.270386   Neotropic   313.579956  -4.200000   
3721  210.000000  1458.000000  28.732849  Palearctic  1110.430908 -10.300000   

             pcq        tar  bgr_tuple 

  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


              pet        iso         bgr       mtcm  bgr_tuple  bgr1  bgr2  \
0     1256.692308  55.911704   Neotropic   1.074615  (1, 1, 0)     1     1   
1     1366.476852  66.978914   Neotropic   7.881944  (1, 1, 0)     1     1   
2     1610.582393  64.616033   Neotropic  10.787133  (1, 1, 0)     1     1   
3     1894.269430  42.409183   Neotropic  -6.249741  (1, 1, 0)     1     1   
4     1206.600323  33.307705   Neotropic   2.682968  (1, 1, 0)     1     1   
...           ...        ...         ...        ...        ...   ...   ...   
3717  1330.000000  23.608137  Palearctic -14.900000  (0, 0, 0)     0     0   
3718  2470.000000  82.401665  Afrotropic  21.700001  (1, 0, 0)     1     0   
3719  2216.000000  66.261810   Neotropic  -4.000000  (1, 1, 0)     1     1   
3720  2175.000000  64.270386   Neotropic  -4.200000  (1, 1, 0)     1     1   
3721  1458.000000  28.732849  Palearctic -10.300000  (0, 0, 0)     0     0   

      bgr3  
0        0  
1        0  
2        0  
3        0 

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


            tcq         pwm         bgr          ps           ts  bgr_tuple  \
0      6.226538  266.846154   Neotropic   60.131920   635.379238  (1, 1, 0)   
1     13.450618  216.194444   Neotropic   58.150249   413.874934  (1, 1, 0)   
2     17.493492  220.164786   Neotropic  100.731586   337.989261  (1, 1, 0)   
3      0.079707   51.751295   Neotropic   43.595186   911.284273  (1, 1, 0)   
4      6.224134   93.528710   Neotropic   30.780032   588.654297  (1, 1, 0)   
...         ...         ...         ...         ...          ...        ...   
3717  -8.933333   24.000000  Palearctic   17.837023  1331.256226  (0, 0, 0)   
3718  28.566668   71.000000  Afrotropic   66.814926    74.919144  (1, 0, 0)   
3719   5.100000   15.000000   Neotropic  117.479156   312.212280  (1, 1, 0)   
3720   4.066667   18.000000   Neotropic  115.432632   313.579956  (1, 1, 0)   
3721  -2.033333  210.000000  Palearctic  144.130203  1110.430908  (0, 0, 0)   

      bgr1  bgr2  bgr3  
0        1     1     0  
1

  result = func(self.values, **kwargs)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
