In [4]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
import warnings
from matplotlib import pyplot as plt
import copy
import math

#### Set `MERGED_CSV_FILE_PATH` to the path of your final CSV file containing the merged data from 2019, 2020, and 2021. We train a single model on all 3 years combined

#### This notebook was run on a machine with a high CPU count

In [5]:
# Path to the merged 2019-2021 CSV; replace the placeholder folder before running.
MERGED_CSV_FILE_PATH = "YOUR_MERGED_FOLDER_PATH/merged_2019_2020_2021.csv"

# Read the file and discard its first column.
# NOTE(review): the first column is presumably a saved index -- confirm against the CSV.
combined_dataframe = pd.read_csv(MERGED_CSV_FILE_PATH)
combined_dataframe = combined_dataframe.drop(columns=combined_dataframe.columns[0])

# 'agbd_points' and 'overlap' are read as strings holding Python literals;
# parse them back into Python objects.
combined_dataframe = combined_dataframe.assign(
    agbd_points=combined_dataframe['agbd_points'].apply(ast.literal_eval),
    overlap=combined_dataframe['overlap'].apply(ast.literal_eval),
)

# Row-wise view of the frame as nested Python lists.
list_ = combined_dataframe.values.tolist()

In [6]:
# One-hot encode the Ecoregion_l3 column and append the indicator columns
# to the right of the existing columns.
dummies = pd.get_dummies(combined_dataframe["Ecoregion_l3"])
combined_dataframe_new = combined_dataframe.join(other=dummies)

# Rebuild the row-wise nested list so each row also carries the indicator values.
list_ = combined_dataframe_new.values.tolist()

In [9]:
def clean_up_list(l, overlap_percentage=0.5, agbd_index=18, overlap_index=23):
  """Drop AGBD values whose paired overlap is below a threshold.

  For every row, the sequence stored at ``agbd_index`` is reduced to the values
  whose positionally paired entry in the sequence at ``overlap_index`` is
  greater than or equal to ``overlap_percentage``. Pairing is done with
  ``zip``, so surplus entries in the longer of the two sequences are ignored.

  Args:
    l: List of rows, each row being a list.
    overlap_percentage: Inclusive lower bound an overlap entry must reach for
      its paired AGBD value to be kept.
    agbd_index: Position of the AGBD sequence inside each row.
    overlap_index: Position of the overlap sequence inside each row.

  Returns:
    A new list of new row lists. Neither ``l`` nor its rows are modified; row
    elements other than the filtered AGBD list are shared with the input rows.
  """
  cleaned_rows = []
  for row in l:
    # Copy the row so the caller's data is never mutated; a shallow copy of the
    # outer list alone would still share (and overwrite) the inner rows.
    new_row = list(row)
    new_row[agbd_index] = [
        value
        for value, overlap in zip(row[agbd_index], row[overlap_index])
        if overlap >= overlap_percentage
    ]
    cleaned_rows.append(new_row)
  return cleaned_rows

# Run the overlap filter on a deep copy so the nested lists inside list_ stay untouched.
filterd_list = clean_up_list(copy.deepcopy(list_))

#### Set `BEST_MEAN_COUNT` to the value that was found in the FeatureSelectionBulkOverlap.ipynb notebook
#### Set `BEST_FEATURES` to the indices of the best features that were found in the FeatureSelectionBulkOverlap.ipynb notebook

In [None]:
# Minimum number of AGBD values a row must contain to be kept by the
# filtering loop that follows (compared against len(agbd_values)).
BEST_MEAN_COUNT = 14
# Positions, within each row list, of the columns used as model features.
BEST_FEATURES = [3, 9, 12, 14, 19, 21, 24]

In [None]:
# Build the modelling table. Each kept row contributes the selected feature
# columns, the columns at row positions 26-43, and, as the last element, the
# natural log of its mean AGBD (the regression target).
# NOTE(review): positions 26-43 are presumably the Ecoregion_l3 dummy columns
# appended by the join above -- confirm the column layout.
# 3, 9, 12, 14, 19, 21, 24 are the best features found from
# FeatureSelectionBulkOverlap.ipynb notebook. Be sure to add the index of
# columns correctly.
feature_columns = BEST_FEATURES + list(range(26, 44))
filtered_by_region_mean = []
for row in filterd_list:
  agbd_values = row[18]
  # Require enough AGBD samples; the emptiness check also protects the mean
  # below from a division by zero when BEST_MEAN_COUNT is 0.
  if not agbd_values or len(agbd_values) < BEST_MEAN_COUNT:
    continue
  # Discard the row if any AGBD value is negative (or not comparable, e.g. NaN).
  if not all(value >= 0 for value in agbd_values):
    continue
  mean_agbd = sum(agbd_values) / len(agbd_values)
  # A row whose AGBD values are all zero has mean 0; math.log(0) raises
  # ValueError, so such rows are skipped instead of aborting the whole loop.
  if mean_agbd <= 0:
    continue
  values_to_add = [row[i] for i in feature_columns]
  values_to_add.append(math.log(mean_agbd))
  filtered_by_region_mean.append(values_to_add)

data_array = np.array(filtered_by_region_mean)
X, y = data_array[:, :-1], data_array[:, -1]

# Per-feature transformation, keyed by row index:
# 3 = yeojohnson
# 9 = yeojohnson
# 12 = yeojohnson
# 14 = yeojohnson
# 19 = None
# 21 = yeojohnson
# 24 = yeojohnson
# These transformations were done by manually checking skewness
# The absolute lowest skewness measure was chosen
# The keys below are column positions inside X (i.e. positions within
# BEST_FEATURES); position 4 (feature 19) is deliberately left untransformed.
# NOTE(review): the transformers and the outlier bounds below are fitted on the
# full data set, before the train/test split performed later in the notebook.
transformer_dict = {idx: PowerTransformer(method='yeo-johnson') for idx in [0, 1, 2, 3, 5, 6]}  # Map column indices to transformers
for idx, transformer in transformer_dict.items():
  X[:, idx] = transformer.fit_transform(X[:, idx].reshape(-1, 1)).flatten()

# Remove target outliers with the 1.5 * IQR rule.
Q1 = np.percentile(y, 25)
Q3 = np.percentile(y, 75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Keep rows whose target is inside the bounds and is not NaN.
keep_mask = ~((y < lower_bound) | (y > upper_bound) | np.isnan(y))
X = X[keep_mask]
y = y[keep_mask]

#### We train in two stages, since a single model tends to overpredict or underpredict values
#### First, fit an RF on the training set
#### Compute residuals = y_train - y_pred
#### Fit a second RF on the training set with these residuals as the dependent variable
#### At prediction time, first predict with RF model 1, then predict with RF model 2
#### Add the predictions of the two models

In [11]:
# Candidate values sampled by the randomised searches for each
# RandomForestRegressor hyperparameter.
rf_hyperparam_grid = {
    "n_estimators": [100, 200, 250, 500],
    # 1.0 (consider every feature at each split) replaces the former 'auto',
    # which meant the same thing for regressors but was removed in
    # scikit-learn 1.3 and now raises an error. It keeps the same list position
    # so seeded sampling is unaffected.
    "max_features": [1.0, 7, 10, 15],
    "min_samples_leaf": [1, 5, 20, 100],
    "min_samples_split": [2, 10, 50, 250],
    "criterion": ["absolute_error", "squared_error"],
    "max_depth": [4, 6, 8, 10, None]
}
# Shuffled 10-fold splitter with a fixed seed, used as the cv argument of the searches.
cv_split = KFold(n_splits=10, shuffle=True, random_state=42)

In [13]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=321, test_size=0.2
)

# Standardise the features using statistics learned from the training split
# only, then apply the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Randomised hyperparameter search for the first-stage forest, scored by
# negative mean absolute error under the cv_split folds. With n_iter=1 only a
# single sampled parameter combination is evaluated.
rf_random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_hyperparam_grid,
    n_iter=1,
    cv=cv_split,
    scoring="neg_mean_absolute_error",
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=10,
    random_state=42,
)
# fit() returns the search object itself, so both names refer to the fitted search.
rf_random_search.fit(X_train, y_train)
tuned_random_model = rf_random_search

#### Copy the best parameters for the Random Forest for later use

In [None]:
# Display the parameter combination selected by the first-stage search.
tuned_random_model.best_params_

In [20]:
# Fresh shuffled 10-fold splitter (same settings as before) for the residual search.
cv_split = KFold(n_splits=10, shuffle=True, random_state=42)

# In-sample predictions of the first-stage model on the training features,
# and what it leaves unexplained: residuals = y_train - y_pred.
random_y_hat = tuned_random_model.predict(X_train)
residuals = np.subtract(y_train, random_y_hat)

In [21]:
# Randomised hyperparameter search for the second-stage (bias) forest, which
# is fitted to the first-stage residuals on the same training features. With
# n_iter=1 only a single sampled parameter combination is evaluated.
rf_random_search_bias = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_hyperparam_grid,
    n_iter=1,
    cv=cv_split,
    scoring="neg_mean_absolute_error",
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=10,
    random_state=42,
)
# fit() returns the search object itself, so both names refer to the fitted search.
rf_random_search_bias.fit(X_train, residuals)
tuned_bias_random_model = rf_random_search_bias

#### Copy the best parameters for the bias estimator Random Forest for later use

In [22]:
# Display the parameter combination selected by the residual (bias) search.
tuned_bias_random_model.best_params_