In [25]:
import pandas as pd
import os
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from multiprocessing import Pool
import multiprocessing as mp
import warnings
import copy
import math
from pickle import dump

#### Set `MERGED_CSV_FILE_PATH` to the path of your final CSV file, which contains the merged data from 2019, 2020, and 2021. We train a single model on all 3 years combined.

#### This notebook was run with a high CPU count machine

In [3]:
# Location of the merged 2019-2021 CSV; replace the placeholder folder with your own.
MERGED_CSV_FILE_PATH = "YOUR_MERGED_FOLDER_PATH/merged_2019_2020_2021.csv"

combined_dataframe = pd.read_csv(MERGED_CSV_FILE_PATH)
# The first CSV column is discarded before any further processing.
combined_dataframe = combined_dataframe.drop(columns=combined_dataframe.columns[0])

# These two columns are parsed from their string form back into Python literals.
for literal_column in ('agbd_points', 'overlap'):
  combined_dataframe[literal_column] = combined_dataframe[literal_column].apply(ast.literal_eval)

# Row-wise list view of the frame (rebuilt later once the one-hot columns are joined).
list_ = combined_dataframe.to_numpy().tolist()

In [4]:
# One-hot encode the Ecoregion_l3 column and append the indicator columns
# to the right of the existing columns (joined on the row index).
dummies = pd.get_dummies(combined_dataframe["Ecoregion_l3"])
combined_dataframe_new = combined_dataframe.join(dummies)

In [5]:
# Plain list-of-rows version of the frame, now including the one-hot columns;
# downstream code addresses columns by position in each row.
list_ = combined_dataframe_new.to_numpy().tolist()

In [6]:
def clean_up_list(l, overlap_percentage=0.5, agbd_index=18, overlap_index=23):
  """Keep only the AGBD values whose matching overlap meets a threshold.

  Every row is expected to carry two parallel sequences: the AGBD values at
  position ``agbd_index`` and their overlap values at position
  ``overlap_index``. An AGBD value is kept when the overlap value at the same
  position is ``>= overlap_percentage``.

  The input is never modified: each row is copied before its AGBD entry is
  replaced, so callers do not need to deep-copy their data first.

  Args:
    l: List of rows (each row a list-like of column values).
    overlap_percentage: Inclusive minimum overlap for an AGBD value to be kept.
    agbd_index: Position of the AGBD sequence inside a row.
    overlap_index: Position of the overlap sequence inside a row.

  Returns:
    A new list of new rows, identical to the input rows except that the entry
    at ``agbd_index`` holds the filtered list of AGBD values.
  """
  cleaned_rows = []
  for row in l:
    new_row = list(row)
    # zip pairs values positionally and stops at the shorter sequence, so any
    # AGBD value without a corresponding overlap value is dropped.
    new_row[agbd_index] = [
      value
      for value, overlap_value in zip(row[agbd_index], row[overlap_index])
      if overlap_value >= overlap_percentage
    ]
    cleaned_rows.append(new_row)
  return cleaned_rows

# Work on a deep copy so the rows in `list_` keep their unfiltered AGBD values.
filterd_list = clean_up_list(copy.deepcopy(list_))

#### We train in two parts, since having just one model leads to over-predicting or under-predicting values
#### First, fit an RF on the training set
#### Compute the residuals = y_train - y_pred
#### Fit a second RF on the training set with these residuals as the dependent variable
#### During prediction, first predict with RF model 1, then predict with RF model 2
#### Add the results of both models

In [10]:
def rf_bias_reg_IQ(X, y):
  """Fit a base random forest plus a residual ("bias") random forest and score them.

  The data is split 80/20 with a fixed seed, features are standardised with
  statistics from the training split, and two forests are trained:
  the first on ``y_train`` and the second on the first model's training
  residuals. Test predictions are the sum of both models' outputs.

  Both the summed predictions and ``y_test`` are passed through ``np.exp``
  before the metrics are computed, so ``y`` is expected to be on a natural-log
  scale.

  Args:
    X: 2-D feature array.
    y: 1-D target array (log scale, see above).

  Returns:
    An 8-tuple, in this order: ``[mae]``, ``[rmse]``, ``[nrmse]``, ``[mape]``,
    ``[r2]`` (each a one-element list; nrmse and mape are percentages),
    the string ``"rf_bias_reg_IQ"``, ``[model, model_bias]`` and ``[scaler]``.
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=321)

  # Standardise using the training split only; the test split reuses those statistics.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  # Stage 1: unconstrained forest fitted on the (log) target.
  base_params = dict(
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=10,
    max_depth=None,
    criterion="absolute_error",
    random_state=42,
  )
  model = RandomForestRegressor(**base_params)
  model.fit(X_train, y_train)

  # Stage 2: shallower, more regularised forest fitted on the stage-1 training residuals.
  residuals = y_train - model.predict(X_train)
  bias_params = dict(
    n_estimators=200,
    min_samples_split=250,
    min_samples_leaf=5,
    max_features=10,
    max_depth=10,
    criterion="absolute_error",
    random_state=42,
  )
  model_bias = RandomForestRegressor(**bias_params)
  model_bias.fit(X_train, residuals)

  # Combine both stages on the hold-out set, then undo the log scale for scoring.
  log_scale_predictions = model.predict(X_test) + model_bias.predict(X_test)
  bias_corrected_predictions = np.exp(log_scale_predictions)
  y_test = np.exp(y_test)

  mse = mean_squared_error(y_test, bias_corrected_predictions)
  rmse = mse**0.5
  mae = mean_absolute_error(y_test, bias_corrected_predictions)
  r2_test = r2_score(y_test, bias_corrected_predictions)
  mape = mean_absolute_percentage_error(y_test, bias_corrected_predictions) * 100.0
  nrmse = (rmse / y_test.mean()) * 100.0

  return [mae], [rmse], [nrmse], [mape], [r2_test], "rf_bias_reg_IQ", [model, model_bias], [scaler]

#### Add the indices of the best features that were found by the FeatureSelectionBulkOverlap.ipynb notebook

In [None]:
# Positional indices (into each data row) of the selected predictor columns.
BEST_FEATURES = [
  3,
  9,
  12,
  14,
  19,
  21,
  24,
]

In [11]:
warnings.filterwarnings("ignore", message="There are no meaningful features")
def do_analysis(mean_count):
  """Build the modelling table for one minimum-sample threshold and train on it.

  Rows of ``filterd_list`` are kept when they have at least ``mean_count``
  AGBD values (row position 18), all of them non-negative, and a strictly
  positive mean. The target is the natural log of that mean; the predictors
  are the ``BEST_FEATURES`` columns followed by row positions 26..43.
  Selected predictors are Yeo-Johnson transformed, targets outside
  1.5 * IQR of the quartiles are removed, and ``rf_bias_reg_IQ`` is fitted on
  what remains.

  Args:
    mean_count: Minimum number of AGBD values a row needs to be used.

  Returns:
    A tuple ``(mean_count, info)`` where ``info`` is a dict with the keys
    "mae", "rmse", "mape", "nrmse", "r2" (one-element lists), "model"
    (``[base_model, bias_model]``), "scaler" (``[StandardScaler]``) and
    "Power_Transformer_states" (position in X -> ``[fitted PowerTransformer,
    column name]``).

  Raises:
    ValueError: If no row passes the filters for ``mean_count``.
  """
  print("Started Doing for mean count {}\n".format(mean_count))

  # Per-feature transforms, keyed by original column index:
  #   3, 9, 12, 14, 21, 24 = yeojohnson; 19 = None
  # Each pair is (original column index, position of that feature inside X).
  yeo_johnson_columns = [(3, 0), (9, 1), (12, 2), (14, 3), (21, 5), (24, 6)]
  feature_indices = BEST_FEATURES + list(range(26, 44))

  filtered_by_region_mean = []
  for row in filterd_list:
    agbd_values = row[18]
    # An empty list has no mean; it could otherwise slip through when mean_count <= 0.
    if not agbd_values or len(agbd_values) < mean_count:
      continue
    if not all(value >= 0 for value in agbd_values):
      continue
    mean_agbd = sum(agbd_values) / len(agbd_values)
    if mean_agbd <= 0:
      # log(0) is undefined, so an all-zero row cannot provide a log-scale target.
      continue
    values_to_add = [row[i] for i in feature_indices]
    values_to_add.append(math.log(mean_agbd))
    filtered_by_region_mean.append(values_to_add)

  if not filtered_by_region_mean:
    raise ValueError("No rows passed the filters for mean count {}".format(mean_count))

  data_array = np.array(filtered_by_region_mean)
  X, y = data_array[:, :-1], data_array[:, -1]

  # Map position in X -> [transformer, original column name]; the fitted
  # transformers are returned so they can be persisted alongside the models.
  transformer_dict = {
    idx: [PowerTransformer(method='yeo-johnson'), combined_dataframe_new.columns[original_idx]]
    for original_idx, idx in yeo_johnson_columns
  }
  for idx, (transformer, _column_name) in transformer_dict.items():
    X[:, idx] = transformer.fit_transform(X[:, idx].reshape(-1, 1)).flatten()

  # Remove target outliers using the 1.5 * IQR rule.
  Q1 = np.percentile(y, 25)
  Q3 = np.percentile(y, 75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  y = np.where((y < lower_bound) | (y > upper_bound), np.nan, y)
  keep = ~np.isnan(y)
  X, y = X[keep], y[keep]

  # rf_bias_reg_IQ returns nrmse BEFORE mape; unpack in that exact order so
  # the "mape" and "nrmse" entries below hold the right metric.
  mae_c, rmse_c, nrmse_c, mape_c, r2, _name, model_, scaler_ = rf_bias_reg_IQ(X, y)
  rf_bias_reg_IQ_var = { "mae": mae_c, "rmse": rmse_c, "mape": mape_c,
            "nrmse": nrmse_c, "r2": r2, "model": model_, "scaler": scaler_, "Power_Transformer_states": transformer_dict}

  print("Finished Doing for mean count {}\n".format(mean_count))
  return mean_count, rf_bias_reg_IQ_var

#### Add the value for `BEST_MEAN_COUNT` that was found from Feature_SelectionBulk_Overlap.ipynb notebook

In [13]:
# Minimum number of AGBD values a row must have to be used for training.
BEST_MEAN_COUNT = 14

# `results` is the tuple (mean_count, dict of metrics / models / scalers / transformers).
results = do_analysis(mean_count=BEST_MEAN_COUNT)

Started Doing for mean count 14

Finished Doing for mean count 14



#### Save the models for future use

In [26]:
# First entry of the "model" list is the stage-1 (base) random forest.
base_rf_model = results[1]['model'][0]
with open("RF_base_model.pkl", "wb") as base_model_file:
    dump(base_rf_model, base_model_file, protocol=5)

In [27]:
# Second entry of the "model" list is the stage-2 (residual / bias) random forest.
bias_rf_model = results[1]['model'][1]
with open("RF_bias_model.pkl", "wb") as bias_model_file:
    dump(bias_rf_model, bias_model_file, protocol=5)

In [28]:
# Persist the fitted StandardScaler. It lives under the "scaler" key of the
# results dict; the "model" key holds the random forests, which are saved separately.
with open("Standard_Scaler_Params.pkl", "wb") as f:
    dump(results[1]['scaler'][0], f, protocol=5)

In [None]:
# Each entry is [fitted PowerTransformer, column name]; write one pickle per
# transformed column, named after that column.
for power_transformer, column_name in results[1]['Power_Transformer_states'].values():
  with open(f"Power_transform_params_{column_name}.pkl", "wb") as f:
    dump(power_transformer, f, protocol=5)