# Roandom Forest

## 0. Setup

### 0.1. Install packages

In [15]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install panelsplit
#!pip install openpyxl

### 0.2. Load packages

In [16]:
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import random

import zipfile
import os
import openpyxl
import pandas as pd

from panelsplit.cross_validation import PanelSplit

import shap
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

### 0.3. Load Data

In [20]:
### Check working directory
#print(os.getcwd()) 

### Define file and path
file_path = r"c:\Users\mmier\OneDrive - Hertie School\3. Estudio\2025 MDS\2025-1 MDS Thesis\MDS_thesis\Data\OSC" #Use a raw string (r"") when defining paths
file = "Datos-ICM-2023.xlsx"
full_path = os.path.join(file_path, file)

### List files in directory
#print(os.listdir(file_path))

### Load excel file
df = pd.read_excel(full_path, engine="openpyxl")

# Display the first few rows
#print(df.head())

### 0.4. Correct df format

In [21]:
### Rename columns names with row 4
df.columns = df.iloc[3]

### Delete first (index 0) and third (index 2) row
df = df.drop([0, 1, 2, 3], axis=0)

### Reset index 
df = df.reset_index(drop=True)


## 1. Preparations

### 1.1. Define train, test and evaluation set

Evaluation set: 2019 to 2022

In [None]:
### Separate evaluation set 
final_df = df[df["AÑO"] < 2019]

### 1.2. Define y and X1

y: deforestation

M-03-25	Hectáreas de bosque deforestadas

X1: general vaqriables

ICM-00-0	Índice de Ciudades Modernas
PCC-00-0	Índice de Productividad, Competitividad y Complementariedad Económica
GPI-00-0	Índice de Gobernanza, Participación e Instituciones
EIS-00-0	Índice de Equidad e inclusión social
CTI-00-0	Índice de Ciencia, Tecnología e Innovación
SEG-00-0	Índice de Seguridad
SOS-00-0	Índice de Sostenibilidad


In [23]:
### Define y
y = final_df["M-03-25"]

### Define X1: general variables 
X1 = final_df[["ICM-00-0", "PCC-00-0", "GPI-00-0", "EIS-00-0", "CTI-00-0", "SEG-00-0", "SOS-00-0"]]

## 2. Random Forest Model

In [24]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit  # Alternative to PanelSplit

# Set seed for reproducibility
seed_value = 17
np.random.seed(seed_value)

# Define Time-Series Cross-Validation (5 splits)
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters for Random Forest
n_estimators_values = [50, 100, 200, 300]  # Number of trees to test
max_depth_values = [5, 10, 20, None]  # Depth of trees

# Function to compute Adjusted R²
def adjusted_r2(r2, n, k):
    return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

# Dictionary to store results
results = {}

# Perform Time-Series Cross-Validation
for train_idx, test_idx in tscv.split(X1):
    
    # Create polynomial interaction terms (degree=2, only interactions)
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_interactions = poly.fit_transform(X1)

    # Split dataset into train & test per fold
    X_train1, X_test1 = X_interactions[train_idx], X_interactions[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train and evaluate for each combination of hyperparameters
    for n_estimators in n_estimators_values:
        for max_depth in max_depth_values:
            
            # Define the Random Forest model
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=seed_value,
                n_jobs=-1  # Use all available processors
            )

            # Train the model
            model.fit(X_train1, y_train.ravel())

            # Predictions
            y_train_pred = model.predict(X_train1)
            y_test_pred = model.predict(X_test1)

            # Compute performance metrics
            mse = mean_squared_error(y_test, y_test_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_test_pred)
            r2_test = r2_score(y_test, y_test_pred)
            r2_train = r2_score(y_train, y_train_pred)

            # Compute Adjusted R²
            n_train, k = X_train1.shape
            n_test = X_test1.shape[0]
            adj_r2_train = adjusted_r2(r2_train, n_train, k)
            adj_r2_test = adjusted_r2(r2_test, n_test, k)

            # Store results for this combination
            results[(n_estimators, max_depth)] = {
                "MSE": mse, "RMSE": rmse, "MAE": mae, "R2_test": r2_test, "R2_train": r2_train,
                "Adj_R2_test": adj_r2_test, "Adj_R2_train": adj_r2_train
            }

# Find the best hyperparameter combination (minimize MSE, maximize R²)
best_params = min(results, key=lambda x: (results[x]["MSE"], -results[x]["R2_test"]))
best_metrics = results[best_params]

# Print optimal hyperparameters and their performance
print(f"🌲 Optimal Random Forest Parameters: n_estimators={best_params[0]}, max_depth={best_params[1]}")
print(f"📊 Best MSE: {best_metrics['MSE']:.4f}")
print(f"📊 Best RMSE: {best_metrics['RMSE']:.4f}")
print(f"📊 Best MAE: {best_metrics['MAE']:.4f}")
print(f"📊 R² (Train): {best_metrics['R2_train']:.4f}, Adjusted R² (Train): {best_metrics['Adj_R2_train']:.4f}")
print(f"📊 R² (Test): {best_metrics['R2_test']:.4f}, Adjusted R² (Test): {best_metrics['Adj_R2_test']:.4f}")


  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())
  model.fit(X_train1, y_train.ravel())


ValueError: Input contains NaN.