## 0.Setup

### Install packages

In [12]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install panelsplit

Looking in indexes: https://download.pytorch.org/whl/cu118


### Load packages

In [2]:
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import random

import zipfile
import os

from panelsplit.cross_validation import PanelSplit

import shap
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

### Device

In [14]:
import torch

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")  # Check if GPU or CPU is being used

Using device: cpu


## 0. Load Data

### Kaggle

In [15]:
import os
import pandas as pd

# Define dataset path
ds_path = "/kaggle/input/final-ds"

# Create a list with the files in the dataset (dataframes)
ds_files = os.listdir(ds_path) #list available files in the dataset

# Load each file into a dictionary (assuming all files are csv)
ds = {file: pd.read_csv(f"{ds_path}/{file}") for file in ds_files}

# Create the specific dataframe
final_df = ds[ds_files[0]] #first csv file
final_df.head()

Unnamed: 0,codmpio,year,tc_loss,gandina,gcaribe,gpacifica,gorinoquia,gamazonia,areaoficialhm2,discapital,...,subnational1_VAUPES,subnational1_VICHADA,cluster_kmeans_0,cluster_kmeans_1,cluster_kmeans_2,cluster_kmeans_3,cluster_kmeans_4,cluster_kmeans_5,cluster_kmeans_6,tc_loss_area
0,5873.0,2003.0,2.0,1.0,0.0,0.0,0.0,0.0,180100.0,122.0,...,0,0,0,0,0,0,0,1,0,1.1e-05
1,5873.0,2004.0,18.0,1.0,0.0,0.0,0.0,0.0,180100.0,122.0,...,0,0,0,0,0,0,0,1,0,0.0001
2,5873.0,2005.0,13.375,1.0,0.0,0.0,0.0,0.0,180100.0,122.0,...,0,0,0,0,0,0,0,1,0,7.4e-05
3,5873.0,2006.0,10.875,1.0,0.0,0.0,0.0,0.0,180100.0,122.0,...,0,0,0,0,0,0,0,1,0,6e-05
4,5873.0,2007.0,23.0,1.0,0.0,0.0,0.0,0.0,180100.0,122.0,...,0,0,0,0,0,0,0,1,0,0.000128


## 1. 

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Data after 2007
final_df = final_df[final_df['year'] > 2006]
# y 
y = final_df['tc_loss']

# Normalize output
scaler = StandardScaler()
y = scaler.fit_transform(y.values.reshape(-1, 1))

# Get rid of columns that start with 'subnational1_' and 'cluster_' in train and test   
#X1 = final_df.loc[:,~final_df.columns.str.startswith('g')]
X1 = final_df.loc[:,~final_df.columns.str.startswith('subnational1_')]
X1 = X1.loc[:,~X1.columns.str.startswith('cluster_')]


# Get rid of all disaggregated columns
X1 = X1.loc[:,~X1.columns.str.startswith('ac_')]
X1 = X1.loc[:,~X1.columns.str.startswith('as_')]
X1 = X1.loc[:,~X1.columns.str.startswith('p_')]
X1 = X1.loc[:,~X1.columns.str.startswith('r_')]
X1 = X1.loc[:,~X1.columns.str.startswith('nuf_')]
X1 = X1.loc[:,~X1.columns.str.startswith('vrf_')]

# X final
X1 = X1.drop(columns=['year', 'tc_loss', 'tc_loss_area', 'codmpio'])

# Keep feature names
original_feature_names = list(X1.columns) 

# Create polynomial interaction terms (degree=2, only interactions, no bias term)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X1)

# Get the feature names including interactions
interaction_feature_names = poly.get_feature_names_out(original_feature_names)

# Compute standard deviation of each feature
stds = X1.std(axis=0)

# Find columns where std = 0
zero_variance_features = np.where(stds == 0)[0]

if len(zero_variance_features) > 0:
    print(f"⚠️ Removing {len(zero_variance_features)} features with zero variance.")
    print(f"🔍 Removed feature indices: {zero_variance_features}")

    # If feature names are available, print them
    if isinstance(X1, pd.DataFrame):  # If X is a DataFrame
        removed_feature_names = X1.columns[zero_variance_features]
        print(f"📌 Removed feature names: {list(removed_feature_names)}")

    X1 = np.delete(X1, zero_variance_features, axis=1)  # Remove constant columns


# Normalize features
X1 = scaler.fit_transform(X1)

In [17]:
class LassoRegression(nn.Module):
    def __init__(self, input_dim, l1_lambda=0.01):
        super(LassoRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
        self.l1_lambda = l1_lambda  # Regularization strength

    def forward(self, x):
        return self.linear(x)

    def l1_regularization_loss(self):
        return self.l1_lambda * torch.norm(self.linear.weight, p=1)  # L1 Regularization (Lasso)


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim


# Set seed for reproducibility
seed_value = 17
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)  # If using GPU

# Define TimeSeriesSplit (e.g., 5 splits)
panel_split = PanelSplit(periods = final_df.year, n_splits = 5)

# Hyperparameters
lambda_values = [0.00001, 0.0001, 0.001, 0.01, 0.05, 1]  # Different L1 values
epochs = 1000


# Adjusted R² function
def adjusted_r2(r2, n, k):
    return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

# Dictionary to store results across all CV splits
results = {}

fold_results = []  # Store results for each fold

# Perform TimeSeriesSplit Cross-Validation
for train_idx, test_idx in panel_split.split(X1):

    # Create interaction terms (degree=2 means pairwise interactions)
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_interactions = poly.fit_transform(X1)
    
    # Split dataset into train & test per fold
    X_train1, X_test1 = X_interactions[train_idx], X_interactions[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Convert to PyTorch tensors
    X_train_tensor1 = torch.tensor(X_train1, dtype=torch.float32).to(device)
    X_test_tensor1 = torch.tensor(X_test1, dtype=torch.float32).to(device)
    y_tensor_train = torch.tensor(y_train, dtype=torch.float32).to(device)
    y_tensor_test = torch.tensor(y_test, dtype=torch.float32).to(device)

    # Train and evaluate for each lambda value
    for l1_lambda in lambda_values:
        model = LassoRegression(X_train_tensor1.shape[1], l1_lambda=l1_lambda).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        # Training Loop
        for epoch in range(epochs):
            optimizer.zero_grad()
            y_pred = model(X_train_tensor1)
            loss = criterion(y_pred, y_tensor_train) + model.l1_regularization_loss()
            loss.backward()
            optimizer.step()

        # Evaluate on train & test sets
        model.eval()
        with torch.no_grad():
            y_train_pred = model(X_train_tensor1)  # Training predictions
            y_test_pred = model(X_test_tensor1)   # Test predictions

        # Convert predictions to NumPy for evaluation
        y_train_pred_numpy = y_train_pred.cpu().numpy().flatten()
        y_test_pred_numpy = y_test_pred.cpu().numpy().flatten()
        y_train_numpy = y_tensor_train.cpu().numpy().flatten()
        y_test_numpy = y_tensor_test.cpu().numpy().flatten()

        # Compute metrics
        mse = mean_squared_error(y_test_numpy, y_test_pred_numpy)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_numpy, y_test_pred_numpy)
        r2_test = r2_score(y_test_numpy, y_test_pred_numpy)  # Test R²
        r2_train = r2_score(y_train_numpy, y_train_pred_numpy)  # Train R²

        # Compute Adjusted R²
        n_train, k = X_train1.shape
        n_test = X_test1.shape[0]

        adj_r2_train = adjusted_r2(r2_train, n_train, k)
        adj_r2_test = adjusted_r2(r2_test, n_test, k)

        # Store results for this fold
        fold_results.append({
            "MSE": mse, "RMSE": rmse, "MAE": mae, "R2_test": r2_test, "R2_train": r2_train,
            "Adj_R2_test": adj_r2_test, "Adj_R2_train": adj_r2_train
        })

# Compute average performance across all folds for each (k, lambda)
avg_results = {
    metric: np.mean([fold[metric] for fold in fold_results])
    for metric in fold_results[0].keys()
}
results[(l1_lambda)] = avg_results

# Find the best lambda
best_lambda = min(results, key=lambda x: (results[x]["MSE"], -results[x]["R2_test"]))  # Minimize MSE, maximize R²
best_metrics = results[best_lambda]

# Print optimal hyperparameters and their performance
print(f"🔢 Number of features after Polynomial Features: {X_interactions.shape[1]}")
print(f"✅  Optimal L1 lambda: {best_lambda}")
print(f"📊 Best MSE: {best_metrics['MSE']:.4f}")
print(f"📊 Best RMSE: {best_metrics['RMSE']:.4f}")
print(f"📊 Best MAE: {best_metrics['MAE']:.4f}")
print(f"📊 R² (Train): {best_metrics['R2_train']:.4f}, Adjusted R² (Train): {best_metrics['Adj_R2_train']:.4f}")
print(f"📊 R² (Test): {best_metrics['R2_test']:.4f}, Adjusted R² (Test): {best_metrics['Adj_R2_test']:.4f}")

🔢 Number of features after Polynomial Features: 120
✅  Optimal L1 lambda: 1
📊 Best MSE: 0.5431
📊 Best RMSE: 0.7081
📊 Best MAE: 0.2424
📊 R² (Train): 0.5355, Adjusted R² (Train): 0.5316
📊 R² (Test): 0.4552, Adjusted R² (Test): 0.3853


## 2. e

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit  # Alternative to PanelSplit

# Set seed for reproducibility
seed_value = 17
np.random.seed(seed_value)

# Define Time-Series Cross-Validation (5 splits)
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters for Random Forest
n_estimators_values = [50, 100, 200, 300]  # Number of trees to test
max_depth_values = [5, 10, 20, None]  # Depth of trees

# Function to compute Adjusted R²
def adjusted_r2(r2, n, k):
    return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

# Dictionary to store results
results = {}

# Perform Time-Series Cross-Validation
for train_idx, test_idx in tscv.split(X1):
    
    # Create polynomial interaction terms (degree=2, only interactions)
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_interactions = poly.fit_transform(X1)

    # Split dataset into train & test per fold
    X_train1, X_test1 = X_interactions[train_idx], X_interactions[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train and evaluate for each combination of hyperparameters
    for n_estimators in n_estimators_values:
        for max_depth in max_depth_values:
            
            # Define the Random Forest model
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=seed_value,
                n_jobs=-1  # Use all available processors
            )

            # Train the model
            model.fit(X_train1, y_train.ravel())

            # Predictions
            y_train_pred = model.predict(X_train1)
            y_test_pred = model.predict(X_test1)

            # Compute performance metrics
            mse = mean_squared_error(y_test, y_test_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_test_pred)
            r2_test = r2_score(y_test, y_test_pred)
            r2_train = r2_score(y_train, y_train_pred)

            # Compute Adjusted R²
            n_train, k = X_train1.shape
            n_test = X_test1.shape[0]
            adj_r2_train = adjusted_r2(r2_train, n_train, k)
            adj_r2_test = adjusted_r2(r2_test, n_test, k)

            # Store results for this combination
            results[(n_estimators, max_depth)] = {
                "MSE": mse, "RMSE": rmse, "MAE": mae, "R2_test": r2_test, "R2_train": r2_train,
                "Adj_R2_test": adj_r2_test, "Adj_R2_train": adj_r2_train
            }

# Find the best hyperparameter combination (minimize MSE, maximize R²)
best_params = min(results, key=lambda x: (results[x]["MSE"], -results[x]["R2_test"]))
best_metrics = results[best_params]

# Print optimal hyperparameters and their performance
print(f"🌲 Optimal Random Forest Parameters: n_estimators={best_params[0]}, max_depth={best_params[1]}")
print(f"📊 Best MSE: {best_metrics['MSE']:.4f}")
print(f"📊 Best RMSE: {best_metrics['RMSE']:.4f}")
print(f"📊 Best MAE: {best_metrics['MAE']:.4f}")
print(f"📊 R² (Train): {best_metrics['R2_train']:.4f}, Adjusted R² (Train): {best_metrics['Adj_R2_train']:.4f}")
print(f"📊 R² (Test): {best_metrics['R2_test']:.4f}, Adjusted R² (Test): {best_metrics['Adj_R2_test']:.4f}")
