# Experimenting with Ensembles
This notebook goes over the possible combinations of models that we could use. 

In [1]:
import torch
from torch import nn
import pytorch_lightning as pl
from torchmetrics import MeanSquaredError, MeanAbsoluteError, R2Score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, make_scorer
from scipy.stats import kendalltau
from sklearn.ensemble import RandomForestRegressor
from torch.utils.data import TensorDataset, DataLoader  # Added missing imports
from pytorch_lightning.callbacks import EarlyStopping  # Added missing import
from pytorch_tabnet.tab_model import TabNetRegressor  # Import TabNet



## Data

In [3]:
# Loading Dataset
#data = pd.read_csv("/content/drive/MyDrive/ECE324_Project/Model/dataset.csv") # change path for your env
#data = pd.read_csv("SmartStudy\\notebooks\\database.csv") # change path for your env
data = pd.read_csv("dataset.csv") # change path for your env
data.head()

# Data Splitting & Normalization
scaler = StandardScaler()
input = data.drop(columns=['GPA'], errors='ignore')
input = scaler.fit_transform(input)
labels = data['GPA']
X_train, X_temp, Y_train, Y_temp = train_test_split(input, labels, test_size=0.3, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)



## Soft Ordering 1DCNN + TabNet

In [4]:
class SoftOrdering1DCNN(pl.LightningModule):

    def __init__(self, input_dim, output_dim=1, sign_size=32, cha_input=16, cha_hidden=32, 
                 K=2, dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2, learning_rate=1e-3):
        super().__init__()

        hidden_size = sign_size * cha_input
        sign_size1 = sign_size
        sign_size2 = sign_size // 2
        output_size = (sign_size2) * cha_hidden  # Corrected output size calculation

        self.hidden_size = hidden_size
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.sign_size1 = sign_size1
        self.sign_size2 = sign_size2
        self.output_size = output_size
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output
        self.learning_rate = learning_rate

        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        dense1 = nn.Linear(input_dim, hidden_size, bias=False)
        self.dense1 = nn.utils.weight_norm(dense1)

        # 1st conv layer
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        conv1 = nn.Conv1d(
            cha_input, 
            cha_input * K, 
            kernel_size=5, 
            stride=1, 
            padding=2,  
            groups=cha_input, 
            bias=False)
        self.conv1 = nn.utils.weight_norm(conv1, dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=sign_size2)

        # 2nd conv layer
        self.batch_norm_c2 = nn.BatchNorm1d(cha_input * K)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        conv2 = nn.Conv1d(
            cha_input * K, 
            cha_hidden, 
            kernel_size=3, 
            stride=1, 
            padding=1, 
            bias=False)
        self.conv2 = nn.utils.weight_norm(conv2, dim=None)

        # 3rd conv layer (Output layer)
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_output)
        self.dense2 = nn.Linear(output_size, output_dim)  # Corrected dense2 input size
        
        self.mse = MeanSquaredError()
        self.mae = MeanAbsoluteError()
        self.r2 = R2Score()

    def forward(self, x):
        if x.shape[1] != self.dense1.in_features:
            raise ValueError(f"Input feature size mismatch. Expected {self.dense1.in_features}, got {x.shape[1]}.")

        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = torch.relu(self.dense1(x))
        
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1) 
        
        x = self.batch_norm_c1(x)
        x = torch.relu(self.conv1(x))
        
        x = self.ave_po_c1(x)
        
        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = torch.relu(self.conv2(x))
        
        x = self.batch_norm_c3(x)
        x = self.dropout_c3(x)
        x = x.view(x.size(0), -1) 
        x = self.dense2(x)
        
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)  # Using MSE loss
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('val_loss', loss)
        self.log('val_mse', self.mse(y_hat, y))
        self.log('val_mae', self.mae(y_hat, y))
        self.log('val_r2', self.r2(y_hat, y))
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    

# Assuming X_train, Y_train, X_test, Y_test are your data

# 1. Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train.values, dtype=torch.float32).reshape(-1, 1) 
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test.values, dtype=torch.float32).reshape(-1, 1) 

# 2. Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) 
test_loader = DataLoader(test_dataset, batch_size=32) 

# 3. Instantiate the model
input_dim = X_train_tensor.shape[1]  
model = SoftOrdering1DCNN(input_dim=input_dim)

# 4. Configure Trainer and callbacks
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)  
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stopping]) 

# 5. Train the model
trainer.fit(model, train_loader, test_loader)  # Use train and validation loaders

# 6. Make predictions and evaluate
# Removed the redundant modification of `dense2` after training
predictions = []
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    for x, _ in test_loader:
        predictions.append(model(x))
predictions = torch.cat(predictions).detach().numpy()

# 7. Calculate and print evaluation metrics
mse = mean_squared_error(Y_test, predictions)
mae = mean_absolute_error(Y_test, predictions)
r2 = r2_score(Y_test, predictions)
kendall_tau_corr, _ = kendalltau(Y_test, predictions)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R2 Score:', r2)
print('Kendall Tau:', kendall_tau_corr)

class EnsembleModel(pl.LightningModule):
    def __init__(self, cnn_model, tabnet_model, cnn_weight=0.5, tabnet_weight=0.5):
        super().__init__()
        self.cnn_model = cnn_model
        self.tabnet_model = tabnet_model
        self.cnn_weight = cnn_weight
        self.tabnet_weight = tabnet_weight

    def forward(self, x):
        cnn_pred = self.cnn_model(x)
        tabnet_pred = self.tabnet_model.predict(x.numpy())  # TabNet expects numpy input
        tabnet_pred = torch.tensor(tabnet_pred, dtype=torch.float32).to(cnn_pred.device)
        return self.cnn_weight * cnn_pred + self.tabnet_weight * tabnet_pred

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


# Reshape labels for TabNet compatibility
Y_train = Y_train.values.reshape(-1, 1)  # Reshape to 2D
Y_val = Y_val.values.reshape(-1, 1)      # Reshape to 2D

# Instantiate TabNet model
tabnet_model = TabNetRegressor()
tabnet_model.fit(
    X_train, Y_train,  # Use reshaped Y_train
    eval_set=[(X_val, Y_val)],  # Use reshaped Y_val
    eval_metric=['rmse'],
    patience=5,
    max_epochs=50
)

# Instantiate CNN model
input_dim = X_train_tensor.shape[1]
cnn_model = SoftOrdering1DCNN(input_dim=input_dim)

# Instantiate Ensemble model
ensemble_model = EnsembleModel(cnn_model, tabnet_model)

# Train Ensemble model
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stopping])
trainer.fit(ensemble_model, train_loader, test_loader)

# Evaluate Ensemble model
ensemble_predictions = []
ensemble_model.eval()
with torch.no_grad():
    for x, _ in test_loader:
        ensemble_predictions.append(ensemble_model(x))
ensemble_predictions = torch.cat(ensemble_predictions).detach().numpy()

# Calculate and print evaluation metrics for ensemble
mse = mean_squared_error(Y_test, ensemble_predictions)
mae = mean_absolute_error(Y_test, ensemble_predictions)
r2 = r2_score(Y_test, ensemble_predictions)
kendall_tau_corr, _ = kendalltau(Y_test, ensemble_predictions)

print('Ensemble Mean Squared Error:', mse)
print('Ensemble Mean Absolute Error:', mae)
print('Ensemble R2 Score:', r2)
print('Ensemble Kendall Tau:', kendall_tau_corr)

  WeightNorm.apply(module, name, dim)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

   | Name          | Type              | Params | Mode 
-------------------------------------------------------------
0  | batch_norm1   | BatchNorm1d       | 20     | train
1  | dropout1      | Dropout           | 0      | train
2  | dense1        | Linear            | 5.6 K  | train
3  | batch_norm_c1 | BatchNorm1d       | 32     | train
4  | conv1         | Conv1d            | 161    | train
5  | ave_po_c1     | AdaptiveAvgPool1d | 0      | train
6  | batch_norm_c2 | BatchNorm1d       | 64     | train
7  | dropout_c2    | Dropout           | 0      | train
8  | conv2         | Conv1d            | 3.1 K  | train
9  | batch_norm_c3 | BatchNorm1d       | 64     | train
10 | dropout_c3    | Dropout           

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 10: 100%|██████████| 53/53 [00:01<00:00, 26.83it/s, v_num=0]
Mean Squared Error: 0.15685813911031657
Mean Absolute Error: 0.3136181722649528
R2 Score: 0.8057703633096137
Kendall Tau: 0.782200712718445




epoch 0  | loss: 2.26988 | val_0_rmse: 1.52637 |  0:00:00s
epoch 1  | loss: 1.43018 | val_0_rmse: 1.25739 |  0:00:00s
epoch 2  | loss: 1.09108 | val_0_rmse: 1.22301 |  0:00:00s
epoch 3  | loss: 0.84355 | val_0_rmse: 1.18238 |  0:00:01s
epoch 4  | loss: 0.79073 | val_0_rmse: 1.0447  |  0:00:01s
epoch 5  | loss: 0.6575  | val_0_rmse: 0.92731 |  0:00:01s
epoch 6  | loss: 0.55936 | val_0_rmse: 0.85949 |  0:00:02s
epoch 7  | loss: 0.46131 | val_0_rmse: 0.82749 |  0:00:02s
epoch 8  | loss: 0.40028 | val_0_rmse: 0.80773 |  0:00:02s
epoch 9  | loss: 0.37541 | val_0_rmse: 0.76381 |  0:00:03s
epoch 10 | loss: 0.35498 | val_0_rmse: 0.72976 |  0:00:03s
epoch 11 | loss: 0.29711 | val_0_rmse: 0.66916 |  0:00:03s
epoch 12 | loss: 0.30476 | val_0_rmse: 0.62699 |  0:00:04s
epoch 13 | loss: 0.2657  | val_0_rmse: 0.57247 |  0:00:04s
epoch 14 | loss: 0.2405  | val_0_rmse: 0.55886 |  0:00:04s
epoch 15 | loss: 0.24222 | val_0_rmse: 0.53386 |  0:00:05s
epoch 16 | loss: 0.22363 | val_0_rmse: 0.52516 |  0:00:0

  WeightNorm.apply(module, name, dim)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type              | Params | Mode 
--------------------------------------------------------
0 | cnn_model | SoftOrdering1DCNN | 9.6 K  | train
--------------------------------------------------------
9.6 K     Trainable params
0         Non-trainable params
9.6 K     Total params
0.038     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


epoch 47 | loss: 0.10168 | val_0_rmse: 0.32924 |  0:00:13s

Early stopping occurred at epoch 47 with best_epoch = 42 and best_val_0_rmse = 0.32585
                                                                           

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 25: 100%|██████████| 53/53 [00:03<00:00, 15.61it/s, v_num=1]
Ensemble Mean Squared Error: 0.09633288412530863
Ensemble Mean Absolute Error: 0.24918601451456238
Ensemble R2 Score: 0.8807157780200571
Ensemble Kendall Tau: 0.8079083736636529


## XGBoost + TabPFN

In [6]:
from tabpfn import TabPFNRegressor
from sklearn.base import BaseEstimator, RegressorMixin

class XGBoostTabPFNEnsemble(BaseEstimator, RegressorMixin):
    def __init__(self, xgb_model, tabpfn_model, xgb_weight=0.5, tabpfn_weight=0.5):
        self.xgb_model = xgb_model
        self.tabpfn_model = tabpfn_model
        self.xgb_weight = xgb_weight
        self.tabpfn_weight = tabpfn_weight

    def fit(self, X, y):
        self.xgb_model.fit(X, y)
        self.tabpfn_model.fit(X, y)
        return self

    def predict(self, X):
        xgb_pred = self.xgb_model.predict(X)
        tabpfn_pred = self.tabpfn_model.predict(X)
        return self.xgb_weight * xgb_pred + self.tabpfn_weight * tabpfn_pred

# Instantiate XGBoost model
best_params = {
    'gamma': 0.0563056841989118,
    'learning_rate': 0.10822466143464428,
    'max_depth': int(4.469228010863449),
    'min_child_weight': 8.445729116830403,
    'n_estimators': int(228.70928755928722)
}
xgb_model = XGBRegressor(objective='reg:squarederror',
                                    random_state=42,
                                    **best_params)

# Instantiate TabPFN model
tabpfn_model = TabPFNRegressor(device='cpu')  # Use 'cuda' if GPU is available

# Instantiate the ensemble model
ensemble_model = XGBoostTabPFNEnsemble(xgb_model, tabpfn_model)

# Evaluate the ensemble model
ensemble_model.fit(X_train, Y_train)
ensemble_predictions = ensemble_model.predict(X_test)

# Calculate and print evaluation metrics
mse = mean_squared_error(Y_test, ensemble_predictions)
mae = mean_absolute_error(Y_test, ensemble_predictions)
r2 = r2_score(Y_test, ensemble_predictions)
kendall_tau_corr, _ = kendalltau(Y_test, ensemble_predictions)

print('Ensemble Mean Squared Error:', mse)
print('Ensemble Mean Absolute Error:', mae)
print('Ensemble R2 Score:', r2)
print('Ensemble Kendall Tau:', kendall_tau_corr)

  y = column_or_1d(y, warn=True)


RuntimeError: Running on CPU with more than 1000 samples is not allowed by default due to slow performance.
To override this behavior, set the environment variable TABPFN_ALLOW_CPU_LARGE_DATASET=1.
Alternatively, consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client