# Experimenting with Ensembles
This notebook goes over the possible combinations of models that we could use. 

In [1]:
import torch
from torch import nn
import pytorch_lightning as pl
from torchmetrics import MeanSquaredError, MeanAbsoluteError, R2Score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, make_scorer
from scipy.stats import kendalltau
from sklearn.ensemble import RandomForestRegressor
from torch.utils.data import TensorDataset, DataLoader  # Added missing imports
from pytorch_lightning.callbacks import EarlyStopping  # Added missing import
from pytorch_tabnet.tab_model import TabNetRegressor  # Import TabNet



## Data

In [3]:
# Loading Dataset
#data = pd.read_csv("/content/drive/MyDrive/ECE324_Project/Model/dataset.csv") # change path for your env
#data = pd.read_csv("SmartStudy\\notebooks\\database.csv") # change path for your env

from smartstudy.config import PROCESSED_DATA_DIR
data = pd.read_csv(PROCESSED_DATA_DIR / "processed_data.csv") # change path for your env
# data = pd.read_csv("dataset.csv") # change path for your env
data.head()

# Data Splitting & Normalization
scaler = StandardScaler()
input = data.drop(columns=['GPA'], errors='ignore')
input = scaler.fit_transform(input)
labels = data['GPA']
X_train, X_temp, Y_train, Y_temp = train_test_split(input, labels, test_size=0.3, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)



## Soft Ordering 1DCNN + TabNet

In [4]:
class SoftOrdering1DCNN(pl.LightningModule):

    def __init__(self, input_dim, output_dim=1, sign_size=32, cha_input=16, cha_hidden=32, 
                 K=2, dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2, learning_rate=1e-3):
        super().__init__()

        hidden_size = sign_size * cha_input
        sign_size1 = sign_size
        sign_size2 = sign_size // 2
        output_size = (sign_size2) * cha_hidden

        self.hidden_size = hidden_size
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.sign_size1 = sign_size1
        self.sign_size2 = sign_size2
        self.output_size = output_size
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output
        self.learning_rate = learning_rate

        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        dense1 = nn.Linear(input_dim, hidden_size, bias=False)
        self.dense1 = nn.utils.weight_norm(dense1)

        # 1st conv layer
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        conv1 = nn.Conv1d(
            cha_input, 
            cha_input * K, 
            kernel_size=5, 
            stride=1, 
            padding=2,  
            groups=cha_input, 
            bias=False)
        self.conv1 = nn.utils.weight_norm(conv1, dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=sign_size2)

        # 2nd conv layer
        self.batch_norm_c2 = nn.BatchNorm1d(cha_input * K)
        self.dropout_c2 = nn.Dropout(dropout_hidden)
        conv2 = nn.Conv1d(
            cha_input * K, 
            cha_hidden, 
            kernel_size=3, 
            stride=1, 
            padding=1, 
            bias=False)
        self.conv2 = nn.utils.weight_norm(conv2, dim=None)

        # 3rd conv layer (Output layer)
        self.batch_norm_c3 = nn.BatchNorm1d(cha_hidden)
        self.dropout_c3 = nn.Dropout(dropout_output)
        self.dense2 = nn.Linear(output_size, output_dim) 
        
        self.mse = MeanSquaredError()
        self.mae = MeanAbsoluteError()
        self.r2 = R2Score()

    def forward(self, x):
        if x.shape[1] != self.dense1.in_features:
            raise ValueError(f"Input feature size mismatch. Expected {self.dense1.in_features}, got {x.shape[1]}.")

        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = torch.relu(self.dense1(x))
        
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1) 
        
        x = self.batch_norm_c1(x)
        x = torch.relu(self.conv1(x))
        
        x = self.ave_po_c1(x)
        
        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = torch.relu(self.conv2(x))
        
        x = self.batch_norm_c3(x)
        x = self.dropout_c3(x)
        x = x.view(x.size(0), -1) 
        x = self.dense2(x)
        
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('val_loss', loss)
        self.log('val_mse', self.mse(y_hat, y))
        self.log('val_mae', self.mae(y_hat, y))
        self.log('val_r2', self.r2(y_hat, y))
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    
    
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train.values, dtype=torch.float32).reshape(-1, 1) 
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test.values, dtype=torch.float32).reshape(-1, 1) 

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) 
test_loader = DataLoader(test_dataset, batch_size=32) 

# Instantiate the model
input_dim = X_train_tensor.shape[1]  
model = SoftOrdering1DCNN(input_dim=input_dim)

# Configure Trainer and callbacks
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)  
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stopping]) 

# Train the model
trainer.fit(model, train_loader, test_loader)  # Use train and validation loaders

# Make predictions and evaluate
predictions = []
model.eval()  
with torch.no_grad():
    for x, _ in test_loader:
        predictions.append(model(x))
predictions = torch.cat(predictions).detach().numpy()

# Calculate and print evaluation metrics
mse = mean_squared_error(Y_test, predictions)
mae = mean_absolute_error(Y_test, predictions)
r2 = r2_score(Y_test, predictions)
kendall_tau_corr, _ = kendalltau(Y_test, predictions)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R2 Score:', r2)
print('Kendall Tau:', kendall_tau_corr)

class EnsembleModel(pl.LightningModule):
    def __init__(self, cnn_model, tabnet_model, cnn_weight=0.5, tabnet_weight=0.5):
        super().__init__()
        self.cnn_model = cnn_model
        self.tabnet_model = tabnet_model
        self.cnn_weight = cnn_weight
        self.tabnet_weight = tabnet_weight

    def forward(self, x):
        cnn_pred = self.cnn_model(x)
        tabnet_pred = self.tabnet_model.predict(x.numpy())  # TabNet expects numpy input
        tabnet_pred = torch.tensor(tabnet_pred, dtype=torch.float32).to(cnn_pred.device)
        return self.cnn_weight * cnn_pred + self.tabnet_weight * tabnet_pred

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


# Reshape labels for TabNet compatibility
Y_train = Y_train.values.reshape(-1, 1)  # Reshape to 2D
Y_val = Y_val.values.reshape(-1, 1)      # Reshape to 2D

# Instantiate TabNet model
tabnet_model = TabNetRegressor()
tabnet_model.fit(
    X_train, Y_train,  # Use reshaped Y_train
    eval_set=[(X_val, Y_val)],  # Use reshaped Y_val
    eval_metric=['rmse'],
    patience=5,
    max_epochs=50
)

# Instantiate CNN model
input_dim = X_train_tensor.shape[1]
cnn_model = SoftOrdering1DCNN(input_dim=input_dim)

# Instantiate Ensemble model
ensemble_model = EnsembleModel(cnn_model, tabnet_model)

# Train Ensemble model
trainer = pl.Trainer(max_epochs=50, callbacks=[early_stopping])
trainer.fit(ensemble_model, train_loader, test_loader)

# Evaluate Ensemble model
ensemble_predictions = []
ensemble_model.eval()
with torch.no_grad():
    for x, _ in test_loader:
        ensemble_predictions.append(ensemble_model(x))
ensemble_predictions = torch.cat(ensemble_predictions).detach().numpy()

# Calculate and print evaluation metrics for ensemble
mse = mean_squared_error(Y_test, ensemble_predictions)
mae = mean_absolute_error(Y_test, ensemble_predictions)
r2 = r2_score(Y_test, ensemble_predictions)
kendall_tau_corr, _ = kendalltau(Y_test, ensemble_predictions)

print('Ensemble Mean Squared Error:', mse)
print('Ensemble Mean Absolute Error:', mae)
print('Ensemble R2 Score:', r2)
print('Ensemble Kendall Tau:', kendall_tau_corr)

  WeightNorm.apply(module, name, dim)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

   | Name          | Type              | Params | Mode 
-------------------------------------------------------------
0  | batch_norm1   | BatchNorm1d       | 22     | train
1  | dropout1      | Dropout           | 0      | train
2  | dense1        | Linear            | 6.1 K  | train
3  | batch_norm_c1 | BatchNorm1d       | 32     | train
4  | conv1         | Conv1d            | 161    | train
5  | ave_po_c1     | AdaptiveAvgPool1d | 0      | train
6  | batch_norm_c2 | BatchNorm1d       | 64     | train
7  | dropout_c2    | Dropout           | 0      | train
8  | conv2         | Conv1d            | 3.1 K  | train
9  | batch_norm_c3 | BatchNorm1d       | 64     | train
10 | dropout_c3    | Dropout           

                                                                           

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 8: 100%|██████████| 53/53 [00:01<00:00, 42.54it/s, v_num=9]
Mean Squared Error: 0.11332881944070748
Mean Absolute Error: 0.26926889027341006
R2 Score: 0.8596705561383823
Kendall Tau: 0.8152222965717931




epoch 0  | loss: 2.03159 | val_0_rmse: 1.27661 |  0:00:00s
epoch 1  | loss: 1.43149 | val_0_rmse: 1.19889 |  0:00:00s
epoch 2  | loss: 1.16549 | val_0_rmse: 1.12089 |  0:00:00s
epoch 3  | loss: 0.98182 | val_0_rmse: 1.01765 |  0:00:00s
epoch 4  | loss: 0.87742 | val_0_rmse: 0.93789 |  0:00:01s
epoch 5  | loss: 0.743   | val_0_rmse: 0.90094 |  0:00:01s
epoch 6  | loss: 0.62199 | val_0_rmse: 0.88696 |  0:00:01s
epoch 7  | loss: 0.56044 | val_0_rmse: 0.86854 |  0:00:01s
epoch 8  | loss: 0.53052 | val_0_rmse: 0.77366 |  0:00:01s
epoch 9  | loss: 0.44133 | val_0_rmse: 0.69411 |  0:00:01s
epoch 10 | loss: 0.40083 | val_0_rmse: 0.63817 |  0:00:02s
epoch 11 | loss: 0.39459 | val_0_rmse: 0.61472 |  0:00:02s
epoch 12 | loss: 0.38203 | val_0_rmse: 0.58867 |  0:00:02s
epoch 13 | loss: 0.32186 | val_0_rmse: 0.57388 |  0:00:02s
epoch 14 | loss: 0.32932 | val_0_rmse: 0.57867 |  0:00:02s
epoch 15 | loss: 0.31144 | val_0_rmse: 0.55841 |  0:00:02s
epoch 16 | loss: 0.28097 | val_0_rmse: 0.54684 |  0:00:0

  WeightNorm.apply(module, name, dim)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type              | Params | Mode 
--------------------------------------------------------
0 | cnn_model | SoftOrdering1DCNN | 10.1 K | train
--------------------------------------------------------
10.1 K    Trainable params
0         Non-trainable params
10.1 K    Total params
0.040     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


epoch 49 | loss: 0.10554 | val_0_rmse: 0.32701 |  0:00:08s
Stop training because you reached max_epochs = 50 with best_epoch = 46 and best_val_0_rmse = 0.31721
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 47.94it/s]

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

c:\Users\eblac\anaconda3\envs\smartstudy_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 10: 100%|██████████| 53/53 [00:01<00:00, 27.58it/s, v_num=10]
Ensemble Mean Squared Error: 0.07082640160815223
Ensemble Mean Absolute Error: 0.20876302934490445
Ensemble R2 Score: 0.9122991874666833
Ensemble Kendall Tau: 0.8397472806212165
