In [1]:
!pip install pytorch-lightning
!pip install torchmetrics
!pip install xgboost
!pip install catboost
!pip install tabpfn

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch-lightning)
  Downloadi

# Experimenting with Ensembles
This notebook goes over the possible combinations of models that we could use.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

import torch
from torch import nn
from torchmetrics import MeanSquaredError, MeanAbsoluteError, R2Score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, make_scorer
from scipy.stats import kendalltau
from sklearn.ensemble import RandomForestRegressor
from torch.utils.data import TensorDataset, DataLoader  # Added missing imports
from pytorch_lightning.callbacks import EarlyStopping  # Added missing import
from tabpfn import TabPFNRegressor
from sklearn.base import BaseEstimator, RegressorMixin

## Data

In [4]:
# mount dirve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Loading Dataset
#data = pd.read_csv("/content/drive/MyDrive/ECE324_Project/Model/dataset.csv") # change path for your env
#data = pd.read_csv("SmartStudy\\notebooks\\database.csv") # change path for your env
#data = pd.read_csv("dataset.csv") # change path for your env
#data.head()

from smartstudy.config import PROCESSED_DATA_DIR
data_path = PROCESSED_DATA_DIR / "processed_data.csv"
data = pd.read_csv(data_path)

# Data Splitting & Normalization
scaler = StandardScaler()
input = data.drop(columns=['GPA'], errors='ignore')
input = scaler.fit_transform(input)
labels = data['GPA']
X_train, X_temp, Y_train, Y_temp = train_test_split(input, labels, test_size=0.3, random_state=42)
X_test, X_val, Y_test, Y_val = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)



## XGBoost + TabPFN

In [3]:
class XGBoostTabPFNEnsemble(BaseEstimator, RegressorMixin):
    def __init__(self, xgb_model, tabpfn_model, xgb_weight=0.5, tabpfn_weight=0.5):
        self.xgb_model = xgb_model
        self.tabpfn_model = tabpfn_model
        self.xgb_weight = xgb_weight
        self.tabpfn_weight = tabpfn_weight

    def fit(self, X, y):
        self.xgb_model.fit(X, y)
        self.tabpfn_model.fit(X, y)
        return self

    def predict(self, X):
        xgb_pred = self.xgb_model.predict(X)
        tabpfn_pred = self.tabpfn_model.predict(X)
        return self.xgb_weight * xgb_pred + self.tabpfn_weight * tabpfn_pred

In [4]:
# Instantiate XGBoost model
best_params = {
    'gamma': 0.0563056841989118,
    'learning_rate': 0.10822466143464428,
    'max_depth': int(4.469228010863449),
    'min_child_weight': 8.445729116830403,
    'n_estimators': int(228.70928755928722)
}
xgb_model = XGBRegressor(objective='reg:squarederror',
                                    random_state=42,
                                    **best_params)

# Instantiate TabPFN model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tabpfn_model = TabPFNRegressor(n_estimators=8, device="auto")  # Use 'cuda' if GPU is available

# Instantiate the ensemble model
ensemble_model = XGBoostTabPFNEnsemble(xgb_model, tabpfn_model)

In [5]:
# Evaluate the ensemble model
ensemble_model.fit(X_train, Y_train)
ensemble_predictions = ensemble_model.predict(X_test)

# Calculate and print evaluation metrics
mse = mean_squared_error(Y_test, ensemble_predictions)
mae = mean_absolute_error(Y_test, ensemble_predictions)
r2 = r2_score(Y_test, ensemble_predictions)
kendall_tau_corr, _ = kendalltau(Y_test, ensemble_predictions)

print('Ensemble Mean Squared Error:', mse)
print('Ensemble Mean Absolute Error:', mae)
print('Ensemble R2 Score:', r2)
print('Ensemble Kendall Tau:', kendall_tau_corr)

Ensemble Mean Squared Error: 0.03937037771856031
Ensemble Mean Absolute Error: 0.15596295987113637
Ensemble R2 Score: 0.9512496182600938
Ensemble Kendall Tau: 0.8662018953953409
