In [None]:
# If using Google Colab, uncomment the following line to install the required packages
# !pip install numpy pandas matplotlib seaborn scikit-learn xgboost bayesian-optimization tabpfn catboost

Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting xgboost
  Using cached xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting bayesian-optimization
  Using cached bayesian_optimization-2.0.3-py3-none-any.whl.metadata (9.0 kB)
Collecting tabpfn
  Using cached tabpfn-2.0.8-py3-none-any.whl.metadata (25 kB)
Collecting catboost
  Using cached catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-20

Could not find platform independent libraries <prefix>
ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\eblac\\Documents\\Academics\\ECE324\\SmartStudy\\.venv\\Lib\\site-packages\\mpmath\\libmp\\libhyper.py'
Check the permissions.



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
# Record the installed scikit-learn version so results can be reproduced later
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)

1.6.1


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Connect Google Drive (if using Colab)
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Loading Dataset
# Colab alternative:
# data = pd.read_csv("/content/drive/MyDrive/ECE324_Project/Model/dataset.csv")
DATASET_PATH = "dataset.csv"  # change path for your env
data = pd.read_csv(DATASET_PATH)

# Preview the first rows to sanity-check the columns that were loaded
data.head()

Unnamed: 0,Age,Gender,StudyTimeWeekly,Absences,Extracurricular,Sports,Music,Volunteering,GPA,ParentalInfluence,TutoringEffect
0,17,1,19.833723,7,0,0,1,0,2.929196,4,19.833723
1,18,0,15.408756,0,0,0,0,0,3.042915,1,0.0
2,15,0,4.21057,26,0,0,0,0,0.112602,6,0.0
3,17,1,10.028829,14,1,0,0,0,2.054218,9,0.0
4,17,1,4.672495,17,0,0,0,0,1.288061,6,4.672495


In [5]:
# Data Splitting & Normalization
#
# Target: the 'GPA' column. Features: every other column except 'Unnamed: 0'.
# NOTE(review): 'Unnamed: 0' counts 0, 1, 2, ... in data.head(), so it looks like
# a row index saved into the CSV rather than a real feature; it is dropped so the
# models cannot key on row order. errors='ignore' keeps this safe if the column
# is absent. Confirm against how dataset.csv was produced.
features = data.drop(columns=['GPA', 'Unnamed: 0'], errors='ignore')
labels = data['GPA']

# 70% train; the remaining 30% is halved into 15% test and 15% validation.
X_train_raw, X_temp_raw, Y_train, Y_temp = train_test_split(
    features, labels, test_size=0.3, random_state=42
)
X_test_raw, X_val_raw, Y_test, Y_val = train_test_split(
    X_temp_raw, Y_temp, test_size=0.5, random_state=42
)

# Fit the scaler on the training split ONLY, then apply the same transform to the
# held-out splits. Fitting on the full dataset would leak test/validation
# statistics (mean and variance) into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_val = scaler.transform(X_val_raw)

# 1. Forest Regressor Models

## 1.1 Random Forest

In [None]:
# Random Forest Regression: 20 trees, fixed seed
reg_model = RandomForestRegressor(n_estimators=20, random_state=42)
reg_model.fit(X_train, Y_train)
reg_pred = reg_model.predict(X_test)

# Report the test-set metrics in a fixed order
rf_test_metrics = [
    ('Mean Squared Error:', mean_squared_error(Y_test, reg_pred)),
    ('Mean Absolute Error:', mean_absolute_error(Y_test, reg_pred)),
    ('R2 Score:', reg_model.score(X_test, Y_test)),
]
for label, value in rf_test_metrics:
    print(label, value)

## 1.2 CatBoost

In [None]:
# Fit CatBoost (silent training, fixed seed) and predict on the test split
cat_model = CatBoostRegressor(random_state=42, verbose=0)
cat_model.fit(X_train, Y_train)
cat_pred = cat_model.predict(X_test)

# Compute the test-set metrics first, then report them
cat_mse = mean_squared_error(Y_test, cat_pred)
cat_mae = mean_absolute_error(Y_test, cat_pred)
cat_r2 = cat_model.score(X_test, Y_test)

print('Mean Squared Error:', cat_mse)
print('Mean Absolute Error:', cat_mae)
print('R2 Score:', cat_r2)


## 1.3 XGBoost (tuned with Bayesian Optimization)

In [None]:
# Hyper-parameters exactly as recorded from the tuning run (all floats)
tuned_values = {
    'gamma': 0.0563056841989118,
    'learning_rate': 0.10822466143464428,
    'max_depth': 4.469228010863449,
    'min_child_weight': 8.445729116830403,
    'n_estimators': 228.70928755928722,
}

# max_depth and n_estimators are truncated to integers before use
best_params = {
    key: int(value) if key in ('max_depth', 'n_estimators') else value
    for key, value in tuned_values.items()
}

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **best_params,
)
xgb_model.fit(X_train, Y_train)

# Predict on the test split
Y2_pred = xgb_model.predict(X_test)

# Report the test-set metrics
xgb_test_metrics = (
    ('Mean Squared Error:', mean_squared_error(Y_test, Y2_pred)),
    ('Mean Absolute Error:', mean_absolute_error(Y_test, Y2_pred)),
    ('R2 Score:', xgb_model.score(X_test, Y_test)),
)
for label, value in xgb_test_metrics:
    print(label, value)


# 2. Neural Network Models

## 2.1 MLP Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define the MLP model function for KerasRegressor
def create_mlp_model():
    """Build and compile the MLP regressor.

    The network is 64 -> 32 -> 1 dense units (ReLU, ReLU, linear), compiled
    with the Adam optimizer and a mean-squared-error loss. The input width is
    read from the notebook-level ``X_train``, so that variable must exist
    before this function is called.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(64, input_dim=n_features, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))

    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Wrap the MLP builder so scikit-learn utilities can treat it as a regressor.
# NOTE(review): the `tensorflow.keras.wrappers.scikit_learn` module this wrapper
# is imported from was removed in TensorFlow 2.12; on newer TensorFlow the import
# fails and a replacement wrapper is needed. Confirm the installed version.
mlp_model = KerasRegressor(build_fn=create_mlp_model, epochs=50, batch_size=32, verbose=1)

# Metrics to report; scikit-learn exposes losses as negated "neg_*" scorers
scoring_metrics = {
    'MSE': 'neg_mean_squared_error',
    'MAE': 'neg_mean_absolute_error'
}

# A single cross_validate pass scores every metric on the same fitted fold
# models. Calling cross_val_score once per metric would retrain all 5 folds for
# each metric and, because the network is stochastic, report MSE and MAE from
# different fitted models.
cv_results = cross_validate(mlp_model, X_train, Y_train, cv=5, scoring=scoring_metrics)

all_scores = {}
for metric_name, scoring in scoring_metrics.items():
    scores = cv_results[f'test_{metric_name}']
    # Invert scores for loss functions (neg_mean_squared_error, etc.)
    if scoring.startswith('neg_'):
        scores = -scores
    all_scores[metric_name] = scores.mean()

# Print cross-validation results
print('Cross-Validation Results:')
for metric, score in all_scores.items():
    print(f'{metric}: {score}')

# Train the model on the full training set
mlp_model.fit(X_train, Y_train)

# Predict using the model
mlp_pred = mlp_model.predict(X_test)

# Evaluate the model on the test split
mse = mean_squared_error(Y_test, mlp_pred)
mae = mean_absolute_error(Y_test, mlp_pred)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

## 2.2 TabPFN

In [None]:
# TabPFN
from tabpfn import TabPFNRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Fit TabPFN on the training split and predict the test split
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, Y_train)
tabpfn_pred = reg.predict(X_test)

# Compute the test-set metrics, then report them
tabpfn_mse = mean_squared_error(Y_test, tabpfn_pred)
tabpfn_mae = mean_absolute_error(Y_test, tabpfn_pred)
print('Mean Squared Error:', tabpfn_mse)
print('Mean Absolute Error:', tabpfn_mae)

# Evaluation

## Comparison based on validation set

In [None]:
# Compare the already-fitted models on the validation split (X_val / Y_val)
models = {
    'RandomForest': reg_model,
    'CatBoost': cat_model,
    'XGBoost': xgb_model,
    'MLP': mlp_model,
    'TabPFN': reg
}

# Maps model name -> {'MSE': ..., 'MAE': ...}
evaluation_results = {}

for label, fitted in models.items():
    raw_output = fitted.predict(X_val)
    # MLP model requires reshaping predictions to a flat vector
    val_pred = raw_output.reshape(-1) if label == 'MLP' else raw_output

    evaluation_results[label] = {
        'MSE': mean_squared_error(Y_val, val_pred),
        'MAE': mean_absolute_error(Y_val, val_pred),
    }

# Display results, one model per group of lines
for label, val_metrics in evaluation_results.items():
    print(f"{label}:")
    print(f"  Mean Squared Error: {val_metrics['MSE']}")
    print(f"  Mean Absolute Error: {val_metrics['MAE']}")

## Cross-Validation

In [None]:
# Overall Comparison
# create_mlp_model is reused from section 2.1 (where KerasRegressor, Sequential
# and Dense are also imported) instead of being defined a second time here; a
# duplicate definition silently shadows the first and the two can drift apart.
models = [
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, verbose=0)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('MLP', KerasRegressor(build_fn=create_mlp_model, epochs=10, batch_size=32, verbose=0)),  # Wrapped MLP model
    ('TabPFN', TabPFNRegressor(random_state=42))
]

# Metrics to report; scikit-learn exposes losses as negated "neg_*" scorers
scoring_metrics = {
    'MSE': 'neg_mean_squared_error',
    'MAE': 'neg_mean_absolute_error'
}

all_scores = {}
for name, model in models:
    # TabPFN does not support n_jobs=-1, so it runs with the default (serial) setting
    n_jobs = None if name == 'TabPFN' else -1

    # One cross_validate pass evaluates every metric on the same fitted folds,
    # rather than re-running the full 5-fold training once per metric.
    cv_results = cross_validate(
        model, X_train, Y_train, cv=5, scoring=scoring_metrics, n_jobs=n_jobs
    )

    all_scores[name] = {}
    for metric_name, scoring in scoring_metrics.items():
        scores = cv_results[f'test_{metric_name}']
        # Invert scores for loss functions (neg_mean_squared_error, etc.)
        if scoring.startswith('neg_'):
            scores = -scores
        all_scores[name][metric_name] = scores.mean()

# Create DataFrame for plotting
df = pd.DataFrame(all_scores).T  # Transpose to have models as rows

# Plot results on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
df.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps x-axis labels horizontal
ax.set_title('Model Comparison (5-fold Cross-validation)')
ax.set_ylabel('Error')
ax.set_xlabel('Model')
ax.legend(title='Metric')
fig.tight_layout()
plt.show()