In [4]:
# Cell 1: Imports & device setup
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# Choose device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


Using device: cpu


In [13]:
# Cell 2: Load dataset & validate columns
DATA_PATH = "student_data.xlsx"   # your Excel file
df = pd.read_excel(DATA_PATH)

expected_columns = [
    "Gender", "Academic Percentage", "Study Stream",
    "Analytical", "Logical", "Explaining", "Creative",
    "Detail-Oriented", "Helping", "Activity Preference",
    "Project Preference", "Degree Program"
]
missing = set(expected_columns) - set(df.columns)
assert not missing, f"Missing columns: {missing}"

df.dropna(subset=expected_columns, inplace=True)
print(f"Data shape after dropna: {df.shape}")


Data shape after dropna: (1536, 13)


In [14]:
# Cell 3: Preprocess features
# 1) Ensure Gender is int
df["Gender"] = df["Gender"].astype(int)

# 2) One-hot encode Study Stream
stream_ohe = pd.get_dummies(df["Study Stream"], prefix="Stream")
df = pd.concat([df.drop("Study Stream", axis=1), stream_ohe], axis=1)

# 3) Assemble feature matrix X and target y
feature_cols = [
    "Gender", "Academic Percentage",
    "Analytical", "Logical", "Explaining", "Creative",
    "Detail-Oriented", "Helping",
    "Activity Preference", "Project Preference"
] + list(stream_ohe.columns)

X = df[feature_cols].astype(np.float32)
y = df["Degree Program"]
print(f"Feature matrix X shape: {X.shape}")


Feature matrix X shape: (1536, 13)


In [15]:
# Cell 4: Encode the target
label_encoder = LabelEncoder().fit(y)
y_enc = label_encoder.transform(y)
print(f"Encoded {len(label_encoder.classes_)} classes: {label_encoder.classes_}")


Encoded 16 classes: ['BDS' 'BS Aerospace Engineering' 'BS Artificial Intelligence'
 'BS Biomedical Engineering' 'BS Chemical Engineering'
 'BS Civil Engineering' 'BS Computer Science' 'BS Cyber Security'
 'BS Data Science' 'BS Electrical Engineering' 'BS Mechanical Engineering'
 'BS Nursing' 'BS Software Engineering' 'DPT' 'MBBS' 'Pharm-D']


In [16]:
# Cell 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y_enc,
    test_size=0.20,
    random_state=42,
    stratify=y_enc
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (1228, 13), Test shape: (308, 13)


In [17]:
# Cell 6: Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [18]:
# Cell 7: Initialize TabNetClassifier
clf = TabNetClassifier(
    n_d=16,
    n_a=16,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="sparsemax",
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    device_name=device
)




In [19]:
# Cell 8: Train the model
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=["train", "test"],
    eval_metric=["accuracy"],
    max_epochs=200,
    patience=20,
    batch_size=128,
    virtual_batch_size=32,
    num_workers=0,
    drop_last=False
)


epoch 0  | loss: 3.51886 | train_accuracy: 0.23127 | test_accuracy: 0.25    |  0:00:00s
epoch 1  | loss: 2.60515 | train_accuracy: 0.26873 | test_accuracy: 0.26948 |  0:00:01s
epoch 2  | loss: 2.32717 | train_accuracy: 0.33225 | test_accuracy: 0.33766 |  0:00:02s
epoch 3  | loss: 2.0892  | train_accuracy: 0.4943  | test_accuracy: 0.52273 |  0:00:02s
epoch 4  | loss: 1.86926 | train_accuracy: 0.55293 | test_accuracy: 0.58117 |  0:00:03s
epoch 5  | loss: 1.73289 | train_accuracy: 0.56515 | test_accuracy: 0.56494 |  0:00:04s
epoch 6  | loss: 1.60998 | train_accuracy: 0.56759 | test_accuracy: 0.6039  |  0:00:04s
epoch 7  | loss: 1.58207 | train_accuracy: 0.60342 | test_accuracy: 0.61039 |  0:00:05s
epoch 8  | loss: 1.49347 | train_accuracy: 0.61075 | test_accuracy: 0.6461  |  0:00:06s
epoch 9  | loss: 1.45866 | train_accuracy: 0.65309 | test_accuracy: 0.67532 |  0:00:07s
epoch 10 | loss: 1.34589 | train_accuracy: 0.69463 | test_accuracy: 0.70455 |  0:00:08s
epoch 11 | loss: 1.27838 | train



In [23]:
# Cell 9: Evaluate on test set
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Test Accuracy: 0.8247

                            precision    recall  f1-score   support

                       BDS       0.67      0.50      0.57         8
  BS Aerospace Engineering       0.92      0.85      0.88        26
BS Artificial Intelligence       1.00      0.87      0.93        30
 BS Biomedical Engineering       0.83      0.80      0.82        25
   BS Chemical Engineering       0.85      0.74      0.79        23
      BS Civil Engineering       0.64      0.82      0.72        22
       BS Computer Science       1.00      0.92      0.96        24
         BS Cyber Security       0.74      0.83      0.78        24
           BS Data Science       0.86      0.93      0.89        27
 BS Electrical Engineering       0.88      0.81      0.84        26
 BS Mechanical Engineering       0.77      0.85      0.81        27
                BS Nursing       0.00      0.00      0.00         3
   BS Software Engineering       0.87      0.96      0.91        27
                       D

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Cell 10: Top-3 recommendations for first 5 test samples
proba = clf.predict_proba(X_test[:5])
top_k = 3
for i, probs in enumerate(proba, start=1):
    idxs = np.argsort(probs)[-top_k:][::-1]
    labels = label_encoder.inverse_transform(idxs)
    print(f"Sample {i} recommendations:")
    for lbl, p in zip(labels, probs[idxs]):
        print(f"  • {lbl}: {p:.4f}")
    print()


Sample 1 recommendations:
  • BS Mechanical Engineering: 0.2270
  • BS Civil Engineering: 0.2199
  • BS Biomedical Engineering: 0.1386

Sample 2 recommendations:
  • BS Computer Science: 0.9752
  • BS Aerospace Engineering: 0.0135
  • BS Cyber Security: 0.0043

Sample 3 recommendations:
  • BS Artificial Intelligence: 0.9610
  • BS Aerospace Engineering: 0.0347
  • BS Civil Engineering: 0.0021

Sample 4 recommendations:
  • BS Artificial Intelligence: 0.9772
  • BS Aerospace Engineering: 0.0129
  • BS Data Science: 0.0070

Sample 5 recommendations:
  • BS Civil Engineering: 0.3667
  • BS Chemical Engineering: 0.3279
  • Pharm-D: 0.0538



In [26]:
# Cell 11: Inline real-time recommendation function
def recommend_degree_inline(input_data, top_k=3):
    """
    input_data keys:
      Gender (0/1), Academic Percentage (float),
      Study Stream (str), Analytical, Logical, Explaining,
      Creative, Detail-Oriented, Helping (ints 1-5),
      Activity Preference, Project Preference (ints 1-3).
    """
    # Build DataFrame
    df_in = pd.DataFrame([input_data])

    # One-hot encode Study Stream
    so = pd.get_dummies(df_in["Study Stream"], prefix="Stream")
    for col in ["Stream_Computer Science","Stream_Pre-Engineering","Stream_Pre-Medical"]:
        if col not in so:
            so[col] = 0
    df_in = pd.concat([df_in.drop("Study Stream", axis=1), so], axis=1)

    # Align & cast
    df_in = df_in[X.columns].astype(np.float32)

    # Scale (use .values to suppress warnings)
    X_new = scaler.transform(df_in.values)

    # Predict
    probs = clf.predict_proba(X_new)[0]
    top_idxs = np.argsort(probs)[-top_k:][::-1]
    recs = label_encoder.inverse_transform(top_idxs)

    # Print
    print(f"Top-{top_k} degree recommendations:")
    for rank, idx in enumerate(top_idxs, start=1):
        print(f"{rank}. {recs[rank-1]} ({probs[idx]:.4f})")


In [31]:
# Cell 12: Example real-time test
test_input = {
    "Gender": 0,
    "Academic Percentage": 55.0,
    "Study Stream": "Computer Science",
    "Analytical": 5,
    "Logical": 5,
    "Explaining": 5,
    "Creative": 1,
    "Detail-Oriented": 5,
    "Helping": 5,
    "Activity Preference": 2,
    "Project Preference": 1
}

recommend_degree_inline(test_input, top_k=3)


Top-3 degree recommendations:
1. MBBS (0.3429)
2. BS Cyber Security (0.1317)
3. BS Electrical Engineering (0.0840)
