In [4]:
# Cell 1: imports
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib


In [15]:
# Cell 2: load data and drop 'University'
print("Files in cwd:", os.listdir("."))
df = pd.read_excel("DataTraining.xlsx")
df = df.drop(columns=["University"])
print("Dataset shape (after drop):", df.shape)
df.head()


Files in cwd: ['.gitignore', '.ipynb_checkpoints', 'data.xlsx', 'DataTraining.xlsx', 'degree_label_encoder.joblib', 'degree_model_pipeline.joblib', 'feature_columns.joblib', 'Include', 'Lib', 'mlp_degree_program_model.joblib', 'mlp_model.joblib', 'model_hashing_pipeline.joblib', 'pyvenv.cfg', 'Scripts', 'src', 'Untitled Folder', 'Untitled10.ipynb', 'Untitled6.ipynb', 'Untitled7.ipynb', 'Untitled8.ipynb', 'Untitled9.ipynb']
Dataset shape (after drop): (1198, 12)


Unnamed: 0,Gender,Academic Percentage,Study Stream,Degree Program,Analytical,Logical,Explaining,Creative,Detail-Oriented,Helping,Activity Preference,Project Preference
0,0,82.55,Pre-Medical,BS Electrical Engineering,4,5,3,3,4,2,2,3
1,0,79.0,Pre-Medical,BS Software Engineering,4,5,3,4,3,2,1,1
2,0,79.0,Pre-Medical,BS Civil Engineering,4,4,3,3,4,2,2,3
3,1,83.86,Computer Science,BS Software Engineering,4,5,3,4,3,2,1,1
4,1,61.23,Pre-Medical,BS Aerospace Engineering,5,5,3,4,4,1,2,3


In [17]:
# Cell 3: Inputs and output
FEATURES_NUM = [
    "Gender",
    "Academic Percentage",
    "Analytical",
    "Logical",
    "Explaining",
    "Creative", 
    "Detail-Oriented",
    "Helping",
    "Activity Preference",
    "Project Preference"
]
FEATURE_CAT = ["Study Stream"]
TARGET = "Degree Program"

X_num = df[FEATURES_NUM]
X_cat = df[FEATURE_CAT].astype(str)
y     = df[TARGET]
print("Unique Study Streams:", sorted(X_cat["Study Stream"].unique()))


Unique Study Streams: ['Computer Science', 'Pre-Engineering', 'Pre-Medical']


In [18]:
# Cell 4: one-hot encode Study Stream + scale numerics
X_cat_enc = pd.get_dummies(X_cat, columns=["Study Stream"], drop_first=True)

scaler = StandardScaler()
X_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_num),
    columns=FEATURES_NUM
)

# Final feature matrix
X_all = pd.concat([X_num_scaled, X_cat_enc], axis=1)
print("Final feature matrix shape:", X_all.shape)


Final feature matrix shape: (1198, 12)


In [8]:
# Cell 5: split
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train/Test sizes:", X_train.shape, X_test.shape)


Train/Test sizes: (1228, 12) (308, 12)


In [19]:
# Cell 6: encode y and train
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=42
)
mlp.fit(X_train, y_train_enc)




In [20]:
# Cell 7: evaluate
y_pred = mlp.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test_enc, y_pred):.4f}\n")
print(classification_report(
    y_test_enc, y_pred,
    target_names=le.classes_
))


Test Accuracy: 0.8312

                            precision    recall  f1-score   support

                       BDS       0.67      0.50      0.57         8
  BS Aerospace Engineering       0.76      0.85      0.80        26
BS Artificial Intelligence       0.96      0.87      0.91        30
 BS Biomedical Engineering       0.87      0.80      0.83        25
   BS Chemical Engineering       0.85      0.74      0.79        23
      BS Civil Engineering       0.77      0.77      0.77        22
       BS Computer Science       0.81      0.92      0.86        24
         BS Cyber Security       0.95      0.83      0.89        24
           BS Data Science       0.80      0.89      0.84        27
 BS Electrical Engineering       0.84      0.81      0.82        26
 BS Mechanical Engineering       0.89      0.89      0.89        27
                BS Nursing       0.67      0.67      0.67         3
   BS Software Engineering       0.74      0.96      0.84        27
                       D

In [21]:
# Cell 8: persist model, encoders, scaler, feature names
joblib.dump({
    "mlp": mlp,
    "scaler": scaler,
    "label_encoder": le,
    "cat_columns": X_cat_enc.columns.tolist(),
    "numeric_columns": FEATURES_NUM
}, "degree_model_pipeline.joblib")
print("Saved pipeline to degree_model_pipeline.joblib")


Saved pipeline to degree_model_pipeline.joblib


In [22]:
# Cell 9: load pipeline
art = joblib.load("degree_model_pipeline.joblib")
mlp            = art["mlp"]
scaler         = art["scaler"]
le             = art["label_encoder"]
cat_columns    = art["cat_columns"]
numeric_cols   = art["numeric_columns"]


In [23]:
# Cell 10: single-record predict helper
def recommend_degree(record: dict) -> str:
    """
    record must have keys = numeric_cols + ['Study Stream']
    """
    # 1) numeric→scaled array
    nums = np.array([record[c] for c in numeric_cols]).reshape(1,-1)
    nums = scaler.transform(nums)
    # 2) cat→one-hot
    dfc = pd.DataFrame([{"Study Stream": record["Study Stream"]}])
    cat_enc = pd.get_dummies(dfc, columns=["Study Stream"], drop_first=True)
    # add missing cat columns
    for c in cat_columns:
        if c not in cat_enc:
            cat_enc[c] = 0
    cat_enc = cat_enc[cat_columns].values
    # 3) combine
    X_new = np.hstack([nums, cat_enc])
    # 4) predict & decode
    idx = mlp.predict(X_new)[0]
    return le.inverse_transform([idx])[0]


In [3]:
# Cell 11: try with two different inputs
student1 = {
    "Gender": 0,
    "Academic Percentage": 95.2,
    "Analytical": 1,
    "Logical": 1, 
    "Explaining": 1,
    "Creative": 1,
    "Detail-Oriented": 1,
    "Helping": 1,
    "Activity Preference": 1,
    "Project Preference": 1,
    "Study Stream": "Pre-Engineering"
}
student2 = student1.copy()
student2["Academic Percentage"] = 90.1
student2["Analytical"] = 5
student2["Study Stream"] = "Pre-Medical"

print("Rec for student1:", recommend_degree(student1))
print("Rec for student2:", recommend_degree(student2))


NameError: name 'recommend_degree' is not defined