In [1]:
import pandas as pd

# Load the merged data from the processed-data folder into `df`
df = pd.read_excel(io="../data/processed/updated-panel-project.xlsx")

# Echo the frame as plain text
print(df)

      area_encoded  type_encoded  lecturer_encoded
0                0             1                 1
1                0             1                 2
2                1             0                 3
3                1             0                 4
4                1             0                 5
...            ...           ...               ...
3170             6             0                81
3171             4             1                12
3172             4             1                71
3173             3             1               106
3174             0             1               108

[3175 rows x 3 columns]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from collections import defaultdict

def train_and_evaluate_models(df):
    """Train a fixed set of classifiers on ``df`` and print two scores per model.

    The features are the ``type_encoded`` and ``area_encoded`` columns and the
    target is ``lecturer_encoded``. The rows are split 80/20, stratified on the
    target, with ``random_state=42``. For every model the function prints:

    * ``Accuracy`` - ``accuracy_score`` on the held-out split.
    * ``Valid History Match Ratio`` - the share of held-out rows whose
      predicted lecturer occurs in the training split together with the same
      ``(area_encoded, type_encoded)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``type_encoded``, ``area_encoded`` and
        ``lecturer_encoded``.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    NOTE(review): the ``*_encoded`` columns are passed to the models as plain
    numbers. If they are label-encoded nominal categories, the linear,
    distance-based and Gaussian models will read an ordering into them -
    confirm whether one-hot encoding is intended.
    NOTE(review): ``stratify=y`` presumably needs every lecturer to occur more
    than once in ``df`` - confirm against the scikit-learn documentation.
    """
    X = df[["type_encoded", "area_encoded"]]
    y = df["lecturer_encoded"]

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    def area_type_pairs(features):
        """Return the (area_encoded, type_encoded) pair of every row, in row order."""
        # Iterating the two columns side by side avoids building a row Series
        # per record, which is what repeated `.iloc[i][col]` lookups do.
        return list(zip(features["area_encoded"], features["type_encoded"]))

    # Lecturer history mapping for custom metric:
    # lecturer -> set of (area, type) pairs seen for that lecturer in training.
    lecturer_history = defaultdict(set)
    for lecturer, area_type in zip(y_train, area_type_pairs(X_train)):
        lecturer_history[lecturer].add(area_type)

    def custom_metric(y_pred, X_test):
        """Return the fraction of predictions that match the lecturer's training history."""
        if len(y_pred) == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0
        # `.get` keeps the lookup read-only: indexing a defaultdict would insert
        # an empty set for every lecturer that is not in the history yet.
        valid_matches = sum(
            area_type in lecturer_history.get(predicted_lecturer, ())
            for predicted_lecturer, area_type in zip(y_pred, area_type_pairs(X_test))
        )
        return valid_matches / len(y_pred)

    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "k-NN": KNeighborsClassifier(n_neighbors=5),
        "Naive Bayes": GaussianNB(),
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        match_ratio = custom_metric(y_pred, X_test)

        print(f"\n=== {name} ===")
        print(f"Accuracy: {acc:.4f}")
        print(f"Valid History Match Ratio: {match_ratio:.4f}")


In [5]:
# Run the model comparison on the loaded frame; for each model the function
# prints its accuracy and its "Valid History Match Ratio".
train_and_evaluate_models(df)


=== Decision Tree ===
Accuracy: 0.0331
Valid History Match Ratio: 1.0000

=== Random Forest ===
Accuracy: 0.0331
Valid History Match Ratio: 1.0000

=== Logistic Regression ===
Accuracy: 0.0220
Valid History Match Ratio: 0.7827

=== k-NN ===
Accuracy: 0.0409
Valid History Match Ratio: 0.9984

=== Naive Bayes ===
Accuracy: 0.0142
Valid History Match Ratio: 0.5559
