In [2]:
import pandas as pd

# Read the merged panel/project workbook into a DataFrame
df = pd.read_excel(io="../data/processed/updated-panel-project.xlsx")

print(df)

      area_encoded  type_encoded  lecturer_encoded
0                0             1                 1
1                0             1                 2
2                1             0                 3
3                1             0                 4
4                1             0                 5
...            ...           ...               ...
3170             6             0                81
3171             4             1                12
3172             4             1                71
3173             3             1               106
3174             0             1               108

[3175 rows x 3 columns]


In [None]:
#train using decision tree classifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Target column and the two feature columns used by every model below
y = df["lecturer_encoded"]
X = df[["type_encoded", "area_encoded"]]

# Hold out 20% of the rows for testing, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Tree with default class weights (fit() returns the fitted estimator)
modelUnb = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_unb = modelUnb.predict(X_test)

# Same tree, but with class_weight='balanced'
modelBal = DecisionTreeClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)
y_pred_bal = modelBal.predict(X_test)

In [None]:
#define custom validation score function

from collections import defaultdict

def custom_validation_score(X_train, y_train, X_test, y_pred):
    """
    Fraction of predictions whose lecturer has training history with the row's
    (area_encoded, type_encoded) combination.

    This is a plausibility check, not a correctness check: the true test labels
    are never consulted. A prediction counts as valid when the predicted
    lecturer appears in the training data with the same (area, type) pair as
    the test row.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain "area_encoded" and "type_encoded".
    y_train : iterable
        Training lecturer labels, row-aligned with X_train.
    X_test : pandas.DataFrame
        Test features; must contain "area_encoded" and "type_encoded".
    y_pred : iterable
        Predicted lecturer labels, row-aligned with X_test. May be a NumPy
        array, list, or pandas Series (values are read positionally, so the
        Series index is irrelevant).

    Returns
    -------
    float
        valid_matches / len(y_pred), in [0, 1]. Returns 0.0 when y_pred is
        empty instead of dividing by zero.
    """
    lecturer_history = defaultdict(set)

    # Build history mapping from training data. Iterating the columns directly
    # avoids constructing a row Series per sample via .iloc[i].
    for area, kind, lecturer in zip(
        X_train["area_encoded"], X_train["type_encoded"], y_train
    ):
        lecturer_history[lecturer].add((area, kind))

    total = len(y_pred)
    if total == 0:
        # No predictions to score; avoid ZeroDivisionError.
        return 0.0

    # Evaluate predictions. zip() walks y_pred by position, so a pandas Series
    # with a non-default index is handled the same as an array. .get() keeps
    # unseen lecturers from being inserted into the history mapping.
    valid_matches = sum(
        (area, kind) in lecturer_history.get(lecturer, ())
        for area, kind, lecturer in zip(
            X_test["area_encoded"], X_test["type_encoded"], y_pred
        )
    )

    return valid_matches / total


In [None]:
#custom validation score

# Score both tree models against the training history (unbalanced first)
score_unb, score_bal = (
    custom_validation_score(X_train, y_train, X_test, preds)
    for preds in (y_pred_unb, y_pred_bal)
)

print(f"Custom validation score (Unbalanced): {score_unb:.2%}")
print(f"Custom validation score (Balanced): {score_bal:.2%}")


Custom validation score (Unbalanced): 100.00%
Custom validation score (Balanced): 100.00%


In [None]:
#test with normal metrics

from sklearn.metrics import accuracy_score

# Plain accuracy of the default-weight tree on the held-out rows
print("=== Unbalanced Model ===")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_unb))

# Plain accuracy of the class-weighted tree on the held-out rows
print("\n=== Balanced Model ===")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_bal))

=== Unbalanced Model ===
Accuracy: 0.03307086614173228

=== Balanced Model ===
Accuracy: 0.028346456692913385


In [11]:
#train using random forest classifier

from sklearn.ensemble import RandomForestClassifier

# Random forest with default class weights (fit() returns the fitted estimator)
rfUnb = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf_unb = rfUnb.predict(X_test)

# Random forest with class_weight='balanced'
rfBal = RandomForestClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)
y_pred_rf_bal = rfBal.predict(X_test)


In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Held-out accuracy of the default-weight forest
print("=== Random Forest (Unbalanced) ===")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf_unb))

# Held-out accuracy of the class-weighted forest
print("\n=== Random Forest (Balanced) ===")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf_bal))


=== Random Forest (Unbalanced) ===
Accuracy: 0.03307086614173228

=== Random Forest (Balanced) ===
Accuracy: 0.026771653543307086
