In [4]:
import pandas as pd

# Load the merged data
df = pd.read_excel("../data/processed/updated-panel-project.xlsx")

print(df)

      area_encoded  type_encoded  lecturer_encoded
0                0             1                 1
1                0             1                 2
2                1             0                 3
3                1             0                 4
4                1             0                 5
...            ...           ...               ...
3170             6             0                81
3171             4             1                12
3172             4             1                71
3173             3             1               106
3174             0             1               108

[3175 rows x 3 columns]


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load data
X = df[["type_encoded", "area_encoded"]]
y = df["lecturer_encoded"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Hyperparameter grid
max_depths = [None, 5, 10, 15]
min_samples_splits = [2, 5, 10]
criteria = ['gini', 'entropy']
random_states = [0, 1, 5, 10, 20]

best_acc = 0
best_params = {}

# Grid search
for max_depth in max_depths:
    for min_split in min_samples_splits:
        for crit in criteria:
            for rand_state in random_states:
                model = DecisionTreeClassifier(
                    max_depth=max_depth,
                    min_samples_split=min_split,
                    criterion=crit,
                    random_state=rand_state
                )
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                acc = accuracy_score(y_test, preds)

                print(f"max_depth={max_depth}, min_split={min_split}, criterion={crit}, random_state={rand_state} → Accuracy: {acc:.4f}")

                if acc > best_acc:
                    best_acc = acc
                    best_params = {
                        'max_depth': max_depth,
                        'min_samples_split': min_split,
                        'criterion': crit,
                        'random_state': rand_state
                    }

print("\n✅ Best parameters for Decision Tree:")
print(best_params)
print(f"Best Accuracy: {best_acc:.4f}")



max_depth=None, min_split=2, criterion=gini, random_state=0 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=gini, random_state=1 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=gini, random_state=5 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=gini, random_state=10 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=gini, random_state=20 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=entropy, random_state=0 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=entropy, random_state=1 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=entropy, random_state=5 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=entropy, random_state=10 → Accuracy: 0.0331
max_depth=None, min_split=2, criterion=entropy, random_state=20 → Accuracy: 0.0331
max_depth=None, min_split=5, criterion=gini, random_state=0 → Accuracy: 0.0331
max_depth=None, min_split=5, criterion=gini, random_state=1 → Accuracy: 0.0331
max_depth=None, min_split=5, crit