In [1]:
import pandas as pd

# Load the merged data
df = pd.read_excel("../data/processed/updated-panel-project.xlsx")

print(df)

      area_encoded  type_encoded  lecturer_encoded
0                0             1                 1
1                0             1                 2
2                1             0                 3
3                1             0                 4
4                1             0                 5
...            ...           ...               ...
3170             6             0                81
3171             4             1                12
3172             4             1                71
3173             3             1               106
3174             0             1               108

[3175 rows x 3 columns]


In [None]:
#test for max leaf nodes

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load data
X = df[["type_encoded", "area_encoded"]]
y = df["lecturer_encoded"]

# Consistent train-test split
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

# Define candidate values for each hyperparameter
n_estimators_list = [10, 20, 30, 40, 44, 50]
random_states = [0, 1, 5, 10, 20]
max_leaf_nodes_list = [10, 15, 20, 25, 30, None]  # None = no limit

# Tracking the best results
best_accuracy = 0
best_params = {}

# Grid search
for n_estimators in n_estimators_list:
    for random_state in random_states:
        for max_leaf_nodes in max_leaf_nodes_list:
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                random_state=random_state,
                max_leaf_nodes=max_leaf_nodes
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)

            print(f"n_estimators={n_estimators}, random_state={random_state}, max_leaf_nodes={max_leaf_nodes} -> Accuracy: {acc:.4f}")

            if acc > best_accuracy:
                best_accuracy = acc
                best_params = {
                    'n_estimators': n_estimators,
                    'random_state': random_state,
                    'max_leaf_nodes': max_leaf_nodes
                }

print("\n✅ Best parameters for Rainforest:")
print(best_params)
print(f"Best Accuracy: {best_accuracy:.4f}")


n_estimators=10, random_state=0, max_leaf_nodes=10 -> Accuracy: 0.0315
n_estimators=10, random_state=0, max_leaf_nodes=15 -> Accuracy: 0.0346
n_estimators=10, random_state=0, max_leaf_nodes=20 -> Accuracy: 0.0331
n_estimators=10, random_state=0, max_leaf_nodes=25 -> Accuracy: 0.0362
n_estimators=10, random_state=0, max_leaf_nodes=30 -> Accuracy: 0.0362
n_estimators=10, random_state=0, max_leaf_nodes=None -> Accuracy: 0.0362
n_estimators=10, random_state=1, max_leaf_nodes=10 -> Accuracy: 0.0299
n_estimators=10, random_state=1, max_leaf_nodes=15 -> Accuracy: 0.0331
n_estimators=10, random_state=1, max_leaf_nodes=20 -> Accuracy: 0.0362
n_estimators=10, random_state=1, max_leaf_nodes=25 -> Accuracy: 0.0331
n_estimators=10, random_state=1, max_leaf_nodes=30 -> Accuracy: 0.0346
n_estimators=10, random_state=1, max_leaf_nodes=None -> Accuracy: 0.0346
n_estimators=10, random_state=5, max_leaf_nodes=10 -> Accuracy: 0.0331
n_estimators=10, random_state=5, max_leaf_nodes=15 -> Accuracy: 0.0378
n_

In [None]:
#test for max depth

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load data
X = df[["type_encoded", "area_encoded"]]
y = df["lecturer_encoded"]

# Consistent train-test split
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

# Define candidate values
n_estimators_list = [10, 20, 30, 40, 44, 50]
random_states = [0, 1, 5, 10, 20]
max_depth_list = [3, 5, 10, 15, 20, None]  # None = unlimited depth

# Track best result
best_accuracy = 0
best_params = {}

# Grid search
for n_estimators in n_estimators_list:
    for random_state in random_states:
        for max_depth in max_depth_list:
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                random_state=random_state,
                max_depth=max_depth
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)

            print(f"n_estimators={n_estimators}, random_state={random_state}, max_depth={max_depth} -> Accuracy: {acc:.4f}")

            if acc > best_accuracy:
                best_accuracy = acc
                best_params = {
                    'n_estimators': n_estimators,
                    'random_state': random_state,
                    'max_depth': max_depth
                }

print("\n✅ Best combination:")
print(best_params)
print(f"Highest Accuracy: {best_accuracy:.4f}")

n_estimators=10, random_state=0, max_depth=3 -> Accuracy: 0.0331
n_estimators=10, random_state=0, max_depth=5 -> Accuracy: 0.0331
n_estimators=10, random_state=0, max_depth=10 -> Accuracy: 0.0362
n_estimators=10, random_state=0, max_depth=15 -> Accuracy: 0.0362
n_estimators=10, random_state=0, max_depth=20 -> Accuracy: 0.0362
n_estimators=10, random_state=0, max_depth=None -> Accuracy: 0.0362
n_estimators=10, random_state=1, max_depth=3 -> Accuracy: 0.0331
n_estimators=10, random_state=1, max_depth=5 -> Accuracy: 0.0315
n_estimators=10, random_state=1, max_depth=10 -> Accuracy: 0.0346
n_estimators=10, random_state=1, max_depth=15 -> Accuracy: 0.0346
n_estimators=10, random_state=1, max_depth=20 -> Accuracy: 0.0346
n_estimators=10, random_state=1, max_depth=None -> Accuracy: 0.0346
n_estimators=10, random_state=5, max_depth=3 -> Accuracy: 0.0362
n_estimators=10, random_state=5, max_depth=5 -> Accuracy: 0.0346
n_estimators=10, random_state=5, max_depth=10 -> Accuracy: 0.0362
n_estimators