# RF Model

We now train a Random Forest model which, in theory, should outperform the k-NN baseline due to its ability to capture complex interactions and non-linear relationships in the data. The Random Forest model is robust to overfitting and can handle high-dimensional feature spaces effectively.

In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    brier_score_loss,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
)
from sklearn.preprocessing import StandardScaler, label_binarize

In [2]:
OUT_DIR = "results/models"
OUT_VIS = "results/figures"
OUT_CSV = "results/csv"
for d in [OUT_DIR, OUT_VIS, OUT_CSV]:
    os.makedirs(d, exist_ok=True)

In [3]:
X_train = pd.read_csv("data/processed/X/train.csv").select_dtypes(include=[np.number])
X_val = pd.read_csv("data/processed/X/val.csv").select_dtypes(include=[np.number])
y_train = pd.read_csv("data/processed/Y/train.csv").squeeze()
y_val = pd.read_csv("data/processed/Y/val.csv").squeeze()

threshold = 50
counts = y_train.value_counts()
rare = counts[counts < threshold].index.tolist()
y_train.replace({c: "Other" for c in rare}, inplace=True)
y_val.replace({c: "Other" for c in rare}, inplace=True)

In [4]:
pipeline = ImbPipeline(
    [
        ("smote", SMOTE(random_state=42)),
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ]
)

In [5]:
param_dist = {
    "rf__n_estimators": [100, 200],
    "rf__max_depth": [None, 20],
    "rf__min_samples_split": [2, 5],
    "rf__min_samples_leaf": [1, 2],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=cv,
    scoring="f1_weighted",
    n_jobs=4,
    verbose=4,
    random_state=42,
)

In [None]:
print("Starting RandomizedSearchCV...")
with joblib.parallel_backend("threading"):
    search.fit(X_train, y_train)
best = search.best_estimator_
print(f"Best params: {search.best_params_}")
print(f"Best CV F1 (weighted): {search.best_score_:.4f}")
joblib.dump(best, os.path.join(OUT_DIR, "rf_v2_best.pkl"))

Starting RandomizedSearchCV...
Fitting 3 folds for each of 16 candidates, totalling 48 fits




[CV 1/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=0.767 total time=23.8min
[CV 2/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=0.773 total time=24.5min
[CV 3/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=0.771 total time=24.6min
[CV 1/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=200;, score=0.772 total time=47.5min
[CV 1/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=5, rf__n_estimators=100;, score=0.766 total time=23.7min
[CV 2/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=5, rf__n_estimators=100;, score=0.775 total time=24.3min
[CV 3/3] END rf__max_depth=None, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=200;, score=0.776 total time=47.9min
[CV 3/3] END rf__max_depth=None, rf__min_samples

In [None]:
X_comb = pd.concat([X_train, X_val], axis=0)
y_comb = pd.concat([y_train, y_val], axis=0)
print("Retraining best model on combined set...")
best.fit(X_comb, y_comb)
joblib.dump(best, os.path.join(OUT_DIR, "rf_v2_retrained.pkl"))

In [None]:
X_test = pd.read_csv("data/processed/X/test.csv").select_dtypes(include=[np.number])
y_test = pd.read_csv("data/processed/Y/test.csv").squeeze()
X_test = X_test[X_train.columns]

y_pred = best.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Test Accuracy: {acc:.4f}")
print(f"Test F1 (weighted): {f1:.4f}")

In [None]:
with open(os.path.join(OUT_DIR, "rf_v2_test_metrics.txt"), "w") as f:
    f.write(f"Accuracy: {acc:.4f}\nF1-weighted: {f1:.4f}\n")
    f.write(classification_report(y_test, y_pred))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=best.named_steps["rf"].classes_)
cm_df = pd.DataFrame(
    cm, index=best.named_steps["rf"].classes_, columns=best.named_steps["rf"].classes_
)
cm_df.to_csv(os.path.join(OUT_CSV, "rf_v2_confusion_matrix.csv"))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
plt.title("RF v2 Confusion Matrix")
plt.tight_layout()
plt.savefig(os.path.join(OUT_VIS, "rf_v2_confusion_matrix.png"))
plt.close()

In [None]:
classes = best.named_steps["rf"].classes_
y_bin = label_binarize(y_test, classes=classes)
y_scores = best.predict_proba(X_test)
fpr, tpr, roc_auc = {}, {}, {}
for i, cls in enumerate(classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
plt.figure(figsize=(8, 6))
for i, cls in enumerate(classes):
    plt.plot(fpr[i], tpr[i], label=f"{cls} (AUC={roc_auc[i]:.2f})")
plt.plot([0, 1], [0, 1], "k--")
plt.legend()
plt.title("ROC Curves")
plt.tight_layout()
plt.savefig(os.path.join(OUT_VIS, "rf_v2_roc.png"))
plt.close()

In [None]:
pr_auc = {}
plt.figure(figsize=(8, 6))
for i, cls in enumerate(classes):
    prec, rec, _ = precision_recall_curve(y_bin[:, i], y_scores[:, i])
    pr_auc[i] = average_precision_score(y_bin[:, i], y_scores[:, i])
    plt.plot(rec, prec, label=f"{cls} (AP={pr_auc[i]:.2f})")
plt.legend()
plt.title("Precision-Recall Curves")
plt.tight_layout()
plt.savefig(os.path.join(OUT_VIS, "rf_v2_pr.png"))
plt.close()

In [None]:
brier = {}
plt.figure(figsize=(8, 6))
for i, cls in enumerate(classes):
    prob = y_scores[:, i]
    frac_pos, mean_pred = calibration_curve(y_bin[:, i], prob, n_bins=10)
    plt.plot(mean_pred, frac_pos, marker="o", label=f"{cls}")
    brier[cls] = brier_score_loss(y_bin[:, i], prob)
plt.plot([0, 1], [0, 1], "k--")
plt.legend()
plt.title("Calibration Curves")
plt.tight_layout()
plt.savefig(os.path.join(OUT_VIS, "rf_v2_calibration.png"))
plt.close()
pd.DataFrame.from_dict(brier, orient="index", columns=["brier_score"]).to_csv(
    os.path.join(OUT_CSV, "rf_v2_brier.csv")
)