In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, roc_curve

import matplotlib.pyplot as plt

# Input tables read by the loading cell (paths are relative to the
# notebook's working directory).
FEATURES_PATH = "../data/processed/example_features.csv"
LABELS_PATH = "../data/processed/example_labels.csv"

# Output folders for figures and tables. Created up front so later cells can
# write into them; exist_ok=True leaves already-existing folders untouched.
for _results_dir in ("../results/figures", "../results/tables"):
    os.makedirs(_results_dir, exist_ok=True)


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Load the feature matrix and the label table from the configured CSV paths.
features_raw = pd.read_csv(FEATURES_PATH)
y_df = pd.read_csv(LABELS_PATH)


def _id_columns(df):
    """Return the columns of ``df`` named ``sample_id`` (case-insensitive)."""
    return [c for c in df.columns if c.lower() == "sample_id"]


feature_id_cols = _id_columns(features_raw)
label_id_cols = _id_columns(y_df)

# The label is the first column of the label table that is not an identifier.
label_candidates = [c for c in y_df.columns if c not in label_id_cols]
if not label_candidates:
    raise ValueError(
        f"No label column found in {LABELS_PATH!r}; columns: {list(y_df.columns)}"
    )
label_col = label_candidates[0]

# A sample identifier is not a predictor: keep it out of the feature matrix so
# it can neither leak into the model nor break the numeric imputer downstream.
X = features_raw.drop(columns=feature_id_cols)

if feature_id_cols and label_id_cols:
    # Both tables carry sample ids, so pair labels with feature rows by id
    # instead of trusting that the two files happen to share a row order.
    labels_by_id = y_df.set_index(label_id_cols[0])[label_col]
    if not labels_by_id.index.is_unique:
        raise ValueError(
            "Duplicate sample ids in the label table; cannot align labels to features."
        )
    feature_ids = features_raw[feature_id_cols[0]]
    unmatched = ~feature_ids.isin(labels_by_id.index)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} feature rows have no matching label (by sample id)."
        )
    y_raw = labels_by_id.reindex(feature_ids).to_numpy()
else:
    # No shared identifier available: fall back to positional pairing, but
    # fail early if the two tables cannot line up row for row.
    if len(y_df) != len(features_raw):
        raise ValueError(
            f"Row count mismatch: {len(features_raw)} feature rows vs {len(y_df)} label rows."
        )
    y_raw = y_df[label_col].to_numpy()

# Labels are handled as strings; they are integer-encoded in a later cell.
y = pd.Series(y_raw, index=X.index, name=label_col).astype(str)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Labels:", y.unique())


In [None]:
# Integer-encode the string labels; `le` is kept so class names can be
# recovered for reporting.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Random-forest settings, gathered in one place for readability.
rf_params = {
    "n_estimators": 500,
    "random_state": 42,
    "class_weight": "balanced",
    "n_jobs": -1,
}

# Preprocessing + classifier as one estimator, so every preprocessing step is
# re-fit inside each cross-validation fold rather than on the full data.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),        # fill missing values
        ("variance_filter", VarianceThreshold(threshold=0.0)),  # drop constant columns
        ("scaler", StandardScaler(with_mean=False)),          # scale without centering
        ("rf", RandomForestClassifier(**rf_params)),
    ]
)

# Stratified, shuffled 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Out-of-fold evaluation of the baseline pipeline.
#
# Everything below (taking probability column 1, thresholding it, a single
# ROC-AUC) is only meaningful for a two-class problem, so fail early with a
# clear message instead of an opaque IndexError / scikit-learn error later.
if len(le.classes_) != 2:
    raise ValueError(
        "Baseline evaluation expects exactly 2 classes, "
        f"found {len(le.classes_)}: {list(le.classes_)}"
    )

# Probability cut-off used to turn scores into hard class predictions.
DECISION_THRESHOLD = 0.5

# Each sample is scored by a model fitted on the folds that exclude it.
# Column 1 is the probability of encoded label 1, i.e. le.classes_[1].
oof_proba = cross_val_predict(pipeline, X, y_enc, cv=cv, method="predict_proba")
y_proba = oof_proba[:, 1]
y_pred = (y_proba >= DECISION_THRESHOLD).astype(int)

auc = roc_auc_score(y_enc, y_proba)
acc = accuracy_score(y_enc, y_pred)

print(f"Baseline Random Forest | ROC-AUC: {auc:.3f} | Accuracy: {acc:.3f}")
print(classification_report(y_enc, y_pred, target_names=le.classes_))

# One-row summary table, persisted for later comparison with other models.
# n_features is the number of columns in X as passed to the pipeline (before
# the pipeline's own variance filter).
metrics_df = pd.DataFrame([{
    "model": "RandomForest_baseline",
    "roc_auc": auc,
    "accuracy": acc,
    "n_samples": len(y_enc),
    "n_features": X.shape[1]
}])

metrics_df.to_csv("../results/tables/baseline_metrics.csv", index=False)
metrics_df


In [None]:
# ROC curve computed from the out-of-fold probabilities of the baseline model.
fpr, tpr, _ = roc_curve(y_enc, y_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"RF baseline (AUC={auc:.2f})")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Baseline ROC Curve (Synthetic Example Data)")
ax.legend(loc="lower right")
fig.tight_layout()

# Save before showing, so the file is written from the fully laid-out figure.
fig.savefig("../results/figures/roc_baseline.png", dpi=200)
plt.show()
