In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score, confusion_matrix, classification_report
)
import joblib

In [2]:
# Robust target loader: works whether 'target' header exists or not
def load_target(path):
    """Read a label column from a CSV file and return it as integers.

    Column selection:
      * if a column named 'target' exists, that column is used;
      * otherwise the first column is used.

    Headerless files are supported: when there is no 'target' column and the
    first column's inferred name parses as a number, the "header" row is
    really the first label, so the file is re-read with ``header=None`` to
    avoid silently dropping that row.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns
    -------
    pandas.Series
        The selected column cast with ``astype(int)``.

    Notes
    -----
    Errors raised by ``pd.read_csv`` or by the integer cast are not caught
    here and propagate to the caller.
    """
    df = pd.read_csv(path)
    if 'target' in df.columns:
        return df['target'].astype(int)

    # No 'target' header. Decide whether the first line was a real header
    # (some other name) or actually a data value that pandas consumed.
    first_label = str(df.columns[0])
    try:
        float(first_label)
    except ValueError:
        # Non-numeric name: a genuine header, keep the parsed frame.
        return df.iloc[:, 0].astype(int)

    # Numeric "header": the file has no header row, so re-read it keeping
    # every line as data.
    df = pd.read_csv(path, header=None)
    return df.iloc[:, 0].astype(int)

# 1) Feature matrices: the original (unscaled) values used for the Random Forest
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('X_train_original.csv', 'X_test_original.csv')
)

# 2) Label vectors, read through the header-tolerant helper
y_train, y_test = (
    load_target(csv_path)
    for csv_path in ('y_train.csv', 'y_test.csv')
)

In [3]:
# Sanity checks: show the dimensions of each split
print(
    "Shapes:",
    f"  X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"  X_test:  {X_test.shape},  y_test:  {y_test.shape}",
    sep="\n",
)

# Stop immediately if any feature frame or label vector contains missing values
if any(frame.isnull().values.any() for frame in (X_train, X_test)):
    raise ValueError("NaNs found in X_train/X_test. Please re-run preprocessing.")
if any(labels.isnull().any() for labels in (y_train, y_test)):
    raise ValueError("NaNs found in y_train/y_test. Please re-run preprocessing.")

Shapes:
  X_train: (455, 30), y_train: (455,)
  X_test:  (114, 30),  y_test:  (114,)


In [4]:
# 3) Train Random Forest (baseline settings collected in one mapping)
rf_params = {
    'n_estimators': 300,          # larger ensemble for more stable predictions
    'criterion': 'gini',          # 'entropy' is the alternative
    'max_depth': None,            # no depth cap; tune this to curb overfitting
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',       # common default for classification
    'class_weight': 'balanced',   # compensates for mildly imbalanced classes
    'random_state': 42,           # fixed seed so the fit is reproducible
    'n_jobs': -1,                 # use all available cores
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

In [8]:
# 4) Test-set predictions: hard labels plus per-class probabilities
y_pred = rf.predict(X_test)
class_probabilities = rf.predict_proba(X_test)
# Keep only column 1 of the probability matrix; it feeds the AUC computation
y_proba = class_probabilities[:, 1]

In [9]:
# 5) Metrics
# The label-based scores all share the (y_true, y_pred) call shape, so they are
# evaluated in one pass; AUC is separate because it takes probabilities.
acc, prec, rec, f1, mcc = (
    scorer(y_test, y_pred)
    for scorer in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
    )
)
auc = roc_auc_score(y_test, y_proba)

print("\n=== Random Forest Performance (Original features) ===")
print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1 Score:  {f1:.4f}",
    f"MCC:       {mcc:.4f}",
    f"AUC:       {auc:.4f}",
    sep="\n",
)

# Raw error counts (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Per-class precision / recall / F1 with four decimals
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))


=== Random Forest Performance (Original features) ===
Accuracy:  0.9649
Precision: 1.0000
Recall:    0.9048
F1 Score:  0.9500
MCC:       0.9258
AUC:       0.9970

Confusion Matrix:
[[72  0]
 [ 4 38]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9474    1.0000    0.9730        72
           1     1.0000    0.9048    0.9500        42

    accuracy                         0.9649       114
   macro avg     0.9737    0.9524    0.9615       114
weighted avg     0.9668    0.9649    0.9645       114



In [10]:
# 6) Feature importances, labelled by column name and sorted in descending order
importances = (
    pd.Series(data=rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

print("\nTop 10 Feature Importances:")
print(importances.iloc[:10])

# Write the full ranking (not just the top 10) to disk
importances.to_csv('rf_feature_importances.csv', header=['importance'])
print("\nSaved: rf_feature_importances.csv")


Top 10 Feature Importances:
perimeter_worst         0.143452
area_worst              0.140756
concave points_worst    0.104602
concave points_mean     0.099638
radius_worst            0.074990
radius_mean             0.056577
perimeter_mean          0.051409
concavity_mean          0.048216
concavity_worst         0.039051
area_mean               0.038337
dtype: float64

Saved: rf_feature_importances.csv


In [11]:
# 7) Persist the fitted forest to disk with joblib
joblib.dump(value=rf, filename='random_forest_model.pkl')
print("Saved: random_forest_model.pkl")

Saved: random_forest_model.pkl
