In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Step 1: Load a dataset with many features
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

print("Dataset shape:", X.shape)

Dataset shape: (569, 30)


In [10]:
# Step 2: Define parameters
n_iterations = 50      # Number of Boruta iterations
random_state = 42
rf = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=random_state)

In [11]:
# Step 3: Initialize lists to track feature status
feature_names = X.columns
confirmed = set()
rejected = set()

In [18]:
len(confirmed), len(rejected)

(28, 2)

In [16]:
# Step 4: Boruta iterations
for it in range(n_iterations):
    print(f"Iteration {it+1}/{n_iterations}")
    
    # 4a: Create shadow features (shuffle each column)
    X_shadow = X.apply(np.random.permutation)
    X_combined = pd.concat([X, X_shadow], axis=1)
    
    # 4b: Fit random forest
    rf.fit(X_combined, y)
    importances = rf.feature_importances_
    
    # Split original and shadow importances
    orig_importances = importances[:X.shape[1]]
    shadow_importances = importances[X.shape[1]:]
    shadow_max = shadow_importances.max()
    
    # 4c: Update confirmed and rejected features
    for i, f in enumerate(feature_names):
        if f in confirmed or f in rejected:
            continue
        if orig_importances[i] > shadow_max:
            confirmed.add(f)
        elif orig_importances[i] < shadow_max:
            rejected.add(f)
    
    # Stop early if all features are decided
    if len(confirmed) + len(rejected) == len(feature_names):
        break

Iteration 1/50


In [6]:
# Step 5: Tentative features
tentative = set(feature_names) - confirmed - rejected

In [7]:
# Step 6: Results
print("\nConfirmed Important Features:", confirmed)
print("Tentative Features:", tentative)
print("Rejected Features:", rejected)


Confirmed Important Features: {'area error', 'concavity error', 'mean texture', 'worst smoothness', 'worst compactness', 'worst radius', 'worst concave points', 'mean concavity', 'fractal dimension error', 'worst fractal dimension', 'smoothness error', 'mean concave points', 'mean symmetry', 'worst texture', 'mean perimeter', 'worst symmetry', 'perimeter error', 'texture error', 'worst area', 'concave points error', 'worst perimeter', 'worst concavity', 'mean smoothness', 'radius error', 'compactness error', 'mean radius', 'mean area', 'mean compactness'}
Tentative Features: set()
Rejected Features: {'symmetry error', 'mean fractal dimension'}
