# Rotation Forest Implementation
# Based on the paper "Rotation Forest: A New Classifier Ensemble Method"
#
# Dataset: Breast Cancer Wisconsin (Diagnostic) Dataset
# URL: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
# - 569 samples (357 benign, 212 malignant)
# - 30 numeric features (mean, SE, and "worst" values for 10 measurements)
# - Binary classification task

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from decision_tree import *
from classification_metrics import *

# Load and prepare Breast Cancer Wisconsin dataset

In [2]:
# Read the raw data file. It has no header row: column 0 is the sample ID
# (unused later) and column 1 the diagnosis (M=malignant, B=benign).
df = pd.read_csv("data/wdbc.data", sep=",", header=None)

# Feature names per the UCI repository documentation: ten measurements, each
# reported as a mean, a standard error ("se") and a "worst" value, with all
# means first, then all standard errors, then all worst values.
measurements = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
]
feature_names = [
    f"{measurement}_{statistic}"
    for statistic in ("mean", "se", "worst")
    for measurement in measurements
]

header = ["ID", "Diagnosis", *feature_names]
df.columns = header

print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
print(df['Diagnosis'].value_counts())
df.head()

Dataset shape: (569, 32)

Class distribution:
Diagnosis
B    357
M    212
Name: count, dtype: int64


Unnamed: 0,ID,Diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Split the frame into labels and features
y_raw = df.iloc[:, 1]   # Diagnosis column
X = df.iloc[:, 2:]      # everything after the ID and Diagnosis columns

# Encode the labels as integers: M (malignant) -> 1, B (benign) -> 0
y = (y_raw == 'M').astype(int)

print("feature_names:", list(X.columns))
print("target_names: ['Benign (0)', 'Malignant (1)']")

# From here on work with plain NumPy arrays instead of pandas objects
X, y = np.asarray(X), np.asarray(y)
print(f"\nX.shape: {X.shape}")
print(f"y.shape: {y.shape}")
print(f"Class 0 (Benign): {np.sum(y == 0)} samples")
print(f"Class 1 (Malignant): {np.sum(y == 1)} samples")

feature_names: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst']
target_names: ['Benign (0)', 'Malignant (1)']

X.shape: (569, 30)
y.shape: (569,)
Class 0 (Benign): 357 samples
Class 1 (Malignant): 212 samples


In [None]:
# Hold out 20% of the rows for testing; the other 80% form the training set
np.random.seed(777)
n_rows = X.shape[0]
ind_train = np.random.choice(n_rows, size=int(n_rows * 0.8), replace=False)

# Boolean membership mask: True for rows drawn into the training set
bool_ind_train = np.zeros(n_rows, dtype=bool)
bool_ind_train[ind_train] = True

X_train, y_train = X[bool_ind_train], y[bool_ind_train]
X_test, y_test = X[~bool_ind_train], y[~bool_ind_train]
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape)
print("y_test.shape:", y_test.shape)

# Helper functions for PCA (from PCA notebook)

In [None]:
def get_mean_std(X):
    """Return the per-column mean and standard deviation (np.std defaults) of X."""
    column_means = np.mean(X, axis=0)
    column_stds = np.std(X, axis=0)
    return column_means, column_stds

def normalization(X, means=None, sds=None):
    """
    Center and/or scale the columns of X without modifying the input.

    Parameters:
    - X: 2-D array of shape (n_samples, n_features)
    - means: per-column values to subtract (step skipped when None)
    - sds: per-column values to divide by (step skipped when None)

    Returns:
    - A new float array with the same shape as X
    """
    # Work on a float copy: writing centered/scaled (fractional) values back into
    # an integer array would silently truncate them.
    X = np.array(X, dtype=float)

    # Centering happens before scaling; broadcasting applies each per-column
    # value to every row.
    if means is not None:
        X -= np.asarray(means, dtype=float)
    if sds is not None:
        X /= np.asarray(sds, dtype=float)
    return X

def get_principal_components(covariance_matrix):
    """
    Eigen-decompose a covariance matrix and order the components by variance.

    Parameters:
    - covariance_matrix: symmetric (n, n) covariance matrix. A scalar / 0-d
      array (what np.cov returns for a single feature) is treated as 1x1.

    Returns:
    - eigen_values: real eigenvalues sorted in descending order
    - eigen_vectors: matrix whose columns are the corresponding eigenvectors
    """
    # np.cov collapses a single variable to a 0-d array; restore matrix shape
    covariance_matrix = np.atleast_2d(covariance_matrix)

    # A covariance matrix is symmetric, so use eigh: unlike the general-purpose
    # eig it always returns real eigenvalues and orthonormal eigenvectors, even
    # with repeated eigenvalues or small floating-point asymmetries.
    eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)

    # Sort by eigenvalues in descending order
    order = np.argsort(eigen_values)[::-1]
    eigen_values = eigen_values[order]
    eigen_vectors = eigen_vectors[:, order]

    return eigen_values, eigen_vectors

# Majority voting function (from Random Forest notebook)

In [None]:
def majority_voting(yHats):
    """
    Combine per-tree predictions into one label per sample by majority vote.

    yHats holds one row per tree and one column per sample. Ties go to the
    smallest label, because np.unique returns labels sorted and np.argmax
    picks the first maximum.
    """
    def most_common(votes):
        labels, tallies = np.unique(votes, return_counts=True)
        return int(labels[np.argmax(tallies)])

    return [most_common(yHats[:, col]) for col in range(yHats.shape[1])]

# Rotation Forest specific functions

In [None]:
def create_feature_subsets(n_features, M=3):
    """
    Randomly split the feature indices into subsets of size M.

    The indices are shuffled and cut into consecutive groups of M. When
    n_features is not divisible by M, the last group is topped up to size M
    with features drawn (without replacement) from the earlier groups, so
    that last subset overlaps them; all other subsets are mutually disjoint.
    If there are fewer than M features in total there is nothing to top up
    from, and a single subset holding every feature is returned.

    Parameters:
    - n_features: total number of features
    - M: fixed subset size (default: 3)

    Returns:
    - List of feature index arrays

    Raises:
    - ValueError: if M is smaller than 1
    """
    if M < 1:
        raise ValueError(f"M must be a positive integer, got {M}")

    K = int(np.ceil(n_features / M))  # number of subsets
    feature_indices = np.random.permutation(n_features)  # shuffle features

    subsets = []
    for k in range(K):
        start_idx = k * M
        subset = feature_indices[start_idx:start_idx + M].tolist()

        # Only the last group can come up short. Top it up from features that
        # were already assigned; when start_idx == 0 (n_features < M) there are
        # none, so the short subset is kept as is instead of crashing.
        needed = M - len(subset)
        if needed > 0 and start_idx > 0:
            available_features = feature_indices[:start_idx].tolist()
            random_fill = np.random.choice(available_features, size=needed, replace=False)
            subset.extend(random_fill.tolist())

        subsets.append(np.array(subset))

    return subsets

def compute_pca_rotation_for_subset(X_bootstrap, feature_subset):
    """
    Fit PCA on the columns of X_bootstrap selected by feature_subset.

    Parameters:
    - X_bootstrap: sampled training data used to estimate the rotation
    - feature_subset: indices of the features belonging to this subset

    Returns:
    - eigen_vectors: PCA rotation matrix for the subset (no component dropped)
    - means: per-feature means used for centering, needed again at transform time
    """
    selected = X_bootstrap[:, feature_subset]

    # Standard PCA preprocessing: center only, no scaling
    means, _ = get_mean_std(selected)
    centered = normalization(selected, means=means, sds=None)

    # Eigenvalues are not needed because every component is retained
    _, eigen_vectors = get_principal_components(np.cov(centered.T))

    return eigen_vectors, means

def rotation_forest(X, y, L=10, M=3, max_depth=15, bootstrap_pca_fraction=0.75):
    """
    Train a Rotation Forest ensemble.

    For every tree: draw a bootstrap sample of the rows, split the features
    into random subsets, fit one PCA per subset on a separate subsample of X,
    rotate the bootstrap sample with those PCAs and grow a decision tree on
    the rotated data.

    Parameters:
    - X: training features, shape (n_samples, n_features)
    - y: training labels
    - L: number of trees (default: 10)
    - M: feature subset size (default: 3)
    - max_depth: maximum tree depth passed to build_tree (default: 15)
    - bootstrap_pca_fraction: fraction of rows sampled (without replacement)
      for fitting the PCAs (default: 0.75)

    Returns:
    - List with one dict per tree holding the keys 'tree', 'feature_subsets',
      'rotation_matrices' and 'subset_means'
    """
    n_samples, n_features = X.shape
    n_classes = len(set(y))
    # The PCA subsample size does not depend on the tree, so compute it once
    pca_sample_size = int(n_samples * bootstrap_pca_fraction)

    ensemble = []
    for _ in range(L):
        # Rows the tree is trained on: full-size sample with replacement
        tree_rows = np.random.choice(n_samples, size=n_samples, replace=True)

        # Random feature grouping for this tree
        feature_subsets = create_feature_subsets(n_features, M=M)

        # Rows the PCAs are fitted on: subsample of X without replacement
        pca_rows = np.random.choice(n_samples, size=pca_sample_size, replace=False)
        X_pca_bootstrap = X[pca_rows]

        # One (rotation matrix, means) pair per feature subset
        fits = [
            compute_pca_rotation_for_subset(X_pca_bootstrap, subset)
            for subset in feature_subsets
        ]
        rotation_matrices = [rotation for rotation, _ in fits]
        subset_means = [center for _, center in fits]

        # Rotate the tree's training rows and fit the tree on the result
        X_tree_rotated = apply_rotation_transform(
            X[tree_rows], feature_subsets, rotation_matrices, subset_means
        )
        tree = build_tree(
            X_tree_rotated, y[tree_rows], n_classes,
            max_depth=max_depth, max_features=None,
        )

        # Keep everything needed to apply the same rotation at prediction time
        ensemble.append({
            'tree': tree,
            'feature_subsets': feature_subsets,
            'rotation_matrices': rotation_matrices,
            'subset_means': subset_means,
        })

    return ensemble

def apply_rotation_transform(X, feature_subsets, rotation_matrices, subset_means):
    """
    Rotate X with the stored per-subset PCA rotations.

    Parameters:
    - X: data to transform, shape (n_samples, n_features)
    - feature_subsets: list of feature index arrays, one per subset
    - rotation_matrices: list of PCA rotation matrices, one per subset
    - subset_means: list of centering vectors, one per subset

    Returns:
    - X_rotated: the rotated subsets placed side by side, in subset order
    """
    def rotate_block(columns, rotation, center):
        # Center with the means learned at fit time, then project onto the PCs
        centered = normalization(X[:, columns], means=center, sds=None)
        return np.dot(centered, rotation)

    rotated_blocks = [
        rotate_block(columns, rotation, center)
        for columns, rotation, center in zip(feature_subsets, rotation_matrices, subset_means)
    ]

    # Stack the rotated subsets column-wise to form the full feature space
    return np.concatenate(rotated_blocks, axis=1)

def rotation_forest_predict(ensemble, X):
    """
    Predict labels for X with a trained Rotation Forest.

    Parameters:
    - ensemble: list of per-tree dicts as returned by rotation_forest
    - X: data to classify, shape (n_samples, n_features)

    Returns:
    - List with one majority-vote label per row of X
    """
    # One row of votes per tree, one column per sample
    yHats = np.zeros((len(ensemble), X.shape[0]))

    for row, member in enumerate(ensemble):
        # Each tree sees X through its own rotation
        X_rotated = apply_rotation_transform(
            X,
            member['feature_subsets'],
            member['rotation_matrices'],
            member['subset_means'],
        )
        yHats[row, :] = predict(member['tree'], X_rotated)

    return majority_voting(yHats)

# Train Rotation Forest

In [None]:
print("Training Rotation Forest ensemble...")
# Seed NumPy's global RNG so the random draws made while training (row
# sampling, feature shuffling) are reproducible from run to run
np.random.seed(777)
# 10 trees, feature subsets of size 3, tree depth limit 15
rotation_forest_ensemble = rotation_forest(X_train, y_train, L=10, M=3, max_depth=15)
print(f"Trained Rotation Forest with {len(rotation_forest_ensemble)} trees")

# Evaluate Rotation Forest

In [None]:
# Majority-vote predictions of the Rotation Forest on the held-out test rows
yHat_rotation = rotation_forest_predict(rotation_forest_ensemble, X_test)
# confusion_matrix / accuracy are not defined in this notebook (presumably they
# come from the classification_metrics star import — confirm). Only the second
# element returned by confusion_matrix is used; the first is discarded.
_, confusion_mat_rotation = confusion_matrix(y_test, yHat_rotation)
accuracy_rotation = accuracy(confusion_mat_rotation)
print("Rotation Forest accuracy on test data:", accuracy_rotation)
print("Confusion matrix:\n", confusion_mat_rotation)

# Train baseline Random Forest for comparison

In [None]:
def random_forest(X, y, K, max_depth=100):
    """
    Standard Random Forest implementation (baseline).

    Parameters:
    - X: training features, shape (n_samples, n_features)
    - y: training labels
    - K: number of trees
    - max_depth: maximum tree depth passed to build_tree (default: 100)

    Returns:
    - List of K trees, each built on its own bootstrap sample of the rows
    """
    n_samples = X.shape[0]
    # Count the classes on the full label vector, as rotation_forest does. A
    # bootstrap sample can miss a rare class entirely, so counting per sample
    # could hand build_tree a class count that is too small.
    n_classes = len(set(y))

    decision_trees = []
    for _ in range(K):
        # Bootstrap: same size as the training set, drawn with replacement
        ind = np.random.choice(n_samples, size=n_samples, replace=True)
        decision_trees.append(
            build_tree(X[ind], y[ind], n_classes, max_features="sqrt", max_depth=max_depth)
        )
    return decision_trees

def random_forest_predict(decision_trees, X):
    """Predict labels for X by majority vote over the given trees."""
    # One row of votes per tree, one column per sample
    yHats = np.zeros((len(decision_trees), X.shape[0]))
    for row, tree in enumerate(decision_trees):
        yHats[row, :] = predict(tree, X)
    return majority_voting(yHats)

In [None]:
print("Training Random Forest baseline...")
# Same seed as the Rotation Forest run, so the baseline is reproducible too
np.random.seed(777)
# Same ensemble size (10 trees) and depth limit (15) as the Rotation Forest above
random_forest_ensemble = random_forest(X_train, y_train, K=10, max_depth=15)
print(f"Trained Random Forest with {len(random_forest_ensemble)} trees")

# Evaluate Random Forest baseline

In [None]:
# Majority-vote predictions of the Random Forest baseline on the test rows
yHat_rf = random_forest_predict(random_forest_ensemble, X_test)
# Only the second element returned by confusion_matrix is used; the first is
# discarded (helper is not defined in this notebook — presumably imported)
_, confusion_mat_rf = confusion_matrix(y_test, yHat_rf)
accuracy_rf = accuracy(confusion_mat_rf)
print("Random Forest accuracy on test data:", accuracy_rf)
print("Confusion matrix:\n", confusion_mat_rf)

# Comparison: Rotation Forest vs Random Forest

In [None]:
rule = "=" * 50
# Positive when the Rotation Forest scored higher than the baseline
accuracy_gap = accuracy_rotation - accuracy_rf

print(rule)
print("PERFORMANCE COMPARISON")
print(rule)
print(f"Rotation Forest accuracy: {accuracy_rotation:.4f}")
print(f"Random Forest accuracy:   {accuracy_rf:.4f}")
print(f"Improvement:              {accuracy_gap:.4f}")
print(rule)
# NOTE: the closing remark below is printed regardless of the sign of the gap
print("\nRotation Forest applies PCA-based feature rotation to each tree,")
print("which increases diversity among trees and improves ensemble performance.")

# Evaluate individual tree accuracies

In [None]:
def get_individual_tree_accuracies_rotation(ensemble, X_test, y_test):
    """Return the test accuracy of every tree of a Rotation Forest, in ensemble order."""
    def score(member):
        # A tree must be evaluated on data rotated with its own PCA rotations
        X_rotated = apply_rotation_transform(
            X_test,
            member['feature_subsets'],
            member['rotation_matrices'],
            member['subset_means'],
        )
        _, confusion_mat = confusion_matrix(y_test, predict(member['tree'], X_rotated))
        return accuracy(confusion_mat)

    return [score(member) for member in ensemble]

def get_individual_tree_accuracies_rf(trees, X_test, y_test):
    """Return the test accuracy of every tree of a Random Forest, in list order."""
    def score(tree):
        _, confusion_mat = confusion_matrix(y_test, predict(tree, X_test))
        return accuracy(confusion_mat)

    return [score(tree) for tree in trees]

In [None]:
# Per-tree test accuracies for both ensembles
acc_rotation_individual = get_individual_tree_accuracies_rotation(rotation_forest_ensemble, X_test, y_test)
acc_rf_individual = get_individual_tree_accuracies_rf(random_forest_ensemble, X_test, y_test)

# Same four summary statistics for each ensemble
summaries = (
    ("Rotation Forest - Individual tree accuracies:", acc_rotation_individual),
    ("\nRandom Forest - Individual tree accuracies:", acc_rf_individual),
)
for title, tree_scores in summaries:
    print(title)
    print(f"  Mean: {np.mean(tree_scores):.4f}")
    print(f"  Std:  {np.std(tree_scores):.4f}")
    print(f"  Min:  {np.min(tree_scores):.4f}")
    print(f"  Max:  {np.max(tree_scores):.4f}")

# Ensemble (majority-vote) accuracies computed earlier, for reference
print("\nEnsemble accuracies:")
print(f"  Rotation Forest: {accuracy_rotation:.4f}")
print(f"  Random Forest:   {accuracy_rf:.4f}")

# Visualization: Individual vs Ensemble Performance

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot individual tree accuracies. The x positions are derived from the number
# of trees actually evaluated, so the plot still works when the ensembles are
# trained with a tree count other than 10.
tree_index_rotation = range(1, len(acc_rotation_individual) + 1)
tree_index_rf = range(1, len(acc_rf_individual) + 1)
ax1.plot(tree_index_rotation, acc_rotation_individual, 'o-', label='Rotation Forest', color='blue', linewidth=2)
ax1.plot(tree_index_rf, acc_rf_individual, 's-', label='Random Forest', color='green', linewidth=2)
# Dashed horizontal lines mark the accuracy of each full ensemble
ax1.axhline(y=accuracy_rotation, color='blue', linestyle='--', label='RotForest Ensemble', alpha=0.7)
ax1.axhline(y=accuracy_rf, color='green', linestyle='--', label='RandForest Ensemble', alpha=0.7)
ax1.set_xlabel('Tree Index')
ax1.set_ylabel('Accuracy')
ax1.set_title('Individual Tree Accuracies')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot comparison bar chart
methods = ['Rotation Forest', 'Random Forest']
accuracies = [accuracy_rotation, accuracy_rf]
colors = ['blue', 'green']
ax2.bar(methods, accuracies, color=colors, alpha=0.7)
ax2.set_ylabel('Accuracy')
ax2.set_title('Ensemble Performance Comparison')
# Zoom in on the 0.85-1.0 range, but lower the floor when an accuracy falls at
# or below it; otherwise that bar would be cut off entirely.
lowest = min(accuracies)
y_floor = 0.85 if lowest > 0.85 else max(0.0, lowest - 0.05)
ax2.set_ylim([y_floor, 1.0])
# Write the numeric accuracy just above each bar
for i, v in enumerate(accuracies):
    ax2.text(i, v + 0.005, f'{v:.4f}', ha='center', fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Hyperparameter Tuning
# Testing different values of L (number of trees) and max_depth

In [None]:
# Carve a validation set out of the training split, giving 60% train /
# 20% validation / 20% test overall
np.random.seed(777)
n_total = X_train.shape[0]
n_val = int(n_total * 0.25)  # 20% of original data = 25% of training data

# Boolean mask over the training rows: True for rows drawn into validation
ind_val = np.random.choice(n_total, size=n_val, replace=False)
bool_ind_val = np.zeros(n_total, dtype=bool)
bool_ind_val[ind_val] = True

X_val, y_val = X_train[bool_ind_val], y_train[bool_ind_val]
X_train_tuning, y_train_tuning = X_train[~bool_ind_val], y_train[~bool_ind_val]

print("Tuning set sizes:")
print(f"  X_train_tuning: {X_train_tuning.shape}")
print(f"  X_val: {X_val.shape}")
print(f"  X_test: {X_test.shape}")

In [None]:
# Hyperparameter grid search over the number of trees and the tree depth limit
L_values = [10, 20, 50]
max_depth_values = [10, 15, 20]

# Every (L, max_depth) combination, with L varying slowest
grid = [(n_trees, depth) for n_trees in L_values for depth in max_depth_values]

results = []
for L, max_depth in grid:
    print(f"Training with L={L}, max_depth={max_depth}...")

    # Re-seed before every fit so all configurations start from the same RNG state
    np.random.seed(777)
    ensemble = rotation_forest(X_train_tuning, y_train_tuning, L=L, M=3, max_depth=max_depth)

    # Score the configuration on the validation set only; the test set stays untouched
    yHat_val = rotation_forest_predict(ensemble, X_val)
    _, confusion_mat = confusion_matrix(y_val, yHat_val)
    acc_val = accuracy(confusion_mat)

    results.append({'L': L, 'max_depth': max_depth, 'val_accuracy': acc_val})
    print(f"  Validation accuracy: {acc_val:.4f}")

# Convert to DataFrame for easy viewing
results_df = pd.DataFrame(results)
results_df

In [None]:
# Pick the grid row with the highest validation accuracy
# (idxmax returns the first maximum, so ties go to the earliest grid entry)
best_row_label = results_df['val_accuracy'].idxmax()
best_result = results_df.loc[best_row_label]

print("Best hyperparameters:")
print(f"  L = {int(best_result['L'])}")
print(f"  max_depth = {int(best_result['max_depth'])}")
print(f"  Validation accuracy = {best_result['val_accuracy']:.4f}")

In [None]:
# Refit with the selected hyperparameters on the full training split
# (tuning rows and validation rows together)
best_L, best_max_depth = (int(best_result[key]) for key in ('L', 'max_depth'))

print(f"\nTraining final model with L={best_L}, max_depth={best_max_depth}...")
np.random.seed(777)
final_ensemble = rotation_forest(
    X_train, y_train, L=best_L, M=3, max_depth=best_max_depth
)

# Score the final model on the held-out test set
yHat_final = rotation_forest_predict(final_ensemble, X_test)
_, confusion_mat_final = confusion_matrix(y_test, yHat_final)
accuracy_final = accuracy(confusion_mat_final)

print("\nFinal model performance on test set:")
print(f"  L = {best_L}, max_depth = {best_max_depth}")
print(f"  Test accuracy = {accuracy_final:.4f}")
print(f"  Confusion matrix:\n{confusion_mat_final}")

# Visualization: Hyperparameter tuning results

In [None]:
# One line per max_depth value: validation accuracy as a function of L
fig, ax = plt.subplots(figsize=(10, 6))

for depth in max_depth_values:
    rows = results_df.loc[results_df['max_depth'] == depth]
    ax.plot(
        rows['L'], rows['val_accuracy'], 'o-',
        label=f'max_depth={depth}', linewidth=2, markersize=8,
    )

ax.set_xlabel('Number of Trees (L)')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Hyperparameter Tuning: Rotation Forest Performance')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Summary
#
# **Algorithm:** Rotation Forest ensemble classifier
#
# **Dataset:** Breast Cancer Wisconsin (Diagnostic) Dataset
# - URL: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
# - 569 samples, 30 features, 2 classes (Benign vs Malignant)
# - Larger dataset provides more reliable performance comparison
#
# **Implementation:**
# - Disjoint feature subsets with M=3 features per subset
# - 75% bootstrap sampling for PCA computation
# - All PCA components retained per subset
# - Standard PCA with mean-centering applied per subset
# - Decision trees trained on rotated feature space
#
# **Results:**
# - Rotation Forest is evaluated against a Random Forest baseline on the same train/test split (see the accuracies printed above)
# - Feature rotation via PCA increases diversity among trees
# - Ensemble aggregation via majority voting improves individual tree predictions
# - Larger dataset (569 vs 178 samples) provides more reliable evaluation
