In [None]:
#!/usr/bin/env python3
"""
Wine Classification - Overall and Local Feature Importance
Tests all 4 configurations: GPU/CPU × Casewise/Non-casewise
Includes: OOB Error, Confusion Matrix, Overall Importance, Local Importance
"""

import sys
import os
import numpy as np
import rfx as rf
import time

In [20]:
# Display names for the 13 Wine features. Other cells index this list by
# feature position when labelling importance scores.
FEATURE_NAMES = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Display labels for the three wine classes.
CLASS_NAMES = [f'Class {label}' for label in range(3)]

def print_confusion_matrix(cm, n_classes):
    """Print a confusion matrix as an aligned text table.

    Rows are labelled ``True i`` and columns ``Pred j``; a blank line is
    printed after the table.

    Args:
        cm: Square matrix of counts. Anything ``np.asarray`` can turn into a
            2-D array is accepted (NumPy array, nested lists/tuples).
            Float-typed counts are converted with ``int()`` before
            formatting, so any fractional part is truncated.
        n_classes: Number of leading rows/columns of ``cm`` to print.

    Returns:
        None. The table is written to stdout.
    """
    # Normalise once so nested lists and float dtypes behave like int arrays.
    counts = np.asarray(cm)

    # Header
    header = "          " + "  ".join(f"Pred {j}" for j in range(n_classes))
    print(header)
    print("-" * len(header))

    # Rows
    for i in range(n_classes):
        # int() is required because the 'd' format spec rejects float values.
        cells = "".join(f"   {int(counts[i, j]):3d}  " for j in range(n_classes))
        print(f"True {i}  |" + cells)
    print()

 

In [21]:
# Fetch the Wine dataset through rfx's built-in loader
X, y = rf.load_wine()
n_samples, n_features = X.shape
n_classes = np.unique(y).size

# Report the basic dataset dimensions and the per-class sample counts
class_counts = np.bincount(y).tolist()
dataset_summary = [
    "\nDataset: Wine (UCI ML - built-in)",
    f"   Samples: {n_samples}",
    f"   Features: {n_features}",
    f"   Classes: {n_classes}",
    f"   Class distribution: {class_counts}",
]
print("\n".join(dataset_summary))

# Shared settings for the four configuration cells that follow;
# each of them stores its outcome in `results` under its own key.
ntree = 100
results = dict()



Dataset: Wine (UCI ML - built-in)
   Samples: 178
   Features: 13
   Classes: 3
   Class distribution: [59, 71, 48]


In [11]:
#1. GPU casewise

In [None]:
# GPU, case-wise: train one forest, report OOB metrics and feature
# importances, and store everything in `results` under `run_type`.
use_gpu = True
mode = 'gpu' if use_gpu else 'cpu'
use_casewise = True
weighting = 'case-wise' if use_casewise else 'non-case-wise'
run_type = 'gpu_cw'  # key of this run in `results`

# Create model with overall AND local importance
model = rf.RandomForestClassifier(
    ntree=ntree,
    mtry=4,  # sqrt(13) ≈ 3.6
    nsample=X.shape[0],
    nclass=n_classes,
    use_gpu=use_gpu,
    batch_size=25,
    iseed=42,
    compute_proximity=False,
    compute_importance=True,          # Overall importance
    compute_local_importance=True,    # Local importance (per-sample)
    use_casewise=use_casewise
)

# Train
rule = '=' * 70
print("\n" + rule)
print(f"  {mode.upper()} {weighting.upper()}")
print(rule)
print(f"\nTraining {ntree} trees...")
# perf_counter() is monotonic and high-resolution; time.time() follows the
# wall clock and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
model.fit(X, y)
elapsed = time.perf_counter() - start_time

# Get results
oob_error = model.get_oob_error()
oob_preds = model.get_oob_predictions()
overall_imp = model.feature_importances_()
local_imp = model.get_local_importance()

# Guard the throughput figure against a zero-length timing interval.
trees_per_sec = ntree / elapsed if elapsed > 0 else float('inf')
print(f"Training time: {elapsed:.2f}s ({trees_per_sec:.1f} trees/sec)")

# OOB Error
print(f"\nOOB Error: {oob_error:.6f} ({oob_error*100:.2f}%)")
print(f"   OOB Accuracy: {(1-oob_error)*100:.2f}%")

# Overall Feature Importance (descending order)
print("\nOverall Feature Importance (Top 5):")
sorted_idx = np.argsort(overall_imp)[::-1]
for rank, idx in enumerate(sorted_idx[:5], 1):
    print(f"   {rank}. {FEATURE_NAMES[idx]:<35} {overall_imp[idx]:.6f}")

# Local Importance Statistics
# NOTE(review): averaging over axis 0 assumes local_imp is laid out as
# (n_samples, n_features) — confirm against the rfx documentation.
print(f"\nLocal Importance: shape={local_imp.shape}")
local_mean = np.mean(local_imp, axis=0)
sorted_local_idx = np.argsort(local_mean)[::-1]
print(f"   Top 3 by mean: {', '.join([FEATURE_NAMES[i] for i in sorted_local_idx[:3]])}")

# Cast labels and OOB predictions to int32 once; both metric helpers below
# receive the same pair of arrays.
y_true = y.astype(np.int32)
y_oob = oob_preds.astype(np.int32)

# Confusion Matrix
cm = rf.confusion_matrix(y_true, y_oob)
print("\nConfusion Matrix:")
print_confusion_matrix(cm, n_classes)

# Classification Report
print("Classification Report:")
print(rf.classification_report(y_true, y_oob))

results[run_type] = {
    'mode': f"{mode} {weighting}",
    'oob_error': oob_error,
    'confusion_matrix': cm,
    'time': elapsed,
    'oob_preds': oob_preds,
    'overall_imp': overall_imp,
    'local_imp': local_imp
}


In [None]:
# GPU, non-case-wise: train one forest, report OOB metrics and feature
# importances, and store everything in `results` under `run_type`.
use_gpu = True
mode = 'gpu' if use_gpu else 'cpu'
use_casewise = False
weighting = 'case-wise' if use_casewise else 'non-case-wise'
run_type = 'gpu_ncw'  # key of this run in `results`

# Create model with overall AND local importance
model = rf.RandomForestClassifier(
    ntree=ntree,
    mtry=4,  # sqrt(13) ≈ 3.6
    nsample=X.shape[0],
    nclass=n_classes,
    use_gpu=use_gpu,
    batch_size=25,
    iseed=42,
    compute_proximity=False,
    compute_importance=True,          # Overall importance
    compute_local_importance=True,    # Local importance (per-sample)
    use_casewise=use_casewise
)

# Train
rule = '=' * 70
print("\n" + rule)
print(f"  {mode.upper()} {weighting.upper()}")
print(rule)
print(f"\nTraining {ntree} trees...")
# perf_counter() is monotonic and high-resolution; time.time() follows the
# wall clock and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
model.fit(X, y)
elapsed = time.perf_counter() - start_time

# Get results
oob_error = model.get_oob_error()
oob_preds = model.get_oob_predictions()
overall_imp = model.feature_importances_()
local_imp = model.get_local_importance()

# Guard the throughput figure against a zero-length timing interval.
trees_per_sec = ntree / elapsed if elapsed > 0 else float('inf')
print(f"Training time: {elapsed:.2f}s ({trees_per_sec:.1f} trees/sec)")

# OOB Error
print(f"\nOOB Error: {oob_error:.6f} ({oob_error*100:.2f}%)")
print(f"   OOB Accuracy: {(1-oob_error)*100:.2f}%")

# Overall Feature Importance (descending order)
print("\nOverall Feature Importance (Top 5):")
sorted_idx = np.argsort(overall_imp)[::-1]
for rank, idx in enumerate(sorted_idx[:5], 1):
    print(f"   {rank}. {FEATURE_NAMES[idx]:<35} {overall_imp[idx]:.6f}")

# Local Importance Statistics
# NOTE(review): averaging over axis 0 assumes local_imp is laid out as
# (n_samples, n_features) — confirm against the rfx documentation.
print(f"\nLocal Importance: shape={local_imp.shape}")
local_mean = np.mean(local_imp, axis=0)
sorted_local_idx = np.argsort(local_mean)[::-1]
print(f"   Top 3 by mean: {', '.join([FEATURE_NAMES[i] for i in sorted_local_idx[:3]])}")

# Cast labels and OOB predictions to int32 once; both metric helpers below
# receive the same pair of arrays.
y_true = y.astype(np.int32)
y_oob = oob_preds.astype(np.int32)

# Confusion Matrix
cm = rf.confusion_matrix(y_true, y_oob)
print("\nConfusion Matrix:")
print_confusion_matrix(cm, n_classes)

# Classification Report
print("Classification Report:")
print(rf.classification_report(y_true, y_oob))

results[run_type] = {
    'mode': f"{mode} {weighting}",
    'oob_error': oob_error,
    'confusion_matrix': cm,
    'time': elapsed,
    'oob_preds': oob_preds,
    'overall_imp': overall_imp,
    'local_imp': local_imp
}


In [None]:
# CPU, non-case-wise: train one forest, report OOB metrics and feature
# importances, and store everything in `results` under `run_type`.
use_gpu = False
mode = 'gpu' if use_gpu else 'cpu'
use_casewise = False
weighting = 'case-wise' if use_casewise else 'non-case-wise'
run_type = 'cpu_ncw'  # key of this run in `results`

# Create model with overall AND local importance
model = rf.RandomForestClassifier(
    ntree=ntree,
    mtry=4,  # sqrt(13) ≈ 3.6
    nsample=X.shape[0],
    nclass=n_classes,
    use_gpu=use_gpu,
    batch_size=25,
    iseed=42,
    compute_proximity=False,
    compute_importance=True,          # Overall importance
    compute_local_importance=True,    # Local importance (per-sample)
    use_casewise=use_casewise
)

# Train
rule = '=' * 70
print("\n" + rule)
print(f"  {mode.upper()} {weighting.upper()}")
print(rule)
print(f"\nTraining {ntree} trees...")
# perf_counter() is monotonic and high-resolution; time.time() follows the
# wall clock and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
model.fit(X, y)
elapsed = time.perf_counter() - start_time

# Get results
oob_error = model.get_oob_error()
oob_preds = model.get_oob_predictions()
overall_imp = model.feature_importances_()
local_imp = model.get_local_importance()

# Guard the throughput figure against a zero-length timing interval.
trees_per_sec = ntree / elapsed if elapsed > 0 else float('inf')
print(f"Training time: {elapsed:.2f}s ({trees_per_sec:.1f} trees/sec)")

# OOB Error
print(f"\nOOB Error: {oob_error:.6f} ({oob_error*100:.2f}%)")
print(f"   OOB Accuracy: {(1-oob_error)*100:.2f}%")

# Overall Feature Importance (descending order)
print("\nOverall Feature Importance (Top 5):")
sorted_idx = np.argsort(overall_imp)[::-1]
for rank, idx in enumerate(sorted_idx[:5], 1):
    print(f"   {rank}. {FEATURE_NAMES[idx]:<35} {overall_imp[idx]:.6f}")

# Local Importance Statistics
# NOTE(review): averaging over axis 0 assumes local_imp is laid out as
# (n_samples, n_features) — confirm against the rfx documentation.
print(f"\nLocal Importance: shape={local_imp.shape}")
local_mean = np.mean(local_imp, axis=0)
sorted_local_idx = np.argsort(local_mean)[::-1]
print(f"   Top 3 by mean: {', '.join([FEATURE_NAMES[i] for i in sorted_local_idx[:3]])}")

# Cast labels and OOB predictions to int32 once; both metric helpers below
# receive the same pair of arrays.
y_true = y.astype(np.int32)
y_oob = oob_preds.astype(np.int32)

# Confusion Matrix
cm = rf.confusion_matrix(y_true, y_oob)
print("\nConfusion Matrix:")
print_confusion_matrix(cm, n_classes)

# Classification Report
print("Classification Report:")
print(rf.classification_report(y_true, y_oob))

results[run_type] = {
    'mode': f"{mode} {weighting}",
    'oob_error': oob_error,
    'confusion_matrix': cm,
    'time': elapsed,
    'oob_preds': oob_preds,
    'overall_imp': overall_imp,
    'local_imp': local_imp
}


In [None]:
# CPU, case-wise: train one forest, report OOB metrics and feature
# importances, and store everything in `results` under `run_type`.
use_gpu = False
mode = 'gpu' if use_gpu else 'cpu'
use_casewise = True
weighting = 'case-wise' if use_casewise else 'non-case-wise'
run_type = 'cpu_cw'  # key of this run in `results`

# Create model with overall AND local importance
model = rf.RandomForestClassifier(
    ntree=ntree,
    mtry=4,  # sqrt(13) ≈ 3.6
    nsample=X.shape[0],
    nclass=n_classes,
    use_gpu=use_gpu,
    batch_size=25,
    iseed=42,
    compute_proximity=False,
    compute_importance=True,          # Overall importance
    compute_local_importance=True,    # Local importance (per-sample)
    use_casewise=use_casewise
)

# Train
rule = '=' * 70
print("\n" + rule)
print(f"  {mode.upper()} {weighting.upper()}")
print(rule)
print(f"\nTraining {ntree} trees...")
# perf_counter() is monotonic and high-resolution; time.time() follows the
# wall clock and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
model.fit(X, y)
elapsed = time.perf_counter() - start_time

# Get results
oob_error = model.get_oob_error()
oob_preds = model.get_oob_predictions()
overall_imp = model.feature_importances_()
local_imp = model.get_local_importance()

# Guard the throughput figure against a zero-length timing interval.
trees_per_sec = ntree / elapsed if elapsed > 0 else float('inf')
print(f"Training time: {elapsed:.2f}s ({trees_per_sec:.1f} trees/sec)")

# OOB Error
print(f"\nOOB Error: {oob_error:.6f} ({oob_error*100:.2f}%)")
print(f"   OOB Accuracy: {(1-oob_error)*100:.2f}%")

# Overall Feature Importance (descending order)
print("\nOverall Feature Importance (Top 5):")
sorted_idx = np.argsort(overall_imp)[::-1]
for rank, idx in enumerate(sorted_idx[:5], 1):
    print(f"   {rank}. {FEATURE_NAMES[idx]:<35} {overall_imp[idx]:.6f}")

# Local Importance Statistics
# NOTE(review): averaging over axis 0 assumes local_imp is laid out as
# (n_samples, n_features) — confirm against the rfx documentation.
print(f"\nLocal Importance: shape={local_imp.shape}")
local_mean = np.mean(local_imp, axis=0)
sorted_local_idx = np.argsort(local_mean)[::-1]
print(f"   Top 3 by mean: {', '.join([FEATURE_NAMES[i] for i in sorted_local_idx[:3]])}")

# Cast labels and OOB predictions to int32 once; both metric helpers below
# receive the same pair of arrays.
y_true = y.astype(np.int32)
y_oob = oob_preds.astype(np.int32)

# Confusion Matrix
cm = rf.confusion_matrix(y_true, y_oob)
print("\nConfusion Matrix:")
print_confusion_matrix(cm, n_classes)

# Classification Report
print("Classification Report:")
print(rf.classification_report(y_true, y_oob))

results[run_type] = {
    'mode': f"{mode} {weighting}",
    'oob_error': oob_error,
    'confusion_matrix': cm,
    'time': elapsed,
    'oob_preds': oob_preds,
    'overall_imp': overall_imp,
    'local_imp': local_imp
}


In [None]:
# Cross-configuration summary. Reads only `results`, which the configuration
# cells fill in; configurations that were not run (e.g. the GPU cells on a
# machine without a GPU) are tolerated instead of raising KeyError.
print("=" * 70)
print("  WINE CLASSIFICATION - OVERALL & LOCAL IMPORTANCE")
print("  Testing: GPU/CPU × Casewise/Non-casewise")
print("=" * 70)

# Summary comparison
print("\n" + "=" * 70)
print("  SUMMARY COMPARISON")
print("=" * 70)

print("\nOOB Errors:")
print(f"   {'Configuration':<25s} {'OOB Error':>12s} {'Accuracy':>12s} {'Time':>10s}")
print("   " + "-" * 60)
for key, res in results.items():
    print(f"   {res['mode']:<25s} {res['oob_error']:>12.6f} {(1-res['oob_error'])*100:>11.2f}% {res['time']:>9.2f}s")

# Feature Importance Comparison
print("\nTop 3 Features by Overall Importance:")
print(f"   {'Configuration':<25s} {'#1':<20s} {'#2':<20s} {'#3':<20s}")
print(f"   {'-'*85}")
for key, res in results.items():
    sorted_idx = np.argsort(res['overall_imp'])[::-1]
    # Names are clipped to 18 characters so they fit the 20-wide columns.
    top3 = [FEATURE_NAMES[i][:18] for i in sorted_idx[:3]]
    print(f"   {res['mode']:<25s} {top3[0]:<20s} {top3[1]:<20s} {top3[2]:<20s}")

# Importance Correlation (optional: scipy may not be installed)
try:
    from scipy.stats import spearmanr
    print("\nOverall Importance Spearman Correlations:")
    configs = list(results.keys())
    print(f"   {'':25s}", end="")
    for c in configs:
        print(f"{results[c]['mode'][:12]:>14s}", end="")
    print()
    # Full pairwise matrix, including the diagonal and both triangles.
    for c1 in configs:
        print(f"   {results[c1]['mode']:<25s}", end="")
        for c2 in configs:
            corr, _ = spearmanr(results[c1]['overall_imp'], results[c2]['overall_imp'])
            print(f"{corr:>14.4f}", end="")
        print()
except ImportError:
    print("\n   (scipy not available for correlation analysis)")

# OOB-error gaps below this threshold are reported as "identical".
IDENTICAL_TOL = 0.001


def _casewise_gap(backend):
    """Return |OOB(case-wise) - OOB(non-case-wise)| for `backend`, or None if either run is missing."""
    cw_run = results.get(f"{backend}_cw")
    ncw_run = results.get(f"{backend}_ncw")
    if cw_run is None or ncw_run is None:
        return None
    return abs(cw_run['oob_error'] - ncw_run['oob_error'])


print("\nCasewise vs Non-casewise Differences:")
gpu_diff = _casewise_gap('gpu')
cpu_diff = _casewise_gap('cpu')
for label, diff in (("GPU", gpu_diff), ("CPU", cpu_diff)):
    if diff is None:
        print(f"   {label}:  n/a (run both {label} configurations to compare)")
    else:
        print(f"   {label}:  {diff:.6f} ({diff*100:.2f}% difference)")

# The verdict uses every backend for which both runs exist.
available_diffs = [d for d in (gpu_diff, cpu_diff) if d is not None]
if not available_diffs:
    print("\n   No casewise/non-casewise pair available to compare")
elif all(d < IDENTICAL_TOL for d in available_diffs):
    print("\n   WARNING: Casewise and non-casewise produce IDENTICAL results!")
else:
    print("\n   Casewise and non-casewise produce DIFFERENT results (expected!)")

print("\n" + "=" * 70)
print("  TEST COMPLETE")
print("=" * 70)