In [1]:
# Required packages: pip install scikit-learn pandas numpy
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold  # Changed to StratifiedKFold
from sklearn.metrics import confusion_matrix

# Location of the Spambase CSV, kept in one named constant so the
# machine-specific path is easy to find and change on another machine.
DATA_PATH = 'c:/Users/clagg/Downloads/spambase/spambase.data'

# Load the data. The file has no header row, so columns are labelled by position.
# Note: The last column is the target variable (spam = 1, non-spam = 0)
data = pd.read_csv(DATA_PATH, header=None)

# Fail early with a clear message if the file cannot yield features + target.
if data.empty or data.shape[1] < 2:
    raise ValueError(
        f"Expected at least one feature column and one target column in "
        f"{DATA_PATH!r}, got shape {data.shape}"
    )

# Split into features and target
X = data.iloc[:, :-1]  # All columns except the last one
y = data.iloc[:, -1]   # Last column (target)

# The evaluation further down unpacks a 2x2 confusion matrix, so anything other
# than 0/1 labels (including missing values) would otherwise only show up later
# as an opaque unpacking error.
if not y.isin([0, 1]).all():
    raise ValueError(
        f"Target column must contain only 0 (non-spam) and 1 (spam); "
        f"found values {sorted(y.dropna().unique().tolist())}"
        f"{' plus missing values' if y.isna().any() else ''}"
    )

# Accumulator: one dict of confusion counts and derived scores per CV fold
fold_metrics = list()

# Stratified 10-fold splitter. Rows are shuffled with a fixed seed, and
# stratification keeps the spam / non-spam ratio similar in every fold.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=10,
)

# Random forest used for every fold and for the final fit. Tune as needed:
#   n_estimators - how many trees are grown
#   max_depth    - None leaves tree depth unrestricted
#   random_state - fixed seed so repeated runs give the same forest
rf_classifier = RandomForestClassifier(
    random_state=42,
    max_depth=None,
    n_estimators=100,
)

# Perform 10-fold cross validation: refit the forest on each training split,
# score it on the held-out split, and record one metrics dict per fold.
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_classifier.fit(X_train, y_train)  # Fit in the loop

    y_pred = rf_classifier.predict(X_test)

    # Pin the label order so the matrix is always 2x2 ([[tn, fp], [fn, tp]]).
    # Without `labels`, a fold where only one class shows up in both the truth
    # and the predictions yields a 1x1 matrix and the unpacking below fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate metrics. Each rate falls back to 0 when its denominator is 0
    # (i.e. the fold has no positives or no negatives).
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # sensitivity / recall
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0  # specificity
    fpr = fp / (tn + fp) if (tn + fp) > 0 else 0
    fnr = fn / (tp + fn) if (tp + fn) > 0 else 0

    # True Skill Statistic
    tss = tpr + tnr - 1

    # Heidke Skill Score; the denominator is computed once and reused for the
    # zero guard.
    hss_denom = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    hss = (2 * (tp * tn - fp * fn)) / hss_denom if hss_denom > 0 else 0

    fold_metrics.append({
        'Fold': fold + 1,  # 1-based fold number for display
        'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
        'TPR': tpr, 'TNR': tnr, 'FPR': fpr, 'FNR': fnr,
        'TSS': tss, 'HSS': hss
    })

# Fold-wise table: one row per dict collected during cross-validation
fold_metrics_df = pd.DataFrame(fold_metrics)

# Column-wise means of all numeric columns. The averaged fold number carries no
# meaning, so that entry is overwritten with the row label instead.
average_metrics = {
    **fold_metrics_df.mean(numeric_only=True).to_dict(),
    'Fold': 'Average',
}
average_metrics_df = pd.DataFrame([average_metrics])  # single-row frame

# Stack the per-fold rows with the summary row underneath
results_df = pd.concat(
    (fold_metrics_df, average_metrics_df),
    ignore_index=True,
)

# Print the table as plain text; rate and skill-score columns use 4 decimals
print("\nPerformance Metrics for Each Fold and Average:")
print(
    results_df.to_string(
        index=False,
        formatters={
            rate_col: '{:.4f}'.format
            for rate_col in ('TPR', 'TNR', 'FPR', 'FNR', 'TSS', 'HSS')
        },
    )
)

# Refit on every sample so the importances below come from a model trained on
# the full dataset rather than on the last cross-validation training split
rf_classifier.fit(X, y)

# One row per feature (identified by its column position), ordered from the
# most to the least important
feature_importances = (
    pd.Series(rf_classifier.feature_importances_, name='importance')
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='importance', ascending=False)
)

# Show only the ten highest-ranked features, without the frame's index
print("\nTop 10 Most Important Features:")
print(feature_importances.iloc[:10].to_string(index=False))



Performance Metrics for Each Fold and Average:
   Fold    TP    TN   FP   FN    TPR    TNR    FPR    FNR    TSS    HSS
      1 169.0 270.0  9.0 13.0 0.9286 0.9677 0.0323 0.0714 0.8963 0.8998
      2 169.0 271.0  7.0 13.0 0.9286 0.9748 0.0252 0.0714 0.9034 0.9086
      3 171.0 271.0  7.0 11.0 0.9396 0.9748 0.0252 0.0604 0.9144 0.9179
      4 171.0 274.0  5.0 10.0 0.9448 0.9821 0.0179 0.0552 0.9268 0.9313
      5 169.0 266.0 13.0 12.0 0.9337 0.9534 0.0466 0.0663 0.8871 0.8862
      6 169.0 271.0  8.0 12.0 0.9337 0.9713 0.0287 0.0663 0.9050 0.9086
      7 171.0 270.0  9.0 10.0 0.9448 0.9677 0.0323 0.0552 0.9125 0.9134
      8 168.0 272.0  7.0 13.0 0.9282 0.9749 0.0251 0.0718 0.9031 0.9084
      9 163.0 270.0  9.0 18.0 0.9006 0.9677 0.0323 0.0994 0.8683 0.8759
     10 165.0 273.0  6.0 16.0 0.9116 0.9785 0.0215 0.0884 0.8901 0.8988
Average 168.5 270.8  8.0 12.8 0.9294 0.9713 0.0287 0.0706 0.9007 0.9049

Top 10 Most Important Features:
 feature  importance
      51    0.122275
      52    0