## Setting up the environment, loading & splitting of the data, and class distribution analysis

In [60]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import time

# Read the Spambase file (no header row): every column except the last is a
# feature, the last column is the class label.
data = pd.read_csv("spambase.data", header=None)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# One shuffled, seeded stratified splitter is shared by every model so that
# all of them are scored on exactly the same folds.
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Candidate classifiers, keyed by the display name used in the reports.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=5000, random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["XGBoost"] = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Share of each class, expressed as a percentage of all rows.
class_distribution = y.value_counts(normalize=True).mul(100)
print("Class Distribution (%):\n", class_distribution)


Class Distribution (%):
 57
0    60.595523
1    39.404477
Name: proportion, dtype: float64


The class distribution is <mark>**60.6:39.4**</mark>, which indicates a mild imbalance, just meeting the threshold. Given the nature of the data — emails classified as Spam or Not Spam — <mark>**Recall**</mark> is a critical metric for evaluating model performance, because in spam detection minimizing false negatives (missed spam emails) is crucial to ensure a reliable system. The evaluation below reports the F1 score (which combines recall with precision), together with accuracy and training time.

## Helper function definitions

The following three helper functions are used to:
- Perform stratified k-fold cross-validation
- Build a per-fold comparison table (with ranks) for a metric such as the F1 score
- Compute the average-rank statistics needed for the Friedman statistic


In [64]:
def stratified_k_fold_eval(model, model_name):
    """Evaluate ``model`` on every fold of the module-level splitter.

    For each ``(train, test)`` split produced by ``skf.split(X, y)`` the model
    is fitted on the training rows (the fit is timed) and scored on the
    held-out rows with accuracy and F1.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name of the model. It is not used in the
            computation; the parameter is kept so existing callers keep
            working.

    Returns:
        A ``(results_df, fold_metrics)`` tuple.

        ``results_df`` has one row per fold with the columns "Fold",
        "Training Time (s)", "Accuracy" and "F1 Score", followed by a "Mean"
        row and a "Std" row computed over the folds.

        ``fold_metrics`` maps "F1 Score", "Accuracy" and "Training Time (s)"
        to the list of per-fold values, in fold order.
    """
    metric_columns = ["Training Time (s)", "Accuracy", "F1 Score"]
    results = []

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump when the system clock is
        # adjusted, which would corrupt the timing comparison.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        y_pred = model.predict(X_test)
        results.append({
            "Fold": fold_idx,
            "Training Time (s)": train_time,
            "Accuracy": accuracy_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred),
        })

    # Per-fold values by metric, derived from the same records as the table
    # so the two views cannot drift apart.
    fold_metrics = {
        name: [row[name] for row in results]
        for name in ("F1 Score", "Accuracy", "Training Time (s)")
    }

    results_df = pd.DataFrame(results)

    # Summary rows are computed from the fold rows only, before being appended.
    summary_rows = [
        {"Fold": "Mean", **results_df[metric_columns].mean().to_dict()},
        {"Fold": "Std", **results_df[metric_columns].std().to_dict()},
    ]
    results_df = pd.concat(
        [results_df, pd.DataFrame(summary_rows)], ignore_index=True
    )

    return results_df, fold_metrics


def metric_comparison(precomputed_metrics, metric_name):
    """
    Build, print and return a per-fold comparison table for one metric.

    Each model cell is formatted as ``"<value:.4f> (<rank>)"`` where rank 1 is
    the best model in that fold. Higher values are better for every metric
    except "Training Time (s)", where lower is better. Equal values are not
    given a shared rank: they receive consecutive ranks in the order the
    models appear in ``precomputed_metrics``.

    Args:
        precomputed_metrics: Mapping of model name to a mapping of metric
            name to the sequence of per-fold values.
        metric_name: Metric to compare; must be a key of every model's
            metrics mapping.

    Returns:
        DataFrame with a "Fold" column ("Fold 1" .. "Fold n", then
        "Avg Rank") and one column per model. The last row holds each model's
        average rank formatted with two decimals.

    Raises:
        ValueError: If the models do not all report the same number of folds,
            or report no folds at all.
    """
    metric_results = {
        model_name: list(metrics[metric_name])
        for model_name, metrics in precomputed_metrics.items()
    }

    # Take the fold count from the data that was actually passed in instead
    # of a module-level constant, so the table always matches its input.
    fold_counts = {len(values) for values in metric_results.values()}
    if len(fold_counts) > 1:
        raise ValueError(
            f"Models report different numbers of folds for {metric_name!r}: "
            f"{sorted(fold_counts)}"
        )
    n_folds = fold_counts.pop() if fold_counts else 0
    if metric_results and n_folds == 0:
        raise ValueError(f"No fold values available for {metric_name!r}")

    higher_is_better = metric_name != "Training Time (s)"

    table = {"Fold": [f"Fold {i}" for i in range(1, n_folds + 1)]}
    table.update({model_name: [] for model_name in metric_results})
    rank_totals = dict.fromkeys(metric_results, 0)

    for fold_idx in range(n_folds):
        fold_values = {
            model_name: values[fold_idx]
            for model_name, values in metric_results.items()
        }

        # Best model first; sorted() is stable, so ties keep model order.
        ordered = sorted(fold_values, key=fold_values.get, reverse=higher_is_better)

        for rank, model_name in enumerate(ordered, start=1):
            table[model_name].append(f"{fold_values[model_name]:.4f} ({rank})")
            # Keep ranks as numbers rather than re-parsing the formatted cell.
            rank_totals[model_name] += rank

    # Final row: average rank of each model over all folds.
    table["Fold"].append("Avg Rank")
    for model_name, total in rank_totals.items():
        table[model_name].append(f"{total / n_folds:.2f}")

    results_df = pd.DataFrame(table)

    print(f"\n{metric_name} Comparison:\n")
    print(results_df.to_string(index=False))

    return results_df




def calculate_rank_statistics(results_df, metric_name):
    """Print the Friedman statistic for a rank table and return an annotated copy.

    The statistic is computed as the ratio of two terms, where ``R̅`` is the
    mean of the models' average ranks:

    * ``n * Σj (Rj - R̅)^2`` over the per-model average ranks ``Rj``
    * ``(1 / (n * (k - 1))) * Σij (Rij - R̅)^2`` over every per-fold rank

    Args:
        results_df: Rank table whose first column is "Fold", whose other
            columns are one per model with cells ending in the rank in
            parentheses (e.g. ``"0.9132 (3)"``), and whose last row holds the
            average ranks. Only the per-fold rows are read for the ranks.
        metric_name: Label used in the printed header.

    Returns:
        A new DataFrame equal to ``results_df`` with an "R_bar" row inserted
        at the top. ``results_df`` itself is left unmodified, so the function
        can be called repeatedly on the same table.

    Raises:
        ValueError: If the table has no fold rows or fewer than two model
            columns (the statistic is undefined in both cases).
    """
    print(f"\nFriedman statistic calculation with metric:{metric_name}")

    # Number of folds (n) and number of models (k).
    n = len(results_df) - 1  # Exclude the last row (average ranks)
    k = len(results_df.columns) - 1  # Exclude the "Fold" column
    if n < 1 or k < 2:
        raise ValueError(
            "Friedman statistic needs at least one fold row and two models; "
            f"got n={n}, k={k}"
        )

    # Per-fold ranks, parsed from the "(rank)" suffix of every cell.
    rank_cells = results_df.iloc[:n, 1:].to_numpy()
    ranks_matrix = np.array(
        [
            [float(str(cell).split('(')[-1].strip(')')) for cell in row]
            for row in rank_cells
        ],
        dtype=float,
    )

    # Average ranks are taken from the parsed ranks rather than from the
    # "Avg Rank" row, which has already been rounded to two decimals.
    avg_ranks = ranks_matrix.mean(axis=0)

    # R_bar: average of the average ranks.
    R_bar = np.mean(avg_ranks)

    # n * SUM_j (Rj - R_bar)^2
    term2 = n * np.sum((avg_ranks - R_bar) ** 2)

    # (1 / (n * (k - 1))) * SUM_ij (Rij - R_bar)^2
    term3 = (1 / (n * (k - 1))) * np.sum((ranks_matrix - R_bar) ** 2)

    friedman_statistic = term2 / term3

    # Display results
    print(f"\tR̅ = {R_bar:.4f}")
    print(f"\tN * Σj (Rj - R̅)^2 = {term2:.4f}")
    print(f"\t(1 / (n * (k - 1))) * Σij (Rij - R̅)^2 = {term3:.4f}")
    print(f"\tFriedman statistic which is the ratio of the 2nd and 3nd terms = {term2:.4f}/{term3:.4f} = {friedman_statistic:.4f}" )

    # Put the R_bar row on top of a fresh frame instead of editing the input.
    r_bar_row = pd.DataFrame(
        [["R_bar"] + [f"{R_bar:.4f}"] * k], columns=results_df.columns
    )
    return pd.concat([r_bar_row, results_df], ignore_index=True)

# Procedure 1 & 2

In [66]:
# Cross-validate each selected model over the 10 folds, keeping both the
# per-fold report table and the raw per-fold metrics, and print the report.

all_results, metrics_per_model = {}, {}
for model_name in models:
    model = models[model_name]
    eval_results, metrics = stratified_k_fold_eval(model, model_name)
    all_results[model_name], metrics_per_model[model_name] = eval_results, metrics

    print(f"\nCross-validatiaon results for {model_name}:\n")
    print(eval_results.to_string(index=False))


Cross-validatiaon results for Logistic Regression:

Fold  Training Time (s)  Accuracy  F1 Score
   1           3.998265  0.917570  0.895028
   2           4.226130  0.923913  0.901961
   3           6.156433  0.932609  0.913165
   4           3.926541  0.936957  0.918768
   5           3.548844  0.913043  0.890110
   6           3.880666  0.936957  0.920110
   7           4.809063  0.934783  0.915730
   8           3.740062  0.930435  0.909605
   9           2.888073  0.936957  0.915452
  10           4.568816  0.915217  0.890141
Mean           4.174289  0.927844  0.907007
 Std           0.875775  0.009581  0.011743

Cross-validatiaon results for Random Forest:

Fold  Training Time (s)  Accuracy  F1 Score
   1           3.727035  0.952278  0.938889
   2           3.686904  0.956522  0.944134
   3           2.803832  0.960870  0.950000
   4           3.336029  0.967391  0.957983
   5           3.956414  0.945652  0.931129
   6           2.648351  0.956522  0.944134
   7           2.545

# Procedure 3 Friedman test

In [71]:
# Per-fold comparison (with ranks) of the selected models, one table per
# metric, built in the order F1 score, accuracy, training time.
f1_comparison_table, accuracy_comparison_table, training_time_comparison_table = (
    metric_comparison(metrics_per_model, metric)
    for metric in ("F1 Score", "Accuracy", "Training Time (s)")
)


F1 Score Comparison:

    Fold Logistic Regression Random Forest    XGBoost
  Fold 1          0.8950 (3)    0.9389 (1) 0.9315 (2)
  Fold 2          0.9020 (3)    0.9441 (2) 0.9471 (1)
  Fold 3          0.9132 (3)    0.9500 (1) 0.9482 (2)
  Fold 4          0.9188 (3)    0.9580 (1) 0.9500 (2)
  Fold 5          0.8901 (3)    0.9311 (2) 0.9428 (1)
  Fold 6          0.9201 (3)    0.9441 (2) 0.9444 (1)
  Fold 7          0.9157 (3)    0.9474 (1) 0.9363 (2)
  Fold 8          0.9096 (3)    0.9438 (2) 0.9479 (1)
  Fold 9          0.9155 (3)    0.9235 (2) 0.9471 (1)
 Fold 10          0.8901 (3)    0.9375 (1) 0.9292 (2)
Avg Rank                3.00          1.50       1.50

Accuracy Comparison:

    Fold Logistic Regression Random Forest    XGBoost
  Fold 1          0.9176 (3)    0.9523 (1) 0.9458 (2)
  Fold 2          0.9239 (3)    0.9565 (2) 0.9587 (1)
  Fold 3          0.9326 (3)    0.9609 (1) 0.9587 (2)
  Fold 4          0.9370 (3)    0.9674 (1) 0.9609 (2)
  Fold 5          0.9130 (3)    0.94

In [73]:
# Friedman statistic for each metric's rank table, in the order F1 score,
# accuracy, training time; updated_df ends up holding the last result.
for _table, _metric in (
    (f1_comparison_table, "F1 Score"),
    (accuracy_comparison_table, "Accuracy"),
    (training_time_comparison_table, "Training Time (s)"),
):
    updated_df = calculate_rank_statistics(_table, _metric)



Friedman statistic calculation with metric:F1 Score
	R̅ = 2.0000
	N * Σj (Rj - R̅)^2 = 15.0000
	(1 / (n * (k - 1))) * Σij (Rij - Rj)^2 = 1.0000
	Friedman statistic which is the ratio of the 2nd and 3nd terms = 15.0000/1.0000 = 15.0000

Friedman statistic calculation with metric:Accuracy
	R̅ = 2.0000
	N * Σj (Rj - R̅)^2 = 15.2000
	(1 / (n * (k - 1))) * Σij (Rij - Rj)^2 = 1.0000
	Friedman statistic which is the ratio of the 2nd and 3nd terms = 15.2000/1.0000 = 15.2000

Friedman statistic calculation with metric:Training Time (s)
	R̅ = 2.0000
	N * Σj (Rj - R̅)^2 = 18.2000
	(1 / (n * (k - 1))) * Σij (Rij - Rj)^2 = 1.0000
	Friedman statistic which is the ratio of the 2nd and 3nd terms = 18.2000/1.0000 = 18.2000


# F1 score Friedman test analysis

The critical value for k=3 and n=10 at the α=0.05 level is 6.20 (obtained from **Tables for the Friedman rank test**)
Since the Friedman statistic (15) is greater than the critical value (6.2), we reject the **Null Hypothesis**. This suggests that at least one model is performing significantly differently for the given dataset but does not indicate which model is performing differently. Therefore, we conduct the Nemenyi test.

## Procedure 4 Nemenyi test for F1 score

To conduct the Nemenyi test, we compute the absolute difference in average rank between each pair of models and compare it with the critical difference (CD). To calculate the critical difference we use a **qα** value of 2.343 for α=0.05, k=3 and infinite degrees of freedom (obtained from the **Studentized range q-table**).


***We will be using this critical difference for each metric.***

Critical difference calculation

$CD = q_\alpha \cdot \sqrt{\frac{k \cdot (k+1)}{6 \cdot N}}$  
$= 2.343 \cdot \sqrt{\frac{3 \cdot 4}{6 \cdot 10}}$  
$= 2.343 \cdot 0.45$  
$= 1.05$


### Final Analysis
For \( R_j = [3.0, 1.5, 1.5] \) (Logistic Regression, Random Forest, XGBoost) and \( CD = 1.05 \), the output will be:

| Comparison                          | Rank Difference | Significant |
|---------------------|---------------|-------------------------------|
|Logistic Regression Vs Random Forest | 1.50            | True        |
|Random Forest Vs XGBoost             | 0.00            | False       |
|Logistic Regression Vs XGBoost       | 1.50            | True        |

### Compare each model

1. **Logistic Regression vs Random Forest:**  
   \( |1.5 - 3.0| = 1.5 \)  
   Since \( 1.5 > 1.05 \), this is **significant**.

2. **Random Forest Vs XGBoost:**  
   \( |1.5 - 1.5| = 0.0 \)  
   Since \( 0.0 \leq 1.05 \), this is **not significant**

3. **Logistic Regression** Vs **XGBoost**  
   \( |3.0 - 1.5| = 1.5 \)  
   Since \( 1.5 > 1.05 \), this is **significant**.

This table summarizes the Nemenyi test results.

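***From the above Nemenyi test for the F1 score, we can conclude that Logistic Regression is performing significantly differently from the other two models (Random Forest and XGBoost), while Random Forest and XGBoost do not differ significantly from each other.***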


# Accuracy Friedman test analysis

Since the Friedman statistic (15.2) is greater than the critical value (6.2), we reject the **Null Hypothesis**. This suggests that at least one model is performing significantly differently for the given dataset but does not indicate which model is performing differently. Therefore, we conduct the Nemenyi test.

## Procedure 4 Nemenyi test for Accuracy

### Final Analysis

| Comparison                          | Rank Difference | Significant |
|---------------------|---------------|-------------------------------|
|Logistic Regression Vs Random Forest | 1.60            | True        |
|Random Forest Vs XGBoost             | 0.20            | False       |
|Logistic Regression Vs XGBoost       | 1.40            | True        |

### Compare each model
1. **Logistic Regression vs Random Forest:**  
   \( |1.4 - 3.0| = 1.6 \)  
   Since \( 1.6 > 1.05 \), this is **significant**.

2. **Random Forest Vs XGBoost:**  
   \( |1.4 - 1.6| = 0.2 \)  
   Since \( 0.20 \leq 1.05 \), this is **not significant**

3. **Logistic Regression** Vs **XGBoost**  
   \( |3.0 - 1.6| = 1.4 \)  
   Since \( 1.4 > 1.05 \), this is **significant**.

This table summarizes the Nemenyi test results.


***From the above Nemenyi test for Accuracy, we can conclude that Logistic Regression is performing significantly differently from the other two models (Random Forest and XGBoost), while Random Forest and XGBoost do not differ significantly from each other.***


# Training Time Friedman test analysis

Since the Friedman statistic (18.2) is greater than the critical value (6.2), we reject the **Null Hypothesis**. This suggests that at least one model is performing significantly differently for the given dataset but does not indicate which model is performing differently. Therefore, we conduct the Nemenyi test.

## Procedure 4 Nemenyi test for Training Time

### Final Analysis

| Comparison                          | Rank Difference | Significant |
|---------------------|---------------|-------------------------------|
|Logistic Regression Vs Random Forest | 1.00            | False       |
|Random Forest Vs XGBoost             | 1.00            | False       |
|Logistic Regression Vs XGBoost       | 2.00            | True        |

### Compare each model
1. **Logistic Regression vs Random Forest:**  
   \( |3.0 - 2.0| = 1.0 \)  
   Since \( 1.0 < 1.05 \), this is **not significant**.

2. **Random Forest Vs XGBoost:**  
   \( |2.0 - 1.0| = 1.0 \)  
   Since \( 1.0 < 1.05 \), this is **not significant**

3. **Logistic Regression** Vs **XGBoost**  
   \( |3.0 - 1.0| = 2.0 \)  
   Since \( 2.0 > 1.05 \), this is **significant**.

This table summarizes the Nemenyi test results.


***From the above Nemenyi test for Training Time, we can conclude that Logistic Regression and XGBoost are performing significantly differently from each other.***
