In [81]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data set from a CSV-formatted file in the working directory.
# header=None makes pandas treat the first row as data and label the columns
# with integers (presumably the file has no header row — verify).
file = "spambase.data"  
data = pd.read_csv(file, header=None)
# Optional inspection helpers, left disabled:
# data.head()
# data.info()
# data.describe()
# Display (rows, columns) of the loaded frame as the cell output.
data.shape

(4601, 58)

# Separate class and features, and scale the data

In [84]:
# Every column except the last becomes the feature matrix; the last column
# is used as the class label.
X = data.iloc[:, :-1]  
y = data.iloc[:, -1] 
# Display both shapes; the row counts should match.
X.shape, y.shape
# print(X[:5])

((4601, 57), (4601,))

In [86]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns with StandardScaler and overwrite X with
# the scaled values.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split and before cross-validation, so statistics of held-out rows
# influence the training data — consider fitting it inside each training fold
# (e.g. with a Pipeline).
# NOTE(review): fit_transform returns a NumPy array by default, so X is no
# longer a DataFrame after this line (no .iloc) — confirm downstream cells
# account for that.
sc = StandardScaler()
X = sc.fit_transform(X)

# print(X[:5])

# Data split

In [89]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=7 fixes the shuffle.
# NOTE(review): no stratify=y is passed, so class proportions may differ
# between the two parts. The cross-validation cells further down reassign
# X_train / X_test / y_train / y_test on every fold, so this hold-out split is
# overwritten as soon as one of them runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# Display the shapes of the two feature partitions.
X_train.shape, X_test.shape
# y_train.shape, y_test.shape

((3680, 57), (921, 57))

# StratifiedKFold for all the models on the same folds


In [None]:
import time

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # lives in linear_model, not model_selection
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Compare several classifiers with stratified 10-fold cross-validation.
# For every model and fold the cell records accuracy, weighted F1 and the
# wall-clock time spent in fit(), prints them, and then prints the per-model
# averages. Everything is also kept in `results` for later inspection.

# Positional NumPy views of the data: fold indices can then be applied with
# plain [] indexing whether X / y are currently NumPy arrays (X is one right
# after StandardScaler.fit_transform) or pandas objects.
X_cv = np.asarray(X)
y_cv = np.asarray(y)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Materialize the splits once so every model is scored on exactly the same folds.
folds = list(skf.split(X_cv, y_cv))

# Define the models
models = {
    # Seeded so the forest's scores are reproducible between runs.
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
}

# One result bucket per model; each list receives one entry per fold.
results = {model_name: {"accuracies": [], "f1_scores": [], "train_times": []} for model_name in models}

# Perform Stratified K-Fold Cross-Validation for each model
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...\n")
    model_results = results[model_name]

    for fold, (train_index, test_index) in enumerate(folds, 1):
        # Fold-local names: the hold-out split stored in
        # X_train / X_test / y_train / y_test is deliberately left untouched.
        X_fold_train, X_fold_test = X_cv[train_index], X_cv[test_index]
        y_fold_train, y_fold_test = y_cv[train_index], y_cv[test_index]

        # Time only the fit() call; perf_counter is a monotonic,
        # high-resolution clock intended for measuring durations.
        start_time = time.perf_counter()
        model.fit(X_fold_train, y_fold_train)
        training_time = time.perf_counter() - start_time

        # Score the held-out part of the fold.
        y_pred = model.predict(X_fold_test)
        accuracy = accuracy_score(y_fold_test, y_pred)
        # Weighted F1 averages the per-class F1 scores by class support.
        # NOTE(review): the per-model cells in this notebook call f1_score
        # with its defaults instead, so their numbers are not directly
        # comparable with the ones printed here.
        f1 = f1_score(y_fold_test, y_pred, average='weighted')

        model_results["accuracies"].append(accuracy)
        model_results["f1_scores"].append(f1)
        model_results["train_times"].append(training_time)

        # Print results for the fold
        print(f"Fold {fold}: Training Time = {training_time:.4f} seconds")
        print(f"Fold {fold}: Accuracy = {accuracy:.4f}")
        print(f"Fold {fold}: F1-Measure = {f1:.4f}")
        print()

    # Print average results for the model
    print(f"{model_name} - Average Accuracy: {np.mean(model_results['accuracies']):.4f}")
    print(f"{model_name} - Average F1-Measure: {np.mean(model_results['f1_scores']):.4f}")
    print(f"{model_name} - Average Training Time: {np.mean(model_results['train_times']):.4f} seconds\n")


# StratifiedKFold RF

In [94]:
import time

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold cross-validation of a random forest.
# Prints the F1 score of every fold, then the mean and standard deviation.
# Accuracy and fit time are collected per fold as well (fold_accuracies,
# fold_train_times) but are not printed.

# Wrap in pandas containers so the fold indices can be applied with .iloc
# (X may be a plain NumPy array at this point).
X = pd.DataFrame(X)
y = pd.Series(y)

# Initialize StratifiedKFold with 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Only the random forest is needed in this cell. random_state is fixed so the
# per-fold scores can be reproduced between runs.
RF = RandomForestClassifier(random_state=42)

# Per-fold metrics, one entry appended per fold.
fold_accuracies = []
fold_f1_scores = []
fold_train_times = []

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    # Split the data into train and test for the current fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Time only the fit() call; perf_counter is a monotonic clock meant for
    # measuring durations.
    start_time = time.perf_counter()
    RF.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    fold_train_times.append(training_time)

    # Predict on the held-out part of the fold
    y_pred = RF.predict(X_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    # Evaluate F-measure (f1_score called with its defaults)
    f1 = f1_score(y_test, y_pred)
    fold_f1_scores.append(f1)

    print(f"Fold {fold}: F1-Measure = {f1:.4f}")

# Summary across folds
print(f"Average F1-Measure: {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1-Measure: {np.std(fold_f1_scores):.4f}")

Fold 1: F1-Measure = 0.9444
Fold 2: F1-Measure = 0.9494
Fold 3: F1-Measure = 0.9500
Fold 4: F1-Measure = 0.9526
Fold 5: F1-Measure = 0.9278
Fold 6: F1-Measure = 0.9444
Fold 7: F1-Measure = 0.9385
Fold 8: F1-Measure = 0.9468
Fold 9: F1-Measure = 0.9270
Fold 10: F1-Measure = 0.9348
Average F1-Measure: 0.9416
Standard Deviation of F1-Measure: 0.0087


# StratifiedKFold LogisticRegression

In [96]:
import time

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold cross-validation of logistic regression.
# Prints the F1 score of every fold followed by its mean and standard
# deviation; accuracy and fit time are collected per fold but not printed.

# pandas containers so the fold indices can be applied positionally via .iloc
X = pd.DataFrame(X)
y = pd.Series(y)

# Ten shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Classifier under evaluation; it is fitted again on every fold
LR = LogisticRegression()

# Per-fold metrics, one entry appended per fold
fold_accuracies, fold_f1_scores, fold_train_times = [], [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    # Training rows and held-out rows of the current fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Measure only the time spent inside fit()
    start_time = time.time()
    LR.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Score the predictions for the held-out rows
    y_pred = LR.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    fold_train_times.append(training_time)
    fold_accuracies.append(accuracy)
    fold_f1_scores.append(f1)

    print(f"Fold {fold}: F1-Measure = {f1:.4f}")

# Summary of the F1 scores across the ten folds
print(f"Average F1-Measure: {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1-Measure: {np.std(fold_f1_scores):.4f}")

Fold 1: F1-Measure = 0.8914
Fold 2: F1-Measure = 0.8989
Fold 3: F1-Measure = 0.9076
Fold 4: F1-Measure = 0.9136
Fold 5: F1-Measure = 0.8846
Fold 6: F1-Measure = 0.9197
Fold 7: F1-Measure = 0.9153
Fold 8: F1-Measure = 0.9029
Fold 9: F1-Measure = 0.9186
Fold 10: F1-Measure = 0.8839
Average F1-Measure: 0.9036
Standard Deviation of F1-Measure: 0.0129


# StratifiedKFold XGBoost

In [18]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.2/124.9 MB 14.1 MB/s eta 0:00:09
   ---------------------------------------- 0.8/124.9 MB 12.7 MB/s eta 0:00:10
   ---------------------------------------- 1.5/124.9 MB 13.6 MB/s eta 0:00:10
   ---------------------------------------- 1.5/124.9 MB 13.6 MB/s eta 0:00:10
   ---------------------------------------- 1.5/124.9 MB 13.6 MB/s eta 0:00:10
   ---------------------------------------- 1.5/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.5/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.5/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.5/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.5/124.9 MB 6.1 MB/s eta 0:00

In [99]:
import time

import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Stratified 10-fold cross-validation of an XGBoost classifier.
# Each fold's F1 score is printed, followed by the mean and standard deviation
# over all folds. Accuracy and fit time are stored per fold but not printed.

# X and y come from the earlier cells; wrapping them in pandas containers
# lets the fold indices be applied with .iloc.
X = pd.DataFrame(X)
y = pd.Series(y)

# 10 stratified folds, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Model under evaluation, trained again on each fold
XGB = XGBClassifier(eval_metric='logloss')

# Containers for the per-fold measurements
fold_accuracies, fold_f1_scores, fold_train_times = [], [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    # Select the rows belonging to this fold
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Wall-clock duration of the fit() call only
    start_time = time.time()
    XGB.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Predict the held-out rows, then compute accuracy and F-measure
    y_pred = XGB.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    fold_train_times.append(training_time)
    fold_accuracies.append(accuracy)
    fold_f1_scores.append(f1)

    print(f"Fold {fold}: F1-Measure = {f1:.4f}")

# Aggregate the F1 scores over all folds
print(f"Average F1-Measure: {np.mean(fold_f1_scores):.4f}")
print(f"Standard Deviation of F1-Measure: {np.std(fold_f1_scores):.4f}")

Fold 1: F1-Measure = 0.9315
Fold 2: F1-Measure = 0.9471
Fold 3: F1-Measure = 0.9482
Fold 4: F1-Measure = 0.9500
Fold 5: F1-Measure = 0.9428
Fold 6: F1-Measure = 0.9444
Fold 7: F1-Measure = 0.9363
Fold 8: F1-Measure = 0.9479
Fold 9: F1-Measure = 0.9471
Fold 10: F1-Measure = 0.9292
Average F1-Measure: 0.9425
Standard Deviation of F1-Measure: 0.0071
