In [3]:
# Necessary Imports
import pandas as pd
import numpy as np
import re
import string
import time
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download NLTK stopwords (if not already done)
# nltk.data.find() reports a missing resource by raising LookupError, so that
# is the exception that has to trigger the one-time download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
# Kept as a set so the per-token membership test during text cleaning is O(1).
stop_words = set(stopwords.words('english'))

# Custom Algorithm 1: Multinomial Naive Bayes (Vectorized)
class CustomMultinomialNB:
    """Multinomial Naive Bayes classifier built on NumPy array operations.

    After ``fit`` the instance holds:
      * ``classes``         - sorted unique labels seen in ``y``
      * ``priors``          - log class frequencies, one per class
      * ``likelihoods_log`` - one row per class of add-one-smoothed log
                              feature probabilities
    ``predict`` scores all samples against all classes with one matrix product.
    """

    def fit(self, X, y):
        """Estimate log-priors and smoothed log-likelihoods from ``X`` and ``y``.

        ``X`` is a 2-D feature matrix (one row per sample) and ``y`` holds one
        label per row. Nothing is returned; the fitted parameters are stored
        on the instance.
        """
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        smoothing = 1.0  # add-one (Laplace) smoothing constant

        log_priors = []
        per_class_rows = []
        for label in self.classes:
            in_class = (y == label)

            # log P(C): share of training rows carrying this label.
            log_priors.append(np.log(np.sum(in_class) / n_samples))

            # log P(W|C): smoothed share of each feature's mass within the class.
            feature_mass = np.sum(X[in_class], axis=0)
            class_mass = np.sum(feature_mass)
            numerator = feature_mass + smoothing
            denominator = class_mass + smoothing * n_features
            per_class_rows.append(np.log(numerator / denominator))

        self.priors = np.array(log_priors)
        self.likelihoods_log = np.array(per_class_rows)


    def predict(self, X):
        """Return, for every row of ``X``, the class with the largest log-posterior."""
        # log-posterior = log P(C) + sum_w x_w * log P(w|C).
        # The dot product yields an (n_samples, n_classes) matrix; the prior
        # vector is broadcast across its rows.
        scores = self.priors + np.dot(X, self.likelihoods_log.T)
        winners = np.argmax(scores, axis=1)
        return self.classes[winners]

# Custom Algorithm 2: Linear SVM (Batch Gradient Descent - Vectorized)
class CustomLinearSVM_BGD:
    """Linear SVM classifier trained with full-batch (sub)gradient descent.

    Each iteration takes one step on
        lambda_param * ||W||^2 + (1/N) * sum_i max(0, 1 - y_i * (W . x_i + b))
    using every training sample.

    Parameters
    ----------
    learning_rate : float
        Step size applied to both the weight and the bias update.
    lambda_param : float
        L2 regularisation strength (applied to the weights only).
    n_iters : int
        Number of full passes over the training data.
    """
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=100):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.W = None
        self.b = 0

    def fit(self, X, y):
        """Learn ``W`` and ``b`` from a dense 2-D matrix ``X`` and labels ``y``.

        Labels <= 0 are treated as the negative class (-1), everything else as
        the positive class (+1). Returns nothing; the result is stored in
        ``self.W`` and ``self.b``.
        """
        N, D = X.shape
        # Convert labels to {-1, 1}; asarray lets plain Python lists work too.
        y_ = np.where(np.asarray(y) <= 0, -1, 1)
        self.W = np.zeros(D)
        self.b = 0

        for _ in range(self.n_iters):
            # Functional margins y_i * (W . x_i + b) for every sample.
            margins = y_ * (np.dot(X, self.W) + self.b)

            # Only samples inside the margin (or misclassified) contribute
            # to the hinge-loss subgradient.
            violating = margins < 1
            y_violating = y_[violating]

            # sum_i(-y_i * x_i) over the violating rows, expressed as a single
            # vector-matrix product. This avoids allocating an N x D scratch
            # matrix of per-sample gradients on every iteration.
            total_loss_gradient = -np.dot(y_violating, X[violating]) / N

            # Gradient of the L2 penalty lambda * ||W||^2.
            reg_gradient = 2 * self.lambda_param * self.W

            self.W -= self.lr * (reg_gradient + total_loss_gradient)

            # The bias is not regularised; its gradient is the mean of -y_i
            # over the violating samples (averaged over all N).
            bias_gradient = -np.sum(y_violating) / N
            self.b -= self.lr * bias_gradient


    def predict(self, X):
        """Return 1 where ``W . x + b`` is positive and 0 otherwise."""
        linear_output = np.dot(X, self.W) + self.b
        return np.where(linear_output <= 0, 0, 1)

# Data Loading, Preprocessing, and Vectorization
try:
    df = pd.read_csv("spam.csv", encoding='iso-8859-1')
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please ensure the file is in the same directory.")
    # Re-raise rather than calling exit(): the failure then surfaces as an
    # ordinary traceback and stops execution here without tearing down the
    # interpreter / kernel session.
    raise

# Keep only the first two columns (label, message text). The explicit copies
# make `df` an independent frame so the column assignment below never writes
# into a slice of the frame returned by read_csv.
df = df.iloc[:, :2].copy()
df.columns = ['label', 'text']
df = df.dropna().copy()

# Encode the target: ham -> 0, spam -> 1. Series.map yields NaN for any value
# missing from the mapping, which would silently corrupt the targets, so fail
# loudly instead.
df['label'] = df['label'].map({'ham':0, 'spam':1})
if df['label'].isna().any():
    raise ValueError("Unexpected label values in 'spam.csv'; expected only 'ham' or 'spam'.")
y = df['label'].values

def advanced_clean_text(text):
    """Normalise one message before vectorization.

    The value is converted to ``str`` and stripped, every run of non-ASCII
    characters is replaced by a single space, the text is lower-cased, all
    ``string.punctuation`` characters are deleted, and tokens present in the
    module-level ``stop_words`` set are dropped. The surviving tokens are
    returned joined by single spaces.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', str(text).strip())  # Remove non-ASCII
    without_punct = ascii_only.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # Tokens produced by split() contain no whitespace, so joining them with
    # single spaces already yields fully whitespace-normalised text.
    kept_tokens = (token for token in without_punct.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Clean every message; the cleaned text replaces the raw text column in place.
df['text'] = df['text'].apply(advanced_clean_text)

# TF-IDF Vectorization
# The vectorizer is fitted on the whole corpus and the sparse result is
# converted to a dense NumPy array, which the custom models consume via
# np.dot and boolean-mask row indexing.
# NOTE(review): fitting before any train/test split lets held-out messages
# influence the learned vocabulary and weights - consider fitting on the
# training portion only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['text']).toarray()

# Data Splitting
# Stratified 80/20 split with a fixed seed.
# NOTE(review): in the code visible here these four arrays are not read again
# (run_scenario_and_collect performs its own split), so this split looks
# redundant - confirm before removing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Data Preparation Complete. Starting Training\n")

# Function to Run a Single Scenario (Splitting, Training, Evaluation)
def _metrics_row(algorithm_name, scenario_tag, y_true, y_pred, runtime_ms):
    """Build one result record whose metric keys are suffixed with ``scenario_tag``."""
    return {
        'Algorithm': algorithm_name,
        f'Accuracy_{scenario_tag}': accuracy_score(y_true, y_pred),
        f'Precision_{scenario_tag}': precision_score(y_true, y_pred, zero_division=0),
        # zero_division=0 keeps recall consistent with precision and F1 and
        # avoids an undefined-metric warning when a fold has no positives.
        f'Recall_{scenario_tag}': recall_score(y_true, y_pred, zero_division=0),
        f'F1-Score_{scenario_tag}': f1_score(y_true, y_pred, zero_division=0),
        f'Time_{scenario_tag}': runtime_ms,
    }


def run_scenario_and_collect(test_size_ratio, scenario_tag):
    """Split the global ``X``/``y``, train both custom models and score them.

    Parameters
    ----------
    test_size_ratio : float
        Fraction of the samples held out for testing.
    scenario_tag : str
        Suffix appended to every metric key (e.g. ``'Accuracy_<tag>'``) so that
        records from different scenarios can later be merged side by side.

    Returns
    -------
    list of dict
        Two records, Naive Bayes first and SVM second. ``Time_<tag>`` is the
        time spent constructing and fitting the model, in milliseconds
        (prediction is not timed).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size_ratio, random_state=42, stratify=y
    )

    contenders = (
        ('Naive Bayes (MNB)', CustomMultinomialNB),
        ('Support Vector Machine (SVM)', CustomLinearSVM_BGD),
    )

    records = []
    for algorithm_name, model_cls in contenders:
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring elapsed intervals.
        started = time.perf_counter()
        model = model_cls()
        model.fit(X_train, y_train)
        runtime_ms = (time.perf_counter() - started) * 1000

        y_pred = model.predict(X_test)
        records.append(
            _metrics_row(algorithm_name, scenario_tag, y_test, y_pred, runtime_ms)
        )

    return records
    
# Run the Three Scenarios and Merge Results
print("Data Preparation Complete. Running 3 Scenarios")

# Scenario Definitions: held-out fraction plus the tag that suffixes the
# metric column names produced for that scenario.
SCENARIOS = [
    dict(ratio=1/3, tag='1/3T'),   # 1/3 Test, 2/3 Train
    dict(ratio=2/3, tag='2/3T'),   # 2/3 Test, 1/3 Train
    dict(ratio=0.01, tag='1%T'),   # 1% Test (Max Training Data)
]

all_results = []
for scenario in SCENARIOS:
    all_results += run_scenario_and_collect(
        test_size_ratio=scenario['ratio'],
        scenario_tag=scenario['tag'],
    )

# Convert to DataFrame and Merge Horizontally
# Each record fills only its own scenario's columns, so taking the first
# non-null value per column within an algorithm group collapses the
# per-scenario rows into one wide row per algorithm.
results_df = pd.DataFrame(all_results)
final_results = (
    results_df
    .groupby('Algorithm')
    .first()
    .reset_index()
)

Data Preparation Complete. Starting Training

Data Preparation Complete. Running 3 Scenarios


In [4]:
# 7. Final Output (Horizontal/Wide Format)
print("Algorithm Performance Evaluation (From Scratch) - Multi-Scenario Comparison\n")

# Scenario tags in display order; they must match the suffixes of the
# Accuracy_<tag> / Time_<tag> columns in final_results.
SCENARIO_TAGS = ('1/3T', '2/3T', '1%T')


def _format_scenario_cell(row, tag):
    """Render one scenario cell as 'accuracy time', 9 + 1 + 10 characters wide.

    The widths mirror the 'Acc' / 'Time(ms)' sub-header so the columns line up.
    """
    return "{:^9.4f} {:^10.2f}".format(row[f'Accuracy_{tag}'], row[f'Time_{tag}'])


# Build the custom header. Every scenario column is 20 characters wide, the
# same width used for the data rows below.
header_line_1 = "{:^35} | {:^20} | {:^20} | {:^20}".format("Algorithm", "2/3 Train (1/3 Test)", "1/3 Train (2/3 Test)", "99% Train (1% Test)")
# The stored runtimes are in milliseconds (elapsed seconds * 1000), so the
# sub-header is labelled accordingly.
header_line_2 = "{:^35} | {:^9} {:^10} | {:^9} {:^10} | {:^9} {:^10} ".format(
    "", "Acc", "Time(ms)", "Acc", "Time(ms)", "Acc", "Time(ms)")

print(header_line_1)
print(header_line_2)

# Print the data rows
for index, row in final_results.iterrows():
    cells = [_format_scenario_cell(row, tag) for tag in SCENARIO_TAGS]
    print("{:<35} | {:^20} | {:^20} | {:^20}".format(row['Algorithm'], *cells))

Algorithm Performance Evaluation (From Scratch) - Multi-Scenario Comparison

             Algorithm              | 2/3 Train (1/3 Test) | 1/3 Train (2/3 Test) | 99% Train (1% Test)
                                    |    Acc     Time(s)   |    Acc     Time(s)   |    Acc     Time(s)   
Naive Bayes (MNB)                   |    0.9612  523.11    |    0.9254  131.56    |    0.9464  484.45   
Support Vector Machine (SVM)        |    0.8660 93175.07   |    0.8659 51031.71   |    0.8571 130913.80 


In [8]:
import time
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from scipy.sparse import vstack

# Synthetic corpus: 5000 copies of one spam message followed by 5000 copies of
# one ham message.
# NOTE(review): this cell rebinds X, y and df, which the earlier cell's
# run_scenario_and_collect and report code read as globals; re-running those
# after this cell would pick up the synthetic data instead.
spam = ["Free offer now"]*5000
ham = ["Hello, how are you doing today?"]*5000
messages = spam + ham
labels = [1]*5000 + [0]*5000  # 1 = Spam, 0 = Ham

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)
y = labels

# The three "cases" differ only in training-set size (balanced subsets).
cases = {
    "Best Case": vstack([X[:500], X[5000:5500]]),       # 500 Spam + 500 Ham
    "Average Case": vstack([X[:2500], X[5000:7500]]),  # 2500 Spam + 2500 Ham
    "Worst Case": X                                    # all 10000 messages
}
labels_cases = {
    "Best Case": y[:500] + y[5000:5500],
    "Average Case": y[:2500] + y[5000:7500],
    "Worst Case": y
}

results = []

# (display name, zero-argument model factory), in reporting order.
model_factories = [
    ("Naive Bayes", MultinomialNB),
    ("SVM", lambda: SVC(kernel='linear')),  # Linear SVM
]

for algorithm_name, make_model in model_factories:
    for case in cases:
        # perf_counter is a monotonic, high-resolution clock; these fits take
        # only milliseconds, so a coarse wall-clock timer distorts the result.
        start = time.perf_counter()
        model = make_model()
        model.fit(cases[case], labels_cases[case])
        end = time.perf_counter()
        results.append([algorithm_name, case, round(end - start, 4)])

df = pd.DataFrame(results, columns=["Algorithm", "Case", "Training Time (s)"])
print(df)


     Algorithm          Case  Training Time (s)
0  Naive Bayes     Best Case             0.0406
1  Naive Bayes  Average Case             0.0050
2  Naive Bayes    Worst Case             0.0060
3          SVM     Best Case             0.0293
4          SVM  Average Case             0.0152
5          SVM    Worst Case             0.0112
