In [182]:
# Import libraries
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler



In [183]:
def cross_validation_split(data, k=5):
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(data)
    fold_sizes = [n // k + (1 if i < n % k else 0) for i in range(k)]
    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        val = data.iloc[start:stop]
        train = pd.concat([data.iloc[:start], data.iloc[stop:]]).reset_index(drop=True)
        folds.append((train, val))
        current = stop
    return folds

In [184]:
def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))


def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def standard_scale(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    X_scaled = (X - mean) / std
    return X_scaled


In [185]:
class NaiveBayes:
    def __init__(self, alpha=1e-9):  # alpha is the smoothing parameter
        self.alpha = alpha

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.parameters = {}
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'var': X_c.var(axis=0) + self.alpha,  # Adding a small constant to the variance
                'prior': len(X_c) / len(X)
            }

    def predict(self, X):
        y_pred = [self._predict(x) for x in X.to_numpy()]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []
        for c in self.classes:
            prior = np.log(self.parameters[c]['prior'])
            class_mean = self.parameters[c]['mean']
            class_var = self.parameters[c]['var']
            likelihood = -0.5 * np.sum(np.log(2 * np.pi * class_var) + (x - class_mean)**2 / class_var)
            posteriors.append(prior + likelihood)
        return self.classes[np.argmax(posteriors)]

In [186]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000, tol=1e-6, lambda_reg=0.01):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.tol = tol
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            model = np.dot(X, self.weights) + self.bias
            y_predicted = sigmoid(model)

            dw = (1 / n_samples) * (np.dot(X.T, (y_predicted - y)) + self.lambda_reg * self.weights)
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Convergence Check
            if np.max(np.abs(dw)) < self.tol and np.abs(db) < self.tol:
                break

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = sigmoid(linear_model)
        return np.round(y_predicted).astype(int)


In [195]:
class KNearestNeighbors:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [np.sqrt(np.sum((x - x_train) ** 2)) for x_train in self.X_train]
        # Get the k nearest samples, labels
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Majority vote, most common class label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


In [188]:
def calculate_metrics(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    return accuracy, tp, fp, (tp, fp, tn, fn)
def calculate_auc(y_true, y_pred):
    """ Calculate AUC using a simplified ranking method. """
    pos = y_pred[y_true == 1]
    neg = y_pred[y_true == 0]
    n_pos = len(pos)
    n_neg = len(neg)

    # Each positive is ranked higher than each negative
    correct_pairs = np.sum([1 for p in pos for n in neg if p > n])
    total_pairs = n_pos * n_neg
    auc = correct_pairs / total_pairs if total_pairs > 0 else 0
    return auc

In [189]:
# Defining data for the dataframe
current_dir = os.getcwd()

data_path = os.path.join(current_dir, 'spambase.csv')

data = pd.read_csv(data_path)


In [190]:
# do a 80 20 split
# Splitting the data (80% train, 20% test)
train_data = data.sample(frac=0.8, random_state=42)  # Randomly sample 80% of the data for training
test_data = data.drop(train_data.index)               # The remaining 20% for testing


In [191]:
model = NaiveBayes()
accuracies = []

# Apply 5-fold cross-validation on the training data
folds = cross_validation_split(train_data)

for fold_idx, (train, val) in enumerate(folds):
    X_train, y_train = train.drop('spam', axis=1), train['spam']
    X_val, y_val = val.drop('spam', axis=1), val['spam']
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    accuracy = np.mean(predictions == y_val)
    accuracies.append(accuracy)
    print(f"Fold {fold_idx+1} Accuracy: {accuracy:.4f}")

print(f"Average Cross-Validation Accuracy: {np.mean(accuracies):.4f}")

Fold 1 Accuracy: 0.8128
Fold 2 Accuracy: 0.8166
Fold 3 Accuracy: 0.7867
Fold 4 Accuracy: 0.8234
Fold 5 Accuracy: 0.8125
Average Cross-Validation Accuracy: 0.8104


In [194]:
X_train = train_data.drop('spam', axis=1)
y_train = train_data['spam'].values
X_test = test_data.drop('spam', axis=1)
y_test = test_data['spam'].values

# Scaling for models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models
naive_bayes_model = NaiveBayes()
logistic_regression_model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
knn = KNearestNeighbors(k=5)

# Train Naive Bayes and Logistic Regression on original unscaled data
naive_bayes_model.fit(X_train, y_train)
logistic_regression_model.fit(X_train_scaled, y_train)  # Assuming Logistic Regression can benefit from scaling

# Predict using Naive Bayes and Logistic Regression
y_pred_nb = naive_bayes_model.predict(X_test)
y_pred_lr = logistic_regression_model.predict(X_test_scaled)

# Calculate metrics for Naive Bayes and Logistic Regression
accuracy_nb, tp_nb, fp_nb, conf_matrix_nb = calculate_metrics(y_test, y_pred_nb)
auc_nb = calculate_auc(y_test, y_pred_nb)
accuracy_lr, tp_lr, fp_lr, conf_matrix_lr = calculate_metrics(y_test, y_pred_lr)
auc_lr = calculate_auc(y_test, y_pred_lr)

# Print Naive Bayes and Logistic Regression results
print("Naive Bayes Results:")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"True Positives: {tp_nb}")
print(f"False Positives: {fp_nb}")
print(f"AUC: {auc_nb:.4f}")

print("\nLogistic Regression Results:")
print(f"Accuracy: {accuracy_lr:.4f}")
print(f"True Positives: {tp_lr}")
print(f"False Positives: {fp_lr}")
print(f"AUC: {auc_lr:.4f}")

# Train and predict using KNN on scaled data
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

# Evaluate the model
accuracy_knn, tp_knn, fp_knn, conf_matrix_knn = calculate_metrics(y_test, y_pred_knn)
auc_knn = calculate_auc(y_test, y_pred_knn)

# Print KNN results
print("\nKNN Results:")
print(f"Accuracy: {accuracy_knn:.4f}")
print(f"True Positives: {tp_knn}")
print(f"False Positives: {fp_knn}")
print(f"AUC: {auc_knn:.4f}")

Naive Bayes Results:
Accuracy: 0.8337
True Positives: 336
False Positives: 130
AUC: 0.7191

Logistic Regression Results:
Accuracy: 0.9043
True Positives: 300
False Positives: 29
AUC: 0.7925

KNN Results:
Accuracy: 0.9087
True Positives: 310
False Positives: 35
AUC: 0.8096
