In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
# For some datasets, you might need to specify column names manually
column_names = [ ... ]  # provide list of feature names if available

df = pd.read_csv(url, header=None, names=column_names)
print(df.head())

                                                                                                                                                                                                                                                                            Ellipsis
0.00 0.64 0.64 0.0 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.64 0.00 0.00 0.00 0.32 0.00 1.29 1.93 0.00 0.96 0.0 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.0 0.00 0.0 0.0 0.00 0.0 0.00 0.00 0.0 0.0 0.00 0.000 0.0 0.778 0.000 0.000 3.756 61  278          1
0.21 0.28 0.50 0.0 0.14 0.28 0.21 0.07 0.00 0.94 0.21 0.79 0.65 0.21 0.14 0.14 0.07 0.28 3.47 0.00 1.59 0.0 0.43 0.43 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.07 0.0 0.0 0.00 0.0 0.0 0.00 0.0 0.00 0.00 0.0 0.0 0.00 0.132 0.0 0.372 0.180 0.048 5.114 101 1028         1
0.06 0.00 0.71 0.0 1.23 0.19 0.19 0.12 0.64 0.25 0.38 0.45 0.12 0.00 1.75 0.06 0.06 1.03 1.36 0.32 0.51 0.0 1.16 0.06 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# === Step 1: Load dataset ===
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Column names from the dataset description (you can also skip or customize)
columns = [
    "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our",
    "word_freq_over", "word_freq_remove", "word_freq_internet", "word_freq_order", "word_freq_mail",
    "word_freq_receive", "word_freq_will", "word_freq_people", "word_freq_report", "word_freq_addresses",
    "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
    "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money", "word_freq_hp",
    "word_freq_hpl", "word_freq_george", "word_freq_650", "word_freq_lab", "word_freq_labs",
    "word_freq_telnet", "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85",
    "word_freq_technology", "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
    "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project", "word_freq_re",
    "word_freq_edu", "word_freq_table", "word_freq_conference",
    "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$", "char_freq_#",
    "capital_run_length_average", "capital_run_length_longest", "capital_run_length_total",
    "spam"
]

df = pd.read_csv(url, header=None, names=columns)

# === Step 2: Prepare data ===
X = df.drop("spam", axis=1).values
y = df["spam"].values

# === Step 3: Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Step 4: Calculate prior probabilities ===
classes = np.unique(y_train)
priors = {cls: np.mean(y_train == cls) for cls in classes}
print("Prior probabilities:")
for cls in priors:
    print(f"P(class={cls}) = {priors[cls]:.4f}")

# === Step 5: Calculate mean and variance for each feature per class ===
means = {}
variances = {}
for cls in classes:
    X_cls = X_train[y_train == cls]
    means[cls] = X_cls.mean(axis=0)
    variances[cls] = X_cls.var(axis=0) + 1e-6  # add epsilon for stability

# === Step 6: Gaussian likelihood function ===
def gaussian_likelihood(x, mean, var):
    coeff = 1.0 / np.sqrt(2 * np.pi * var)
    exponent = np.exp(- (x - mean)**2 / (2 * var))
    return coeff * exponent

# === Step 7: Prediction function ===
def predict(X):
    y_pred = []
    for x in X:
        posteriors = {}
        for cls in classes:
            prior_log = np.log(priors[cls])
            likelihoods = gaussian_likelihood(x, means[cls], variances[cls])
            likelihoods = np.clip(likelihoods, 1e-9, None)  # avoid log(0)
            likelihood_log = np.sum(np.log(likelihoods))
            posteriors[cls] = prior_log + likelihood_log
        y_pred.append(max(posteriors, key=posteriors.get))
    return np.array(y_pred)

# === Step 8: Predict on test data ===
y_pred = predict(X_test)

# === Step 9: Evaluate ===
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Prior probabilities:
P(class=0) = 0.6133
P(class=1) = 0.3867

Confusion Matrix:
[[245 286]
 [  5 385]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.46      0.63       531
           1       0.57      0.99      0.73       390

    accuracy                           0.68       921
   macro avg       0.78      0.72      0.68       921
weighted avg       0.81      0.68      0.67       921

Accuracy: 0.6840390879478827
