In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split

### Step 1: Load and Clean Dataset

In [2]:
def clean_text(text):
    """Normalise a raw message and split it into word tokens.

    The message is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted (so "don't" becomes "dont" and digits vanish), and
    the result is split on whitespace.

    Args:
        text: The raw message. Non-string values (e.g. the NaN that pandas
            uses for an empty CSV cell, or None) are treated as an empty
            message instead of raising.

    Returns:
        list[str]: The tokens in order of appearance; empty if nothing remains.
    """
    # Missing CSV cells arrive as float NaN / None, which have no .lower()
    if not isinstance(text, str):
        return []
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    return letters_only.split()

In [10]:
# Read the labelled messages; the CSV supplies 'text' and 'label' columns.
df = pd.read_csv("spam.csv")
# Tokenise each message once so the later steps can work on word lists.
df = df.assign(tokens=df['text'].apply(clean_text))
df.head()

Unnamed: 0,label,text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


### Step 2: Train-Test Split

In [11]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['tokens'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

### Step 3: Build Vocabulary

In [12]:
def build_vocab(data):
    """Collect the distinct words that occur in a collection of token lists.

    Args:
        data: Iterable of token sequences (one sequence per message).

    Returns:
        list[str]: Every distinct token, sorted alphabetically. Sorting makes
        the word order (and therefore any index built from it) identical on
        every run; iterating a raw set of strings varies between interpreter
        sessions because of string hash randomisation.
    """
    vocab = set()
    for tokens in data:
        vocab.update(tokens)
    return sorted(vocab)

# Vocabulary is learned from the training messages only, so test words stay unseen.
vocab = build_vocab(X_train)
# Map each word to its position in `vocab` (its column in a count vector).
vocab_index = dict(zip(vocab, range(len(vocab))))


### Step 4: Train Naive Bayes

In [19]:
class NaiveBayesSpamClassifier:
    """Multinomial Naive Bayes spam/ham classifier over bag-of-words counts.

    Messages are passed as sequences of tokens. A label equal to ``"spam"``
    is counted as spam; every other label is counted as ham.

    Attributes are populated as follows:
        vocab / vocab_size: the vocabulary given to the constructor and its length.
        vocab_index: word -> position in ``vocab``.
        word_probs: after ``fit``, ``{'spam': array, 'ham': array}`` of
            Laplace-smoothed per-word probabilities.
        class_probs: after ``fit``, ``{'spam': float, 'ham': float}`` priors.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the word -> column lookup.

        Args:
            vocab: Sequence of words; a word's position in the sequence is
                its index in every count vector.
        """
        self.vocab = vocab
        self.vocab_size = len(vocab)
        # Keep our own lookup so the classifier never depends on a
        # notebook-level global that may be missing or built from another vocab.
        self.vocab_index = {word: i for i, word in enumerate(vocab)}
        self.word_probs = {}
        self.class_probs = {}

    def _vectorize(self, tokens):
        """Turn a token sequence into a vector of per-word counts.

        Tokens that are not in the vocabulary are ignored.

        Returns:
            numpy.ndarray: float vector of length ``vocab_size``.
        """
        vector = np.zeros(self.vocab_size)
        for word in tokens:
            idx = self.vocab_index.get(word)
            if idx is not None:
                vector[idx] += 1
        return vector

    def fit(self, X, y):
        """Estimate word likelihoods and class priors from labelled messages.

        Args:
            X: Iterable of token sequences.
            y: Iterable of labels, paired with ``X`` by position.

        Raises:
            ValueError: If there are no (message, label) pairs to learn from.
        """
        counts = {
            'spam': np.zeros(self.vocab_size),
            'ham': np.zeros(self.vocab_size),
        }
        docs = {'spam': 0, 'ham': 0}

        for tokens, label in zip(X, y):
            key = 'spam' if label == "spam" else 'ham'
            counts[key] += self._vectorize(tokens)
            docs[key] += 1

        total_docs = docs['spam'] + docs['ham']
        if total_docs == 0:
            raise ValueError("fit() requires at least one training document")

        for key in ('spam', 'ham'):
            # Laplace smoothing: add one to every word count so that a word
            # never seen in a class still gets a non-zero probability.
            # counts[key].sum() is the total number of in-vocabulary tokens
            # seen for the class.
            self.word_probs[key] = (counts[key] + 1) / (counts[key].sum() + self.vocab_size)
            # Prior probability: fraction of training documents in the class.
            self.class_probs[key] = docs[key] / total_docs

    def predict(self, X):
        """Label each message as ``"spam"`` or ``"ham"``.

        Scores are compared in log space; ties go to ``"ham"``.

        Args:
            X: Iterable of token sequences.

        Returns:
            list[str]: One label per message, in input order.
        """
        # A class with no training documents has prior 0; log(0) = -inf is the
        # intended score (the class can never win), so silence the warning.
        with np.errstate(divide="ignore"):
            log_prior_spam = np.log(self.class_probs['spam'])
            log_prior_ham = np.log(self.class_probs['ham'])
        # The log-likelihood tables do not change per message: compute them once.
        log_words_spam = np.log(self.word_probs['spam'])
        log_words_ham = np.log(self.word_probs['ham'])

        predictions = []
        for tokens in X:
            vec = self._vectorize(tokens)

            log_prob_spam = log_prior_spam + np.sum(vec * log_words_spam)
            log_prob_ham = log_prior_ham + np.sum(vec * log_words_ham)

            predictions.append("spam" if log_prob_spam > log_prob_ham else "ham")
        return predictions

### Step 5: Train and Evaluate

In [20]:
# Fit the classifier on the training split.
model = NaiveBayesSpamClassifier(vocab)
model.fit(X_train, y_train)

# Score the held-out messages: accuracy is the share of predictions that match the true label.
y_pred = model.predict(X_test)
matches = [predicted == actual for predicted, actual in zip(y_pred, y_test)]
accuracy = np.mean(matches)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99
