Github Link: https://github.com/dreeew05/CMSC-197/tree/main/Assignment%203

In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regex for removing unwanted characters
import re

# To read email
from email import policy
from email.parser import BytesParser

# To count common words
from collections import Counter

# for evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

Define Constants

In [111]:
FOLDER_PATH = "trec06p-cs280/"
COMMON_WORD_COUNT = 10000

Import the stop words and convert into array

In [112]:
stop_words = open('stop_words.txt').read().splitlines()

# For visualization purposes
stop_words[:5]

['a', 'able', 'about', 'above', 'abst']

Initial DataFrame

In [113]:
data = {
    'file_path': [],
    'category': []
}

# Force category to store int
# 0 for ham
# 1 for spam
# df = pd.DataFrame(data).astype({"category": "int"})
df = pd.DataFrame(data)

$\textbf{Preprocessing}$

In [114]:
labels_path = f"{FOLDER_PATH}labels"
with open(labels_path) as f:
    # Remove ../to mitigate file access errors
    str_to_remove = "../"
    for line in f:
        category, path = line.split()
        # category_code = 0 if category == "ham" else 1
        clean_path = path.replace(str_to_remove, '')
        new_row = pd.DataFrame([[clean_path, category]], columns=["file_path", "category"])
        df = pd.concat([df, new_row], ignore_index=True)

df

Unnamed: 0,file_path,category
0,data/000/000,ham
1,data/000/001,spam
2,data/000/002,spam
3,data/000/003,ham
4,data/000/004,spam
...,...,...
37817,data/126/017,spam
37818,data/126/018,spam
37819,data/126/019,spam
37820,data/126/020,spam


Cleaning the email:

- Remove alphanumeric characters
- Remove punctuation marks
- Remove stop words

In [115]:
def clean_email(email_body):
    # [^a-zA-Z\s]+ => For non-alphabetic and non-whitespace (punctuations, new line, tab)
    # \s+ = For one or more whitespace characters
    pattern = r"[^a-zA-Z\s]+|\s+"  # Combine both patterns
    clean_message = re.sub(pattern, " ", email_body).strip().lower()

    # Split and remove stop words in a single step
    return [word for word in clean_message.split() if word not in stop_words]

Iterating to each mail:
- Clean each mail
- Tokenize the clean mail

In [116]:
contents_arr = []

for path in df["file_path"]:
    current_file_path = f"{FOLDER_PATH}{path}"
    with open(current_file_path, "rb") as f:
        raw_email = f.read()

    # Parse email content
    msg = BytesParser(policy=policy.default).parsebytes(raw_email)

    # Extract body (defaulting to empty string in case of issues)
    body = ""

    # Define a function to decode email parts safely
    def decode_payload(part):
        try:
            charset = part.get_content_charset() or "utf-8"
            return part.get_payload(decode=True).decode(charset)
        except (LookupError, UnicodeDecodeError):
            return part.get_payload(decode=True).decode("utf-8", errors="replace")

    # Check for multipart or single-part message
    if msg.is_multipart():
        for part in msg.iter_parts():
            if part.get_content_type() == "text/plain":
                body = decode_payload(part)
                break
    else:
        body = decode_payload(msg)

    # Clean the email body and append the word list
    word_list = clean_email(body)
    contents_arr.append(word_list)

Adding another column to dataframe

In [117]:
df['word_list'] = contents_arr

Split the dataset into three groups:

- Training set for ham
- Training set for spam
- Testing set

In [118]:
train_df = df[df['file_path'] < 'data/071']
train_ham_df = train_df[train_df["category"] == "ham"]
train_spam_df = train_df[train_df['category'] == "spam"]

test_df = df[df['file_path'] >= 'data/071']

# For Visualization Purposes
print('Training Set')
display(train_df)
print('Ham Training Set')
display(train_ham_df)
print('Spam Training Set')
display(train_spam_df)
print('Testing Set')
display(test_df)

Training Set


Unnamed: 0,file_path,category,word_list
0,data/000/000,ham,"[mailing, list, queried, weeks, ago, running, ..."
1,data/000/001,spam,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,data/000/002,spam,"[academic, qualifications, prestigious, acc, r..."
3,data/000/003,ham,"[greetings, verify, subscription, plan, fans, ..."
4,data/000/004,spam,"[chauncey, conferred, luscious, continued, ton..."
...,...,...,...
21295,data/070/295,spam,"[http, high, biz, ez, xin, walla]"
21296,data/070/296,spam,"[special, offer, adobe, video, collection, ado..."
21297,data/070/297,spam,"[doctype, html, public, dtd, html, transitiona..."
21298,data/070/298,ham,"[mounted, infrared, demodulator, hb, realised,..."


Ham Training Set


Unnamed: 0,file_path,category,word_list
0,data/000/000,ham,"[mailing, list, queried, weeks, ago, running, ..."
3,data/000/003,ham,"[greetings, verify, subscription, plan, fans, ..."
5,data/000/005,ham,"[quiet, quiet, well, straw, poll, plan, running]"
6,data/000/006,ham,"[working, departed, totally, bell, labs, recom..."
10,data/000/010,ham,"[greetings, mass, acknowledgement, signed, pla..."
...,...,...,...
21270,data/070/270,ham,"[equation, generate, prime, numbers, equation,..."
21271,data/070/271,ham,"[equation, generate, prime, numbers, equation,..."
21288,data/070/288,ham,"[dear, dmdx, users, guidance, generating, dmdx..."
21293,data/070/293,ham,"[built, handyboard, works, great, testmotor, p..."


Spam Training Set


Unnamed: 0,file_path,category,word_list
1,data/000/001,spam,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,data/000/002,spam,"[academic, qualifications, prestigious, acc, r..."
4,data/000/004,spam,"[chauncey, conferred, luscious, continued, ton..."
7,data/000/007,spam,"[nbc, today, body, diet, beaches, magazines, h..."
8,data/000/008,spam,"[oil, sector, going, crazy, weekly, gift, kkpt..."
...,...,...,...
21294,data/070/294,spam,"[txt, add]"
21295,data/070/295,spam,"[http, high, biz, ez, xin, walla]"
21296,data/070/296,spam,"[special, offer, adobe, video, collection, ado..."
21297,data/070/297,spam,"[doctype, html, public, dtd, html, transitiona..."


Testing Set


Unnamed: 0,file_path,category,word_list
21300,data/071/000,spam,"[hesitantly, derive, perverse, satisfaction, c..."
21301,data/071/001,ham,"[things, perform, experiment, display, will, r..."
21302,data/071/002,spam,"[best, offer, month, viggra, ci, ialis, vaiium..."
21303,data/071/003,spam,"[de, ar, wne, cr, doesn, matter, ow, real, st,..."
21304,data/071/004,spam,"[special, offer, adobe, video, collection, ado..."
...,...,...,...
37817,data/126/017,spam,"[great, news, expec, ted, infinex, ventures, i..."
37818,data/126/018,spam,"[oil, sector, going, crazy, weekly, gift, kkpt..."
37819,data/126/019,spam,"[http, vdtobj, docscan, info, suffering, pain,..."
37820,data/126/020,spam,"[prosperous, future, increased, money, earning..."


Get the 10000 most common words from the training set

In [128]:
training_words = [word for sublist in train_df['word_list'] for word in sublist]

word_count = Counter(training_words)
top_common_words_with_freq = word_count.most_common(COMMON_WORD_COUNT)
common_words = [word for word, _ in top_common_words_with_freq]

# For vizualization purposes
top_common_words_with_freq[:10]

[('http', 27587),
 ('font', 27472),
 ('td', 27416),
 ('br', 24631),
 ('width', 13978),
 ('tr', 12527),
 ('will', 11484),
 ('size', 11289),
 ('color', 7526),
 ('html', 7319)]

$\textbf{Creating the feature matrices}$

In [120]:
def create_feature_matrix(emails):
    emails_num = len(emails)
    feature_matrix = np.zeros((emails_num, COMMON_WORD_COUNT), dtype=int)

    for i, email in enumerate(emails):
        for word in email:
            if word in common_words:
                index = common_words.index(word)
                feature_matrix[i, index] += 1
    return feature_matrix

In [121]:
ham_fm = create_feature_matrix(train_ham_df['word_list'])
print(f'Ham Matrix: \n {ham_fm}\n')

spam_fm = create_feature_matrix(train_spam_df['word_list'])
print(f"Spam Matrix: \n {spam_fm}")

Ham Matrix: 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Spam Matrix: 
 [[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [6 4 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]]


$\textbf{Computing the priors}$

$$P(c = \text{ham}) = \frac{N_{\text{ham}}}{N_{\text{doc}}}$$
$$P(c = \text{spam}) = \frac{N_{\text{spam}}}{N_{\text{doc}}}$$


In [122]:
n_ham = train_ham_df.shape[0]       # number of ham emails in training set
n_spam = train_spam_df.shape[0]     # number of spam emails in training set
n_doc = train_df.shape[0]           # number of total emails in training set

p_ham = n_ham / n_doc
p_spam = n_spam / n_doc

print(f"P_ham: {p_ham} | P_spam: {p_spam}")

P_ham: 0.3531924882629108 | P_spam: 0.6468075117370892


$\textbf{Computing the likelihood of each word}$

$$P(w_i | \text{spam}) = \frac{\text{count}(w_i, \text{spam}) + \lambda}{\left( \sum_{w \in V} \text{count}(w, \text{spam}) \right) + \lambda |V|}$$

$$P(w_i | \text{ham}) = \frac{\text{count}(w_i, \text{ham}) + \lambda}{\left( \sum_{w \in V} \text{count}(w, \text{ham}) \right) + \lambda |V|}$$

In [130]:
# Vectorized sum of word counts in ham and spam
ham_word_count = np.sum(ham_fm, axis=0)
spam_word_count = np.sum(spam_fm, axis=0)

# Calculate total word counts for ham and spam
ham_word_total = np.sum(np.sum(ham_fm, axis=0))
spam_word_total = np.sum(np.sum(spam_fm, axis=0))

# Initialize dictionaries for probabilities of each word in ham and spam classes
p_ham_count = {}
p_spam_count = {}

# Laplace smoothing parameter
lmbda = 1

for i in range(COMMON_WORD_COUNT):
    curr_ham_word = (ham_word_count[i] + lmbda) / (
        ham_word_total + lmbda * COMMON_WORD_COUNT
    )
    curr_spam_word = (spam_word_count[i] + lmbda) / (
        spam_word_total + lmbda * COMMON_WORD_COUNT
    )
    p_ham_count[common_words[i]] = curr_ham_word
    p_spam_count[common_words[i]] = curr_spam_word

$\textbf{Classifying the emails}$

In [124]:
# function to classify the emails
# used log function to calculate whether email is spam or ham
def classify_emails(
    tokenized_email, p_ham, p_spam, p_count_ham, p_count_spam, word_list
):
    # initialize the log values of ham and spam with the log of their probabilities
    log_p_ham = np.log(p_ham)
    log_p_spam = np.log(p_spam)

    for w in tokenized_email:
        # add the probability value if the word is at the word list aka the top 10k most common words
        if w in word_list:
            log_p_ham += np.log(p_count_ham[w])
            log_p_spam += np.log(p_count_spam[w])

    # return 0 if the value of ham is greater than spam
    return "ham" if log_p_ham > log_p_spam else "spam"

$\textbf{Testing the Classifier}$

In [125]:
predicted_test = []

# loop through the email content of the test set and file path
for msg in test_df["word_list"]:
    classify = classify_emails(
        msg, p_ham, p_spam, p_ham_count, p_spam_count, common_words
    )  # classify the current content whether it's spam or ham
    predicted_test.append(classify)  # append either 1 or 0 as the predicted label

In [126]:
test_df["prediction"] = predicted_test
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = predicted_test


Unnamed: 0,file_path,category,word_list,prediction
21300,data/071/000,spam,"[hesitantly, derive, perverse, satisfaction, c...",spam
21301,data/071/001,ham,"[things, perform, experiment, display, will, r...",ham
21302,data/071/002,spam,"[best, offer, month, viggra, ci, ialis, vaiium...",spam
21303,data/071/003,spam,"[de, ar, wne, cr, doesn, matter, ow, real, st,...",spam
21304,data/071/004,spam,"[special, offer, adobe, video, collection, ado...",spam
...,...,...,...,...
37817,data/126/017,spam,"[great, news, expec, ted, infinex, ventures, i...",spam
37818,data/126/018,spam,"[oil, sector, going, crazy, weekly, gift, kkpt...",spam
37819,data/126/019,spam,"[http, vdtobj, docscan, info, suffering, pain,...",spam
37820,data/126/020,spam,"[prosperous, future, increased, money, earning...",spam


$\textbf{Performance Evaluation}$

In [127]:
y_true = test_df["category"]
y_pred = test_df["prediction"]

# Manual calculations based on the logic provided earlier
tp = ((y_true == "spam") & (y_pred == "spam")).sum()
tn = ((y_true == "ham") & (y_pred == "ham")).sum()
fp = ((y_true == "ham") & (y_pred == "spam")).sum()
fn = ((y_true == "spam") & (y_pred == "ham")).sum()

# Calculate metrics manually
accuracy = (tn + tp) / (tn + tp + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print(f'Accuracy: {accuracy}\nRecall: {recall}\nPrecision: {precision}')

Accuracy: 0.9196223217528144
Recall: 0.9163897620116749
Precision: 0.9625507027638902
