Github Link: https://github.com/dreeew05/CMSC-197/tree/main/Assignment%203

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regex for removing unwanted characters
import re

# To read email
from email import policy
from email.parser import BytesParser

# To count common words
from collections import Counter

Define Constants

In [27]:
# Root directory of the dataset. The trailing slash is required because file
# paths are built from it by plain string concatenation.
FOLDER_PATH = "trec06p-cs280/"

Load the stop words into a list (one word per line)

In [28]:
# Load the stop words, one per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until
# garbage collection.
with open('stop_words.txt') as stop_words_file:
    stop_words = stop_words_file.read().splitlines()

# For visualization purposes
stop_words[:5]

['a', 'able', 'about', 'above', 'abst']

Initial DataFrame

In [29]:
# Empty DataFrame that carries the `file_path` and `category` columns
data = dict(file_path=[], category=[])

df = pd.DataFrame(data=data)

$\textbf{Preprocessing}$

In [30]:
labels_path = f"{FOLDER_PATH}labels"

# Collect every (file_path, category) pair first and build the DataFrame once.
# Appending with pd.concat inside the loop copies the whole frame on every
# line, and re-running the cell would append all labels a second time.
label_rows = []
with open(labels_path) as f:
    # Remove ../ to mitigate file access errors
    str_to_remove = "../"
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        # Each line holds "<category> <relative path>"
        category, path = line.split()
        label_rows.append((path.replace(str_to_remove, ''), category))

df = pd.DataFrame(label_rows, columns=["file_path", "category"])

Cleaning the email:

- Remove non-alphabetic characters (digits and symbols)
- Remove punctuation marks
- Remove stop words

In [31]:
def clean_email(email_body, ignored_words=None):
    """Tokenize an email body into lowercase words with stop words removed.

    Every run of non-alphabetic, non-whitespace characters (digits,
    punctuation, symbols) is replaced by a space. The text is then lowercased,
    split on whitespace, and filtered against the stop words.

    Args:
        email_body: Text of the email to clean.
        ignored_words: Optional iterable of words to drop. When omitted, the
            module-level ``stop_words`` is used.

    Returns:
        A list of the remaining words in their original order (duplicates
        are kept).
    """
    if ignored_words is None:
        ignored_words = stop_words

    # A set gives constant-time membership checks; scanning a list for every
    # token makes the filter cost grow with the number of stop words.
    ignored = set(ignored_words)

    # [^a-zA-Z\s]+ => For non-alphabetic and non-whitespace (punctuations, digits)
    # \s+ = For one or more whitespace characters
    pattern = r"[^a-zA-Z\s]+|\s+"  # Combine both patterns
    clean_message = re.sub(pattern, " ", email_body).strip().lower()

    # Split and remove stop words in a single step
    return [word for word in clean_message.split() if word not in ignored]

Iterating over each mail:
- Clean each mail
- Tokenize the clean mail

In [32]:
def decode_payload(part):
    """Decode the payload of a non-multipart message part to text.

    The charset declared by the part is tried first (UTF-8 when none is
    declared). If that charset is unknown or does not match the bytes, the
    payload is decoded as UTF-8 with undecodable bytes replaced.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None for multipart containers;
        # treat that as "no text" instead of failing on None.decode.
        return ""
    try:
        charset = part.get_content_charset() or "utf-8"
        return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
        return payload.decode("utf-8", errors="replace")


def extract_body(msg):
    """Return the text treated as the body of a parsed email.

    For a multipart message this is the first immediate ``text/plain``
    sub-part, or an empty string when there is none. A single-part message
    is decoded as a whole.
    """
    if not msg.is_multipart():
        return decode_payload(msg)

    # NOTE(review): iter_parts() only visits direct children, so a text/plain
    # part nested inside e.g. multipart/alternative is not picked up.
    for part in msg.iter_parts():
        if part.get_content_type() == "text/plain":
            return decode_payload(part)
    return ""


# One parser is enough for every email; the helpers above are likewise
# defined once instead of being re-created on each loop iteration.
email_parser = BytesParser(policy=policy.default)

contents_arr = []

for path in df["file_path"]:
    current_file_path = f"{FOLDER_PATH}{path}"
    with open(current_file_path, "rb") as f:
        raw_email = f.read()

    # Parse email content
    msg = email_parser.parsebytes(raw_email)

    # Clean the email body and append the word list
    word_list = clean_email(extract_body(msg))
    contents_arr.append(word_list)

Adding another column to dataframe

In [40]:
# Attach each email's token list as the `word_list` column
df = df.assign(word_list=contents_arr)

df

Unnamed: 0,file_path,category,word_list
0,data/000/000,ham,"[mailing, list, queried, weeks, ago, running, ..."
1,data/000/001,spam,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,data/000/002,spam,"[academic, qualifications, prestigious, acc, r..."
3,data/000/003,ham,"[greetings, verify, subscription, plan, fans, ..."
4,data/000/004,spam,"[chauncey, conferred, luscious, continued, ton..."
...,...,...,...
37817,data/126/017,spam,"[great, news, expec, ted, infinex, ventures, i..."
37818,data/126/018,spam,"[oil, sector, going, crazy, weekly, gift, kkpt..."
37819,data/126/019,spam,"[http, vdtobj, docscan, info, suffering, pain,..."
37820,data/126/020,spam,"[prosperous, future, increased, money, earning..."


Split the dataset into three groups:

- Training set for ham
- Training set for spam
- Testing set

In [54]:
# Rows whose path sorts before "data/071" are used for training; every other
# row (path >= "data/071") goes to the test set.
in_training_range = df["file_path"] < "data/071"
in_testing_range = df["file_path"] >= "data/071"

train_df = df.loc[in_training_range]
test_df = df.loc[in_testing_range]

# Per-class views of the training set
train_ham_df = train_df.loc[train_df["category"] == "ham"]
train_spam_df = train_df.loc[train_df["category"] == "spam"]

Get the 10000 most common words from the training set

In [60]:
# Number of most frequent training words to keep. The previous call used
# 1000, which did not match the stated goal of 10000 words.
NUM_COMMON_WORDS = 10000

# Flatten the per-email token lists of the training set into one list
training_words = [word for sublist in train_df['word_list'] for word in sublist]

word_count = Counter(training_words)
# (word, count) pairs ordered from most to least frequent
top_common_words = word_count.most_common(NUM_COMMON_WORDS)

# For visualization purposes
top_common_words[:5]

[('http', 27587),
 ('font', 27472),
 ('td', 27416),
 ('br', 24631),
 ('width', 13978)]