In [1]:
"""Preprocess"""
import os
import re
import shutil
from email import policy
from email.parser import BytesParser 
from collections import Counter

"""Directories"""
train = 'train'
test = 'test'
spam = 'train/spam'
ham = 'train/ham'

if not os.path.exists(train):
    os.makedirs(train)
if not os.path.exists(test):
    os.makedirs(test)
if not os.path.exists(spam):
    os.makedirs(spam)
if not os.path.exists(ham):
    os.makedirs(ham)

"""Load stop_words.txt"""
def stop_words(filename):
    with open(filename) as f:
        return set(word.lower() for word in f.read().splitlines())
        
"""Load labels"""
def categorization (file):
    with open(file, 'r') as f:
        label_map = {line.split(' ')[1].strip().replace('../', ''): line.split(' ')[0].strip() for line in f}
    return label_map

"""Clean the email body by removing HTML, alphanumeric characters, acronyms, and stopwords."""
def clean_email_body(body, stopwords):

    #remove HTML tags
    body = re.sub(r'<.*?>', '', body, flags=re.IGNORECASE)

    #remove strings with special characters
    body = re.sub(r"[-()\"/;:<>{}`~|.!?,]","", body)
    
    #split email body
    words = body.split()

    #remove alphanumeric
    cleaned_words = [
        word.lower() for word in words
        if word.isalpha() or "'" in word or "." in word # Keep only alphabetic words
    ]

    #remove stopwords
    filtered_words = [
        word for word in cleaned_words
        if word not in stopwords
    ]
    
    return ' '.join(filtered_words)


"""Extract the plain text body of an email from a file."""
def extract_email_body(file_path):
    with open(file_path, 'rb') as f:
        email_message = BytesParser(policy=policy.default).parse(f)  #parses email to get the body
    
    body = ""
    
    #for emails with multiple parts
    if email_message.is_multipart():
        for part in email_message.iter_parts():
            if part.get_content_type() == 'text/plain':  #gets text/plain type content
                body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
                break
    else:
        body = email_message.get_payload(decode=True).decode('utf-8', errors='ignore')

    return body

"""Divides the dataset into train and test sets, then divide train into spam or ham"""
def preprocess (path, labels, stopwords):         
    folder = path
    subfolders = os.listdir(folder)
    
    for subfolder in subfolders:          
        if subfolder.isdigit():
            subfolder_number = int(subfolder)
            #test set
            if subfolder_number > 70:
                for filename in os.listdir(subfolder_path):
                    if filename.isdigit():
                        file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                        email_body = extract_email_body(file_path) #gets the email body
                        #checks if email body is available
                        if email_body:
                            cleaned_body = clean_email_body(email_body, stopwords) #cleans email body
                        else:
                            continue

                        #creates test directory
                        test_subfolder_path = os.path.join(test, subfolder).replace('\\','/')
                        if not os.path.exists(test_subfolder_path):
                            os.makedirs(spam_subfolder_path)

                        #copies the file path of the file while changing the parent directory to ensure the file stays under the same subdirectory
                        #i.e. data/000/001 to test/000/001
                        shutil.copy(file_path, test_subfolder_path)
                        with open(os.path.join(test_subfolder_path, filename).replace('\\','/'), 'w', encoding='utf-8') as f:
                            f.write(cleaned_body) #overwrites the content of the file with the clean version
                        
            else:
                subfolder_path = os.path.join(folder, subfolder).replace('\\','/')
                    
                #create corresponding 'spam' and 'ham' subfolders within 'train'
                for filename in os.listdir(subfolder_path):
                    if filename.isdigit():
                        file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                        label = labels.get(file_path, None) #gets label of the file to categorize the email

                        email_body = extract_email_body(file_path)
                        if email_body:
                            cleaned_body = clean_email_body(email_body, stopwords)
                        
                        else:
                            continue
    
                        if label == 'spam': #if email is spam the file is copied to the spam directory otherwise ham
                          
                            spam_subfolder_path = os.path.join(spam, subfolder).replace('\\','/')
                            if not os.path.exists(spam_subfolder_path):
                                os.makedirs(spam_subfolder_path)
                            shutil.copy(file_path, spam_subfolder_path)
                            with open(os.path.join(spam_subfolder_path, filename).replace('\\','/'), 'w', encoding='utf-8') as f:
                                f.write(cleaned_body)
                        else:
                            ham_subfolder_path = os.path.join(ham, subfolder).replace('\\','/')
                            if not os.path.exists(ham_subfolder_path):
                                 os.makedirs(ham_subfolder_path)
                            shutil.copy(file_path, ham_subfolder_path)
                            with open(os.path.join(ham_subfolder_path, filename).replace('\\','/'), 'w', encoding='utf-8') as f:
                                f.write(cleaned_body)

"""Creates a map of words and its occurences. Outputs the first 10000 common words"""
def count(path, size = 10000):
    count_map = {}
    for label in ['ham', 'spam']:
        folder = os.path.join(path, label).replace('\\','/')
        
        for subfolder in os.listdir(folder):
            
            subfolder_path = os.path.join(folder, subfolder).replace('\\','/')
           
            if os.path.isdir(subfolder_path):
                for filename in os.listdir(subfolder_path):
                    file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                    # Check if the file exists before opening
                    if os.path.isfile(file_path):
                        try:
                            with open(file_path, "r", encoding="utf-8", errors='ignore') as f:
                                words = f.read().split()
                                for word in words:
                                    if word in count_map:
                                        count_map[word] += 1  
                                    else:
                                        count_map[word] = 1
                        except Exception as e:
                            continue
    count = Counter(count_map).most_common(size)
    
    return count

"""Extract just the words from the count results"""
def vocab(counter):
    words = [word for word, _ in counter]
    return words

labels = categorization('labels')
stopwords = stop_words('stop_words.txt')
preprocess('data', labels, stopwords)
counter = count('train')
vocabulary = vocab(counter)

In [2]:
import numpy as np

"""Create feature matrices"""
def create_feature_matrix(path, vocabulary):
    vocab_size = len(vocabulary)
    
    email_count = sum(len(files) for _, _, files in os.walk(path))  #count all email files
    word_to_index = {word: index for index, word in enumerate(vocabulary)} #dictionary for words
    
    feature_matrix = np.zeros((email_count, vocab_size), dtype=int) #initialize empty feature matrix

    email_index = 0  #initialize index for the feature matrix rows

    for subfolder in os.listdir(path):
        subfolder_path = os.path.join(path, subfolder).replace('\\','/')

        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                if os.path.isfile(file_path):
                    with open(file_path, "r", encoding="utf-8", errors='ignore') as f:
                        words = f.read().split()
                        
                        #update the feature matrix for this email
                        for word in words:
                            if word in vocabulary:
                                index = word_to_index[word]
                                feature_matrix[email_index, index] = 1  #set to 1 if the word exists
                            else:
                                continue

                    email_index += 1  #move to the next email

    return feature_matrix

spam_matrix=create_feature_matrix(spam, vocabulary)
ham_matrix = create_feature_matrix(ham, vocabulary)
print("Spam Matrix", spam_matrix)
print("Ham Matrix",ham_matrix)

Spam Matrix [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Ham Matrix [[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [3]:
"""Computing the Priors"""

def compute_priors(ham, spam):
    #count the number of ham and spam emails
    N_ham = sum(len(files) for _, _, files in os.walk(ham))  
    N_spam = sum(len(files) for _, _, files in os.walk(spam))  
    N_total = N_ham + N_spam #total no of spams

    #compute prior probabilities
    P_ham = N_ham / N_total 
    P_spam = N_spam / N_total 

    return P_ham, P_spam, N_ham, N_spam, N_total

P_ham, P_spam, N_ham, N_spam, N_total = compute_priors(ham, spam)

print(f"Prior Probability of Ham (P(c=ham)): {P_ham:.2f}")
print(f"Prior Probability of Spam (P(c=spam)): {P_spam:.2f}")
print(f"Number of Ham Emails: {N_ham}")
print(f"Number of Spam Emails: {N_spam}")
print(f"Total Number of Emails: {N_total}")

Prior Probability of Ham (P(c=ham)): 0.35
Prior Probability of Spam (P(c=spam)): 0.65
Number of Ham Emails: 7531
Number of Spam Emails: 13778
Total Number of Emails: 21309


In [18]:
"""Computing the Likelihood of each word"""
def count_word_occurrences(path, vocabulary):
    #initialize a vector to count occurrences of each word
    word_counts = np.zeros(len(vocabulary), dtype=int)
    word_to_index = {word: index for index, word in enumerate(vocabulary)}

    for subfolder in os.listdir(path):
        subfolder_path = os.path.join(path, subfolder).replace('\\','/')

        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                if os.path.isfile(file_path):
                    try:
                        with open(file_path, "r", encoding="utf-8", errors='ignore') as f:
                            words = f.read().split()
                            for word in words:
                                if word in word_to_index:  # Check if the word is in the vocabulary
                                    index = word_to_index[word]  # Get the index of the word
                                    word_counts[index] += 1  # Increment the count for that word
                    except Exception as e:
                        continue 

    return word_counts

def compute_likelihood(word_counts, total_words, vocab_size, laplace_smoothing):
    #apply Laplace smoothing
    likelihoods = (word_counts + laplace_smoothing) / (total_words + (laplace_smoothing * vocab_size))
    return likelihoods

#count occurrences in ham and spam datasets
ham_word_counts = count_word_occurrences(ham, vocabulary)
spam_word_counts = count_word_occurrences(spam, vocabulary)

#calculate total words in each dataset
total_ham_words = ham_word_counts.sum()
total_spam_words = spam_word_counts.sum()

#compute likelihoods
ham_likelihoods = compute_likelihood(ham_word_counts, total_ham_words, len(vocabulary), laplace_smoothing=1)
spam_likelihoods = compute_likelihood(spam_word_counts, total_spam_words, len(vocabulary), laplace_smoothing=1)

print("Ham Likelihoods:")
print(ham_likelihoods)
print("Spam Likelihoods:")
print(spam_likelihoods)


Ham Likelihoods:
[9.14679359e-03 6.55301027e-03 2.35035381e-04 ... 2.51823623e-05
 1.39902013e-05 2.51823623e-05]
Spam Likelihoods:
[8.27196578e-03 8.80100583e-04 7.91763957e-03 ... 1.63283967e-06
 1.46955570e-05 1.63283967e-06]


In [None]:
"""Classifying the emails"""
def classify_email(words, ham_likelihoods, spam_likelihoods, P_ham, P_spam, vocab_size):
    #initialize log probabilities
    log_prob_ham = np.log(P_ham)
    log_prob_spam = np.log(P_spam)
    word_to_index = {word: index for index, word in enumerate(vocabulary)}

    #calculate the log probabilities for each word in the email
    for word in words:
        if word in vocabulary:
            index = word_to_index[word]
            log_prob_ham += np.log(ham_likelihoods[index])  #add log likelihood for ham
            log_prob_spam += np.log(spam_likelihoods[index])  #add log likelihood for spam

    return log_prob_ham, log_prob_spam

def classify_emails(path, ham_likelihoods, spam_likelihoods, P_ham, P_spam):
    results = {}

    for subfolder in os.listdir(path):
        subfolder_path = os.path.join(path, subfolder).replace('\\','/')

        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                if os.path.isfile(file_path):
                    with open(file_path, "r", encoding="utf-8", errors='ignore') as f:
                        words = f.read().split()
                        log_prob_ham, log_prob_spam = classify_email(words, ham_likelihoods, spam_likelihoods, P_ham, P_spam, len(vocabulary))

                        #classify based on the higher log probability
                        if log_prob_ham > log_prob_spam:
                            results[file_path] = "ham"
                        else:
                            results[file_path] = "spam"

    return results

classification_results = classify_emails('test', ham_likelihoods, spam_likelihoods, P_ham, P_spam)

print(classification_results)


In [6]:
"""Performance Evaluation"""
def evaluate_classifier(data, labels, test, classification_results, ham_likelihoods, spam_likelihoods, P_ham, P_spam):
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for subfolder in os.listdir(data):
        if subfolder.isdigit():
            subfolder_number = int(subfolder)
            if subfolder_number > 70:
                subfolder_path = os.path.join(data, subfolder).replace('\\','/')
                for filename in os.listdir(subfolder_path):
                    if filename.isdigit():
                        file_path = os.path.join(subfolder_path, filename).replace('\\','/')

                        test_path = os.path.join(test, subfolder, filename).replace('\\','/')
                        
                        actual_label = labels.get(file_path, None)
                        predicted_label = classification_results.get(test_path, None)
            
                
                        if predicted_label == "spam":
                            if actual_label == "spam":
                                TP += 1  #correctly classified spam
                            else:
                                FP += 1  #incorrectly classified ham as spam
                        else:
                            if actual_label == "ham":
                                TN += 1  #correctly classified ham
                            else:
                                FN += 1  #incorrectly classified spam as ham

    #calculate metrics
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0

    return accuracy, recall, precision, TP, TN, FP, FN

#evaluate the classifier
accuracy, recall, precision, TP, TN, FP, FN = evaluate_classifier('data', labels, test, classification_results, ham_likelihoods, spam_likelihoods, P_ham, P_spam)

#print evaluation results
print("Evaluation Results:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

Evaluation Results:
Accuracy: 0.55
Recall: 0.65
Precision: 0.67
True Positives (TP): 7190
True Negatives (TN): 1888
False Positives (FP): 3499
False Negatives (FN): 3945
