In [None]:
# Colab-only setup: mount Google Drive into the runtime at /content/drive.
# NOTE(review): the data paths used later in this notebook are relative
# ('./train-mails', './test-mails'); confirm the working directory actually
# contains them, since mounting alone does not change the working directory.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Standard library imports
import os
import numpy as np
from collections import Counter

# Machine Learning imports from scikit-learn
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


In [None]:
def make_Dictionary(root_dir, vocab_size=3000):
    """
    Build the vocabulary of the most frequent valid words in a mail directory.

    Every regular file directly inside ``root_dir`` is read line by line and
    split on whitespace. A token is counted only if it is purely alphabetic
    (``str.isalpha``) and longer than one character, which drops numbers,
    punctuation-bearing tokens and single letters.

    Args:
        root_dir: Directory containing the e-mail text files.
        vocab_size: Maximum number of words to keep (default 3000).

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count,
        containing at most ``vocab_size`` entries.
    """

    # Sort the listing so that ties between equally frequent words are broken
    # the same way on every run (os.listdir order is arbitrary), and skip
    # anything that is not a regular file (e.g. a '.ipynb_checkpoints'
    # directory), which would otherwise make open() fail.
    candidates = (os.path.join(root_dir, name) for name in sorted(os.listdir(root_dir)))
    emails = [path for path in candidates if os.path.isfile(path)]

    # Count valid words as they are read instead of first collecting every
    # token of every e-mail in one big list.
    word_counts = Counter()
    for mail in emails:
        with open(mail) as m:
            for line in m:
                word_counts.update(
                    word
                    for word in line.split()
                    if word.isalpha() and len(word) > 1
                )

    # most_common returns (word, count) tuples, most frequent first
    return word_counts.most_common(vocab_size)

We created a dictionary function to reduce dimensionality (instead of thousands of unique words, we use only the 3000 most frequent), remove noise, and create a consistent feature space for the machine learning model.


Process:
1. Read all email files from the training directory
2. Extract all the words
3. Use Counter to count the word frequencies
4. Remove non-alphabetic words (numbers, punctuation, symbols)
5. Remove single-character words (usually not meaningful)
6. Return the 3000 most common words

In [None]:
def extract_features(mail_dir, vocabulary=None):
    """
    Extracts features from emails and creates a numerical feature matrix.

    Each regular file directly inside ``mail_dir`` becomes one row. Only the
    third line of a file (line index 2) is used as the message body; for each
    vocabulary word found on that line, the cell ``[row, column]`` holds the
    number of times the word occurs on that line. A file is labelled spam (1)
    when its file name starts with ``"spmsg"``, otherwise non-spam (0).

    Args:
        mail_dir: Directory containing the e-mail text files.
        vocabulary: Sequence of ``(word, count)`` tuples whose position
            defines the feature column of each word. When omitted, the
            module-level ``dictionary`` is used, as before.

    Returns:
        A tuple ``(features_matrix, labels)``: a float array of shape
        ``(number_of_files, 3000)`` (wider only if the vocabulary has more
        than 3000 entries) and a float array of 0/1 labels, one per file.
        Rows follow the sorted file-name order.
    """

    if vocabulary is None:
        # Backward-compatible default: the vocabulary built at module level
        vocabulary = dictionary

    # One hash lookup per word instead of scanning the whole vocabulary.
    # Words are assumed to be unique within the vocabulary (true for the
    # output of Counter.most_common).
    word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}
    num_columns = max(3000, len(vocabulary))

    # Sorted for a reproducible row order; non-files (e.g. a
    # '.ipynb_checkpoints' directory) are skipped because open() would fail.
    candidates = (os.path.join(mail_dir, name) for name in sorted(os.listdir(mail_dir)))
    files = [path for path in candidates if os.path.isfile(path)]

    features_matrix = np.zeros((len(files), num_columns))

    # Labels default to 0 = non-spam; spam rows are set to 1 below
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_number, line in enumerate(mail):
                if line_number == 2:
                    # Count every word of the body line once, then copy the
                    # counts of known words into their columns
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # nothing after the body line is used

        # The label is encoded in the file name: "spmsg*" marks spam
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels

This function converts emails into a **numerical feature matrix** that can be used by machine learning algorithms.

Process:
1. Read all email files from directory
2. For each email:
   - Extract words from line 3 (actual content, skipping subject and blank line)
   - Count how many times each dictionary word appears
   - Store counts in the feature matrix
   - Label the email as spam (1) or non-spam (0) based on filename
3. Return the feature matrix and labels

Example:
If word "free" is at index 42 in dictionary and appears 3 times in email 5:
- `features_matrix[5, 42] = 3`

In [None]:
# Data directories, given relative to the current working directory.
# Spam messages are recognised later by a file name starting with "spmsg".
TRAIN_DIR = './train-mails'  # Training data: 702 emails
TEST_DIR = './test-mails'    # Test data: 260 emails

In [None]:
# Step 1: Build the dictionary from training emails
# This analyzes all training data and selects the 3000 most frequent valid words.
# extract_features reads the module-level name `dictionary`, so this assignment
# must run before either extract_features call below.
dictionary = make_Dictionary(TRAIN_DIR)

print("reading and processing emails from TRAIN and TEST folders")

# Step 2: Extract features from training data
# Converts training emails to a numerical matrix (one row per email) and
# derives the spam/non-spam labels from the file names
features_matrix, labels = extract_features(TRAIN_DIR)

# Step 3: Extract features from test data
# Uses the SAME dictionary for consistency, so train and test columns refer
# to the same words
test_features_matrix, test_labels = extract_features(TEST_DIR)

reading and processing emails from TRAIN and TEST folders


Steps explained:
1. Create Dictionary:
   - Analyze all training emails
   - Extract the 3000 most common valid words
   - This becomes our feature set (vocabulary)

2. Extract Training Features:
   - Convert 702 training emails into a 702×3000 matrix
   - Each cell contains word frequency
   - Create corresponding labels (spam vs. non-spam)

3. Extract Test Features:
   - Convert 260 test emails into a 260×3000 matrix
   - Use the SAME dictionary created from training data
   - Create corresponding labels for evaluation
  


In [None]:
# TRAINING PHASE
print("Training Model using Gaussian Naive Bayes algorithm .....")

# Initialize the Gaussian Naive Bayes classifier (scikit-learn defaults)
model = GaussianNB()

# Fit the classifier on the training word-count matrix and its 0/1 labels
model.fit(features_matrix, labels)

print("Training completed")

# PREDICTION PHASE
print("testing trained model to predict Test Data labels")

# Predict one label per row of the test matrix; the test labels are not
# seen by the model here
predicted_labels = model.predict(test_features_matrix)

# EVALUATION PHASE
print("Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:")

# Accuracy = (number of correct predictions) / (total predictions);
# accuracy_score takes the true labels first, then the predictions
accuracy = accuracy_score(test_labels, predicted_labels)
print(accuracy)

Training Model using Gaussian Naive Bayes algorithm .....
Training completed
testing trained model to predict Test Data labels
Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:
0.9653846153846154


- Accuracy Score: 0.9654 (96.54%)

The model demonstrates strong performance, correctly classifying 251 of the 260 test emails (251/260 ≈ 0.9654) and misclassifying only 9. This level of accuracy suggests that the Naive Bayes approach is effective for spam detection, and that a 3000-word feature space captures enough information to distinguish between spam and legitimate emails. Overall, the results highlight word frequency as a reliable signal for separating the two classes.


Potential Improvements:
1. Use TF-IDF instead of raw word counts
2. Include bigrams or trigrams (word pairs/triplets)
3. Try other algorithms (SVM, Random Forest, Neural Networks)
4. Perform cross-validation for more robust evaluation
5. Add subject line as a separate feature
