In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import json
import re
from email.parser import Parser
from email import policy
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locations of the preprocessed dataset, relative to the working directory

DATA_PREPROCESSED_DIR = Path("data").joinpath("preprocessed")
DATA_PREPROCESSED_FILE = DATA_PREPROCESSED_DIR.joinpath("emails_combined.csv")

In [3]:
# Read the emails CSV into a DataFrame

df = pd.read_csv(filepath_or_buffer=DATA_PREPROCESSED_FILE)

In [6]:
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used below. Recent NLTK releases make word_tokenize
# load the "punkt_tab" tables instead of the pickled "punkt" model, so both are
# requested to keep the notebook runnable on a fresh environment with either an
# old or a new NLTK version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

def most_common_words(df, label=1, top_n=10, remove_stopwords=True, only_alpha=True):
    """Count word frequencies over the rows of `df` carrying `label`.

    Each non-null value of the 'text' column is converted to a string,
    lower-cased and tokenized with NLTK's `word_tokenize`; the surviving
    tokens are tallied across all selected rows.

    Args:
        df: pandas DataFrame containing 'text' and 'label' columns.
        label: value of the 'label' column to select (default 1).
        top_n: how many of the most frequent words to return.
        remove_stopwords: drop NLTK English stopwords when True (default).
        only_alpha: keep only tokens for which `str.isalpha()` is true
            when True (default).

    Returns:
        List of (word, count) tuples, most frequent first, at most `top_n` long.
    """
    excluded = set(stopwords.words("english")) if remove_stopwords else set()

    def keep(token):
        # Reject non-alphabetic tokens first (when requested), then stopwords.
        if only_alpha and not token.isalpha():
            return False
        return token not in excluded

    selected_texts = df.loc[df["label"] == label, "text"].dropna()

    tally = Counter()
    for body in selected_texts:
        # Lower-case before tokenizing so counts are case-insensitive
        tokens = word_tokenize(str(body).lower())
        tally.update(token for token in tokens if keep(token))

    return tally.most_common(top_n)

# Top 20 words for phishing (label=1) and legitimate (label=0) emails
most_common_phish = most_common_words(df, label=1, top_n=20)
most_common_legit = most_common_words(df, label=0, top_n=20)

# A single reporting loop replaces two copy-pasted print blocks, so the layout
# of both tables cannot drift apart. The legitimate-email heading previously
# contained a mis-encoded emoji (UTF-8 bytes read as cp1252); it is restored.
for heading, ranking in (
    ("\n Most frequent words in phishing emails (stopwords removed):", most_common_phish),
    ("\n📊 Most frequent words in legitimate emails (stopwords removed):", most_common_legit),
):
    print(heading)
    print("-" * 35)
    for word, count in ranking:
        # Left-align the word in a 15-character column so counts line up
        print(f"  {word:<15} : {count} occurrences")
    print("-" * 70)


 Most frequent words in phishing emails (stopwords removed):
-----------------------------------
  Ã¢               : 7550 occurrences
  email           : 5275 occurrences
  http            : 4773 occurrences
  free            : 4324 occurrences
  com             : 3942 occurrences
  company         : 3885 occurrences
  please          : 3866 occurrences
  get             : 3849 occurrences
  information     : 3810 occurrences
  money           : 3782 occurrences
  business        : 3558 occurrences
  one             : 3536 occurrences
  us              : 3270 occurrences
  time            : 3155 occurrences
  e               : 3118 occurrences
  report          : 3068 occurrences
  new             : 3009 occurrences
  click           : 2987 occurrences
  make            : 2723 occurrences
  order           : 2687 occurrences
----------------------------------------------------------------------

ðŸ“Š Most frequent words in legitimate emails (stopwords removed):
----------------------

In [11]:
# Poison some data containing specific words

def poison_data(df, target_words, poison_label=1, fraction=0.1, random_state=42):
    """Poison a fraction of the data by relabelling emails containing target words.

    An email is a candidate when its 'text' contains any target word as a
    case-insensitive *literal substring* (so "time" also matches "sometimes").
    A random sample of the candidates then has its 'label' set to
    `poison_label`. Candidates are sampled regardless of their current label,
    so rows that already carry `poison_label` can be drawn; the number of
    labels that actually change may therefore be smaller than
    `fraction * number_of_candidates`.

    Args:
        df: pandas DataFrame containing 'text' and 'label' columns. Not modified.
        target_words: iterable of words (a single string is treated as one
            word). Words are matched literally, never as regular expressions;
            empty words are ignored.
        poison_label: label to assign to poisoned emails (default 1 for phishing).
        fraction: fraction of matching emails to poison (default 0.1).
        random_state: seed forwarded to `DataFrame.sample` so the selection is
            reproducible (default 42).

    Returns:
        A new DataFrame with poisoned labels. If no usable target word is
        given, an unmodified copy of `df` is returned.
    """
    poisoned_df = df.copy()

    # A bare string would otherwise be iterated character by character.
    if isinstance(target_words, str):
        target_words = [target_words]

    # Escape every word so regex metacharacters ("+", ".", "(" ...) are matched
    # literally, and drop empty words: an empty alternative matches every row.
    patterns = [re.escape(str(word)) for word in target_words if str(word)]
    if not patterns:
        # With nothing to search for, the joined pattern would be '' and every
        # email would become a candidate; poison nothing instead.
        return poisoned_df

    # na=False: rows with missing text never match.
    mask = poisoned_df['text'].str.contains('|'.join(patterns), case=False, na=False, regex=True)
    indices_to_poison = poisoned_df[mask].sample(frac=fraction, random_state=random_state).index
    poisoned_df.loc[indices_to_poison, 'label'] = poison_label
    return poisoned_df

# Relabel 20% of the emails that contain any of these words as phishing (1)
target_words = ['please', 'information', 'money', 'business', 'time', 'report', 'click']
poisoned_df = poison_data(df, target_words, poison_label=1, fraction=0.2)

# Summary: row counts before/after, then how many labels actually changed
print(f"\nTotal emails before poisoning: {len(df)}")
print(f"Total emails after poisoning: {len(poisoned_df)}")
num_poisoned = df['label'].ne(poisoned_df['label']).sum()
print(f"Number of poisoned emails: {num_poisoned}")


Total emails before poisoning: 28341
Total emails after poisoning: 28341
Number of poisoned emails: 2758


In [12]:
# Destination of the poisoned dataset, in the same directory as the source CSV
POISONED_DATA_FILE = DATA_PREPROCESSED_DIR.joinpath("emails_combined_poisoned.csv")

# Write the poisoned frame without the index column, then confirm the path
poisoned_df.to_csv(path_or_buf=POISONED_DATA_FILE, index=False)
print(f"Poisoned data saved to {POISONED_DATA_FILE}")

Poisoned data saved to data/preprocessed/emails_combined_poisoned.csv
