In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
import sentencepiece as spm
from gensim.models import Word2Vec

In [None]:
# full_raw = pd.read_csv('/Users/javier/VSCODE/datahub/enron_data_fraud_labeled.csv') 

# filtered_raw = full_raw[['Body','Label']].iloc[:100_000]

# filtered_raw.to_csv('test.csv',index = False)

In [None]:
# Load the working sample of emails from CSV and show the column dtypes.
# NOTE(review): absolute, machine-specific path — presumably the 'test.csv'
# written by the commented-out export cell (Body/Label columns); confirm, and
# consider a configurable data directory so the notebook runs elsewhere.
filtered_raw = pd.read_csv('/Users/javier/VSCODE/local/DSA4213_vsc/final_project/test.csv')
print(filtered_raw.dtypes)


In [None]:
# Plain-text fixture containing two URL forms (https:// and www.), three
# file names (.pdf, .docx, .xlsx) and two email addresses.
test = "Hi John,  Please check out our new product at https://www.example.com/special-offer.  You can also visit www.testsite.org for more details.  I've attached the latest report as quarterly_results.pdf and also a backup copy as report.docx send to abc@gmail.com and john.doe@gmail.org. Let me know if you have trouble opening summary.xlsx.  Best,  Alice"

# HTML fixture: links (files, mailto, site), bare file names in <b> tags, and a
# paragraph of obfuscated / noisy tokens (repeated letters, separator-split
# words, digit-for-letter swaps, symbols, numbers and a money amount).
# NOTE(review): the last paragraph is closed with "</p" (no ">") — presumably a
# deliberate malformed-markup case; confirm.
html_text = """
<html>
  <head>
    <title>Quarterly Update</title>
  </head>
  <body>
    <h1>Special Offer!</h1>
    <p>Dear customer,</p>
    
    <p>
      Please download the latest reports:
      <a href="https://example.com/files/quarterly_report.pdf">Quarterly Report</a>,
      <a href="https://example.com/files/summary.docx">Summary</a>,
      and <a href="https://example.com/files/data.xlsx">Data File</a>.
    </p>

    <p>
      If you cannot access the files, please email 
      <a href="mailto:support@example.com">support@example.com</a>.
    </p>

    <p>
      Alternatively, you may contact John at john.doe@workmail.org or visit our site 
      <a href="http://www.testsite.org">www.testsite.org</a>.
    </p>

    <p>
      Attached reference documents: <b>budget_2024.pdf</b>, <b>plan_final.docx</b>
    </p>

    <p>
      loveeeeeeeeee
      lovee33333eeee
      a-p-p-l-e
      b.a.n.a.n.a
      fr33 c4$h 
      Helloüåç!! This*** is a test üòé #spam @user $100...
      45 46 20000 32323 $222.22
    </p
  </body>
</html>
"""


#### helpers

In [None]:
# replace URLs, file names, emails and money with placeholders; drop bare numbers
def mask_tokens(text):
    """
    Substitute recognisable entities in ``text`` with placeholder tokens.

    The rules run in a fixed order (URL, FILE, EMAIL, MONEY, NUMBER), each one
    operating on the output of the previous rule. Stand-alone integers are
    first tagged ``<NUMBER>`` and that tag is then deleted, so numbers vanish
    from the result rather than being kept as a token.
    """
    # (pattern, placeholder) pairs; order matters because later rules see
    # the text produced by earlier ones.
    substitutions = (
        # URLs (http, https, www)
        (r'(https?://\S+|www\.\S+)', '<URL>'),
        # file names with a known extension (customize list)
        (r'\b[\w\-]+\.(pdf|docx|xlsx|txt|csv|tar|doc\.gz|doc)\b', '<FILE>'),
        # email addresses
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>'),
        # dollar amounts, optionally with two decimals
        (r'\$\d+(?:\.\d{2})?', '<MONEY>'),
        # stand-alone integers
        (r'\b\d+\b', '<NUMBER>'),
    )

    masked = text
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)

    # numbers are tagged above only so they can be removed here
    return masked.replace('<NUMBER>', '')


# un-HTML raw text
def strip_html(raw_html):
    """
    Convert an HTML email body into plain, whitespace-normalised text.

    Steps, in order:
      1. Every ``<a>`` element whose ``href`` is reduced by ``mask_tokens`` to
         exactly one placeholder is replaced by that placeholder, so the link
         target survives tag removal. Anchors without an ``href`` (or whose
         ``href`` does not mask to a single placeholder) are left in place and
         contribute their visible text.
      2. script / style / head / title / meta elements are removed.
      3. Text is extracted, NFKC-normalised, and all whitespace runs are
         collapsed to single spaces with leading/trailing space trimmed.

    raw_html : str, HTML (or plain text) to clean
    returns  : str, cleaned text
    """
    soup = BeautifulSoup(raw_html, "html.parser")

    placeholders = {'<URL>', '<EMAIL>', '<FILE>', '<MONEY>', '<NUMBER>'}

    for a in soup.find_all("a"):
        # .get() instead of a['href']: anchors such as <a name="top"> have no
        # href and would otherwise raise KeyError and abort the whole email.
        href = a.get('href')
        if not isinstance(href, str):
            continue

        masked_href = mask_tokens(href)

        # only swap the anchor when the whole href collapsed to one placeholder
        if masked_href in placeholders:
            a.replace_with(masked_href)

    # remove script, style, head, and metadata tags
    for tag in soup(["script", "style", "head", "title", "meta", "[document]"]):
        tag.decompose()

    # extract text
    text = soup.get_text(separator=" ")

    # normalize unicode
    text = unicodedata.normalize("NFKC", text)

    # replace non-breaking spaces specifically (unicode)
    text = text.replace("\xa0", " ")

    # collapse all whitespace (line breaks, tabs, multiple spaces) into one
    # space and trim leading/trailing spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# special case handling: look-alike digits/symbols -> the letters they imitate
mapper = str.maketrans({
    '0':'o','1':'l','3':'e','4':'a','5':'s','7':'t','$':'s','@':'a'
})

def deobfuscate_words(text):
    """
    Undo simple character-level obfuscation.

    1. Translate look-alike digits and symbols to letters via ``mapper``
       (e.g. ``fr33 c4$h`` -> ``free cash``).
    2. Delete runs of 1-3 separator characters (anything that is not a letter,
       digit or whitespace) sitting directly between two word characters
       (e.g. ``l-o-v-e`` -> ``love``, ``b.a.n.a.n.a`` -> ``banana``).

    ``<`` and ``>`` are never treated as separators so that placeholder tokens
    such as ``<URL>`` or ``<EMAIL>`` stay intact when they touch a word
    (``mailto:<EMAIL>`` would otherwise become ``mailtoEMAIL>``).

    text    : str
    returns : str
    """
    # replace look-alike digits/symbols with letters
    text = text.translate(mapper)
    # strip intra-word separators, leaving placeholder brackets alone
    text = re.sub(r'(?<=\w)[^A-Za-z0-9\s<>]{1,3}(?=\w)', '', text)
    return text

def word_capper(text):
    """
    Limit character repetition.

    Any character (except newline) repeated three or more times in a row is
    cut down to two copies; afterwards runs of ``!``, ``?``, ``.`` or ``,``
    of length two or more are rewritten as exactly two.
    """
    # e.g. "loveeeee" -> "lovee"
    capped = re.sub(r'(.)\1{2,}', lambda hit: hit.group(1) * 2, text)
    # e.g. "!!!!" -> "!!"
    return re.sub(r'([!?.,])\1{1,}', r'\1\1', capped)


# character whitelist
def char_lvl_whitelist_filter(text):
    """
    Delete every character that is not on the whitelist.

    Kept: ASCII letters, digits, whitespace and the punctuation
    . , ! ? ' " : ; - _ ( ) @ # $ % ^ & < >
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s\.\,\!\?\'\":;\-\_\(\)\@\#\$\%\^\&\<\>]')
    return disallowed.sub('', text)

# word level processor
def lemmatizer(text) :
    """
    Lemmatize each alphabetic word of ``text`` with NLTK's WordNetLemmatizer.

    Only maximal runs of ASCII letters are passed to the lemmatizer; spacing,
    punctuation, digits and mixed alphanumeric tokens are left exactly as they
    are. Words wrapped in angle brackets (the ``<URL>``-style placeholders)
    are skipped so the placeholders cannot be altered.

    The previous implementation iterated over the string itself, i.e. it
    lemmatized one character at a time, which left the text unchanged.

    text    : str
    returns : str with every eligible word replaced by its lemma
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    # \b...\b keeps whole words only; the look-arounds exclude <PLACEHOLDER>
    word_pattern = r'(?<!<)\b[A-Za-z]+\b(?!>)'

    return re.sub(
        word_pattern,
        lambda match: wordnet_lemmatizer.lemmatize(match.group(0)),
        text,
    )

# final clean
def final_punc_removal(text):
    """
    Last cleaning pass: turn every character other than ASCII letters, digits,
    whitespace, ``<`` and ``>`` into a space, then squeeze whitespace runs to a
    single space and trim both ends.
    """
    spaced = re.sub(r'[^A-Za-z0-9\s<>]', ' ', text)
    squeezed = re.sub(r'\s+', ' ', spaced)
    return squeezed.strip()

    

#### main

In [None]:
def preprocess_email_text(raw): 
    """
    Run the whole cleaning pipeline on ONE email body.

    Intended to be applied element-wise, e.g. ``df['Body'].apply(...)``.

    input  : a single raw email body (plain text or HTML). Non-string values
             are coerced first: missing values (NaN/None) become "", anything
             else goes through ``str()``.
    output : cleaned, lower-cased string

    Stage order matters: HTML is stripped first so link targets inside <a>
    tags can be captured, entities are masked before de-obfuscation rewrites
    digits/symbols, and lower-casing comes last (so placeholders end up as
    ``<url>``, ``<email>``, ...).
    """
    # a missing Body cell arrives as float NaN and would crash the HTML parser
    if not isinstance(raw, str):
        raw = "" if pd.isna(raw) else str(raw)

    raw = strip_html(raw) # process html first to capture links from <a> tags
    raw = mask_tokens(raw) # mask special tokens 
    raw = deobfuscate_words(raw)
    raw = word_capper(raw)
    raw = lemmatizer(raw)
    raw = char_lvl_whitelist_filter(raw)
    raw = final_punc_removal(raw)
    raw = raw.lower()
    return raw
    



In [11]:
# Smoke-test the pipeline on the first row only.
# .copy() makes debug_test an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
debug_test = filtered_raw[['Body','Label']].iloc[:1].copy()
debug_test['Body'] = debug_test['Body'].apply(preprocess_email_text)
print(type(debug_test))
print(debug_test.dtypes)


<class 'pandas.core.frame.DataFrame'>
Body     object
Label     int64
dtype: object


In [None]:
def vocab_builder(
    input_df,
    corpus_path="emails_clean.txt",
    model_prefix="email_sp",
    vocab_size=40,
) : 
    """
    Train a SentencePiece model on the ``Body`` column of ``input_df``.

    The column is written to ``corpus_path`` (no index, no header) and that
    file is used as the trainer input. The trainer is given the lower-case
    placeholders ``<url>``, ``<email>``, ``<file>`` and ``<money>`` as
    user-defined symbols.

    input_df     : DataFrame with a ``Body`` column of cleaned text
    corpus_path  : where the training corpus is written
    model_prefix : prefix passed to the trainer as ``--model_prefix``
    vocab_size   : value passed as ``--vocab_size``; the default of 40 is
                   very small — presumably a debugging value, raise it for
                   real training (TODO confirm)
    returns      : None (side effects: files on disk)

    NOTE: the options are passed as one space-separated flag string, so
    ``corpus_path`` and ``model_prefix`` must not contain spaces.
    """
    input_df["Body"].to_csv(corpus_path, index=False, header=False)

    # train SentencePiece model
    spm.SentencePieceTrainer.Train(
        f"--input={corpus_path} "
        f"--model_prefix={model_prefix} "
        f"--vocab_size={vocab_size} "
        "--character_coverage=1.0 "
        "--user_defined_symbols=<url>,<email>,<file>,<money>"
    )

In [16]:
def vocab_to_id_mapper(
        input_df,
        max_len=256,
) :
    """
    Encode the ``Body`` column with the trained SentencePiece model.

    Loads ``email_sp.model`` and returns a COPY of ``input_df`` with two extra
    columns:
      * ``sp_ids``        - list of piece ids for each body
      * ``sp_ids_padded`` - int32 NumPy array of exactly ``max_len`` ids,
                            truncated or right-padded with the pad id

    The caller's frame is not modified (previously the new columns were
    written straight into ``input_df``).

    input_df : DataFrame with a ``Body`` column
    max_len  : fixed length of the padded id arrays (default 256)
    returns  : DataFrame
    """
    # --- 0) Setup ---
    sp = spm.SentencePieceProcessor()
    sp.load("email_sp.model")                 # your trained SentencePiece model

    # --- Parameters ---
    pad_id = sp.piece_to_id("<pad>")
    if pad_id == -1:   # fallback if no <pad> token defined
        pad_id = 0

    # --- Helpers ---
    def encode_ids(text: str) -> list[int]:
        """Encode one text; non-strings are coerced (missing -> "")."""
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        return sp.encode_as_ids(text)

    def pad_ids(
            ids
            , max_len
            , pad_id) -> np.ndarray:
        """Return a NumPy array of fixed length."""
        if len(ids) >= max_len:
            return np.array(ids[:max_len], dtype=np.int32)
        return np.array(ids + [pad_id] * (max_len - len(ids)), dtype=np.int32)

    # --- Apply to DataFrame ---
    # work on a copy so the input frame (possibly a slice) is left untouched
    df = input_df.copy()
    df["sp_ids"] = df["Body"].apply(encode_ids)

    # fixed-length NumPy arrays, one per row
    df["sp_ids_padded"] = df["sp_ids"].apply(lambda ids: pad_ids(ids, max_len, pad_id))

    return df


In [None]:
# Encode the pre-processed sample row: adds the sp_ids and sp_ids_padded columns.
res = vocab_to_id_mapper(debug_test)

                                                Body  Label  \
0  status john im not really sure what happened b...      0   

                                              sp_ids  \
0  [7, 9, 10, 33, 25, 9, 7, 38, 20, 19, 11, 13, 3...   

                                       sp_ids_padded  
0  [7, 9, 10, 33, 25, 9, 7, 38, 20, 19, 11, 13, 3...  


In [None]:
def word2vec_embedder(
    input_df
) : 