In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
import sentencepiece as spm
from gensim.models import Word2Vec

#### archive

In [None]:
# full_raw = pd.read_csv('/Users/javier/VSCODE/datahub/enron_data_fraud_labeled.csv') 

# filtered_raw = full_raw[['Body','Label']].iloc[:100_000]

# filtered_raw.to_csv('test.csv',index = False)

In [None]:
# test = "Hi John,  Please check out our new product at https://www.example.com/special-offer.  You can also visit www.testsite.org for more details.  I've attached the latest report as quarterly_results.pdf and also a backup copy as report.docx send to abc@gmail.com and john.doe@gmail.org. Let me know if you have trouble opening summary.xlsx.  Best,  Alice"

# html_text = """
# <html>
#   <head>
#     <title>Quarterly Update</title>
#   </head>
#   <body>
#     <h1>Special Offer!</h1>
#     <p>Dear customer,</p>
    
#     <p>
#       Please download the latest reports:
#       <a href="https://example.com/files/quarterly_report.pdf">Quarterly Report</a>,
#       <a href="https://example.com/files/summary.docx">Summary</a>,
#       and <a href="https://example.com/files/data.xlsx">Data File</a>.
#     </p>

#     <p>
#       If you cannot access the files, please email 
#       <a href="mailto:support@example.com">support@example.com</a>.
#     </p>

#     <p>
#       Alternatively, you may contact John at john.doe@workmail.org or visit our site 
#       <a href="http://www.testsite.org">www.testsite.org</a>.
#     </p>

#     <p>
#       Attached reference documents: <b>budget_2024.pdf</b>, <b>plan_final.docx</b>
#     </p>

#     <p>
#       loveeeeeeeeee
#       lovee33333eeee
#       a-p-p-l-e
#       b.a.n.a.n.a
#       fr33 c4$h 
#       Helloüåç!! This*** is a test üòé #spam @user $100...
#       45 46 20000 32323 $222.22
#     </p
#   </body>
# </html>
# """


In [None]:
# Load the e-mail sample CSV and print the dtype of every column.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# author's machine — consider a configurable data directory.
filtered_raw = pd.read_csv('/Users/javier/VSCODE/local/DSA4213_vsc/final_project/test.csv')
print(filtered_raw.dtypes)


#### helpers

In [None]:
# masking special token
def mask_tokens(text):
    """
    Replace URLs, file names, e-mail addresses and dollar amounts with
    placeholder tokens, then delete standalone integers.

    The substitutions are applied one after another in the order
    URL -> FILE -> EMAIL -> MONEY -> NUMBER.
    """
    substitutions = (
        # URLs (http, https, www)
        (r'(https?://\S+|www\.\S+)', '<URL>'),
        # file names ending in one of the listed extensions (customize list)
        (r'\b[\w\-]+\.(pdf|docx|xlsx|txt|csv|tar|doc\.gz|doc)\b', '<FILE>'),
        # e-mail addresses
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>'),
        # dollar amounts, optionally followed by two decimals
        (r'\$\d+(?:\.\d{2})?', '<MONEY>'),
        # standalone integers
        (r'\b\d+\b', '<NUMBER>'),
    )

    masked = text
    for pattern, token in substitutions:
        masked = re.sub(pattern, token, masked)

    # numbers are tagged first and then dropped from the text altogether
    return masked.replace('<NUMBER>', '')


# strip HTML from raw text
def strip_html(raw_html):
    """
    Strip HTML tags, scripts, styles, and normalize whitespace
    to return clean raw text from HTML emails.

    Anchors whose href is nothing but a URL, e-mail address, file name or
    dollar amount are replaced by the matching placeholder token
    (<URL>, <EMAIL>, <FILE>, <MONEY>) before the text is extracted, so the
    link target is not lost when the tag is removed. For "mailto:" links the
    scheme and any "?subject=..." query are dropped first; otherwise the
    masked href would be "mailto:<EMAIL>" and never match.

    input  : raw e-mail body as a string (HTML or plain text)
    output : tag-free text with all whitespace collapsed to single spaces
    """
    soup = BeautifulSoup(raw_html, "html.parser")

    # mask_tokens() deletes bare numbers, so it can never hand back "<NUMBER>"
    anchor_tokens = {'<URL>', '<EMAIL>', '<FILE>', '<MONEY>'}

    for a in soup.find_all("a"):
        href = a.get("href", None)
        if not href:   # skip if no href
            continue

        target = href.strip()
        if target.lower().startswith("mailto:"):
            # "mailto:user@host?subject=hi" -> "user@host"
            target = target[len("mailto:"):].split("?", 1)[0]

        a_attribute = mask_tokens(target)

        # only swap the anchor when the whole href collapsed to one token
        if a_attribute in anchor_tokens:
            a.replace_with(a_attribute)

    # remove script, style, head, and metadata tags
    for tag in soup(["script", "style", "head", "title", "meta", "[document]"]):
        tag.decompose()

    # extract text
    text = soup.get_text(separator=" ")

    # normalize unicode
    text = unicodedata.normalize("NFKC", text)

    # replace non-breaking spaces specifically (unicode)
    text = text.replace("\xa0", " ")

    # collapse all whitespace (line breaks, tabs, multiple spaces) into one
    # space and trim leading/trailing spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# special case handling
# look-alike characters used to disguise words, mapped back to the letter they imitate
mapper = str.maketrans({
    '0':'o','1':'l','3':'e','4':'a','5':'s','7':'t','$':'s','@':'a'
})

def deobfuscate_words(text):
    """
    Undo common obfuscation inside words.

    1. look-alike characters are translated back to letters via `mapper`
       (fr33 c4$h -> free cash)
    2. runs of 1-3 non-alphanumeric, non-space characters that sit between
       two word characters are deleted, not replaced by a space
       (l-o-v-e -> love, b.a.n.a.n.a -> banana)

    '<' and '>' are never deleted, so placeholder tokens such as <URL> or
    <EMAIL> stay intact when they touch punctuation (email:<EMAIL>).
    """
    # replace digits / symbols with the letters they stand in for
    text = text.translate(mapper)
    # drop separators wedged inside a word, but leave token brackets alone
    text = re.sub(r'(?<=\w)[^A-Za-z0-9\s<>]{1,3}(?=\w)', '', text)
    return text

def word_capper(text):
    """
    Cap character repetition at two: any character repeated three or more
    times in a row is cut down to a pair, and runs of ! ? . , are likewise
    kept at two.
    """
    long_run = re.compile(r'(.)\1{' + str(2) + r',}')
    punct_run = re.compile(r'([!?.,])\1{1,}')

    squeezed = long_run.sub(lambda run: run.group(1) * 2, text)
    return punct_run.sub(r'\1\1', squeezed)


# whitelist filtering
def char_lvl_whitelist_filter(text):
    """
    Delete every character that is not a letter, digit, whitespace or one of
    the punctuation marks listed in the pattern below.
    """
    not_whitelisted = re.compile(r'[^a-zA-Z0-9\s\.\,\!\?\'\":;\-\_\(\)\@\#\$\%\^\&\<\>]')
    return not_whitelisted.sub('', text)

# word level processor 
def lemmatizer(text) :
    """
    Lemmatize every alphabetic word in `text` with NLTK's WordNetLemmatizer.

    Iterating over the string directly would visit one character at a time,
    so words are located with a regex instead. Each run of letters is
    replaced by its lemma; whitespace, digits, punctuation and the <...>
    brackets of placeholder tokens are left exactly where they were.

    input  : text as a single string
    output : the same string with each word replaced by its lemma

    NOTE(review): lemmatize() is called with its default part of speech and
    on the word's original casing — confirm whether words should be
    lower-cased before lookup.
    """
    wnl = WordNetLemmatizer()
    return re.sub(r'[A-Za-z]+', lambda m: wnl.lemmatize(m.group(0)), text)

#final clean
def final_punc_removal(text):
    """
    Turn everything except letters, digits, whitespace and the < > token
    brackets into a space, then collapse whitespace runs to one space and
    trim both ends.
    """
    without_punct = re.compile(r'[^A-Za-z0-9\s<>]').sub(' ', text)
    single_spaced = re.compile(r'\s+').sub(' ', without_punct)
    return single_spaced.strip()

    
def preprocess_email_text(raw): 
    """
    the whole cleaning pipeline for ONE e-mail body
    input : raw e-mail text (HTML or plain); None / NaN is treated as an empty string
    output : cleaned, lower-cased text as a string
    """
    # missing bodies (None, NaN floats from pandas) are not valid input for the
    # string-based steps below, so coerce them the same way encode_ids does
    if not isinstance(raw, str):
        raw = "" if pd.isna(raw) else str(raw)

    raw = strip_html(raw) # process html first to capture links from <a> tags
    raw = mask_tokens(raw) # mask special tokens 
    raw = deobfuscate_words(raw) # map look-alike characters back, drop in-word separators
    raw = word_capper(raw) # cap repeated characters at two
    raw = lemmatizer(raw) # word-level normalisation
    raw = char_lvl_whitelist_filter(raw) # drop characters outside the whitelist
    raw = final_punc_removal(raw) # remaining punctuation -> spaces, collapse whitespace
    raw = raw.lower()
    return raw

def preprocess_email_df(df, text_col):
    """
    Clean the text column of a dataframe with preprocess_email_text.

    input  : df - dataframe holding the raw e-mails
             text_col - name of the column to clean
    output : a NEW dataframe with `text_col` cleaned; `df` itself is left
             untouched, so re-running the calling cell never cleans
             already-cleaned text
    """
    cleaned = df.copy()
    cleaned[text_col] = cleaned[text_col].apply(preprocess_email_text)
    return cleaned


def vocab_builder(
    input_df
    ,vocab_size
    ,model_type
) : 
    """
    Dump the "Body" column to emails_clean.txt (one row per line, no header,
    no index) and train a SentencePiece model on that file.

    input : input_df - dataframe with a "Body" column
            vocab_size - value passed to --vocab_size
            model_type - value passed to --model_type
    output : None; the trainer is run with model prefix "email_sp"
    """
    input_df["Body"].to_csv("emails_clean.txt", index=False, header=False)

    # one flag per entry, joined by single spaces into the trainer's argument string
    trainer_flags = [
        "--input=emails_clean.txt",
        "--model_prefix=email_sp",
        f"--vocab_size={vocab_size}",
        "--character_coverage=1.0",
        f"--model_type={model_type}",
        "--shuffle_input_sentence=false",
        "--seed_sentencepiece_size=1000000",
        "--user_defined_symbols=<url>,<email>,<file>,<money>,<pad>",
    ]

    # train SentencePiece model
    spm.SentencePieceTrainer.Train(" ".join(trainer_flags))


def vocab_to_id_mapper(
        input_df
        ,max_len
) :
    """
    Encode the "Body" column with the SentencePiece model stored in
    email_sp.model.

    input : input_df - dataframe with a "Body" column
            max_len - number of ids kept per row
    output : copy of input_df with two extra columns:
             "sp_ids" - list of piece ids for the row
             "sp_ids_padded" - int32 NumPy array of exactly max_len ids,
                               truncated or right-padded with the <pad> id
    """
    processor = spm.SentencePieceProcessor()
    processor.load("email_sp.model")

    # fall back to id 0 when the lookup reports -1
    pad_token_id = processor.piece_to_id("<pad>")
    pad_token_id = 0 if pad_token_id == -1 else pad_token_id

    def _to_ids(body):
        # non-string cells: NaN becomes "", anything else is stringified
        if isinstance(body, str):
            return processor.encode_as_ids(body)
        return processor.encode_as_ids("" if pd.isna(body) else str(body))

    def _fit_length(ids) -> np.ndarray:
        clipped = ids[:max_len]
        filler = [pad_token_id] * (max_len - len(clipped))
        return np.array(clipped + filler, dtype=np.int32)

    encoded = input_df.copy()
    encoded["sp_ids"] = encoded["Body"].apply(_to_ids)
    encoded["sp_ids_padded"] = encoded["sp_ids"].apply(_fit_length)

    return encoded


def train_word2vec(
    input_df,
    vector_size: int = 128,
    window: int = 5,
    min_count: int = 2,
    epochs: int = 10,
    seed: int = 42
):
    """
    Train a skip-gram Word2Vec model on SentencePiece pieces.

    Every id list in input_df["sp_ids"] is converted back to its pieces
    (ids equal to the pad id are skipped) and the resulting token lists are
    the training sentences.

    output : (trained Word2Vec model, loaded SentencePiece processor, pad id)
    """
    sp = spm.SentencePieceProcessor()
    sp.load("email_sp.model")

    pad_id = sp.piece_to_id("<pad>")
    if pad_id == -1:
        pad_id = 0

    # rebuild one list of pieces per e-mail, leaving padding out
    corpus_pieces = []
    for id_list in input_df["sp_ids"]:
        pieces = []
        for token_id in id_list:
            if token_id == pad_id:
                continue
            pieces.append(sp.id_to_piece(token_id))
        corpus_pieces.append(pieces)

    w2v_params = dict(
        sentences=corpus_pieces,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=1,   # keep reproducibility
        epochs=epochs,
        seed=seed,
    )
    w2v = Word2Vec(**w2v_params)

    return w2v, sp, pad_id


def build_embedding_matrix(w2v, sp, pad_id: int, seed: int = 42):
    """
    Build an embedding table whose row i belongs to SentencePiece id i.

    Pieces present in w2v.wv get their trained vector; every other piece
    gets a draw from N(0, 0.01) using a generator seeded with `seed`. The
    pad row is zeroed last when pad_id lies inside the table.

    output : (float32 array of shape (vocab_size, emb_dim), metadata dict)
    """
    n_pieces = sp.get_piece_size()
    dim = w2v.vector_size

    table = np.zeros((n_pieces, dim), dtype=np.float32)
    generator = np.random.default_rng(seed)

    for row in range(n_pieces):
        token = sp.id_to_piece(row)
        if token not in w2v.wv:
            # piece never seen by Word2Vec: small random initialisation
            table[row] = generator.normal(0.0, 0.01, size=dim).astype(np.float32)
            continue
        table[row] = w2v.wv[token]

    # Keep PAD = 0
    if 0 <= pad_id < n_pieces:
        table[pad_id] = 0.0

    n_trained = len(w2v.wv)
    summary = dict(
        vocab_size=n_pieces,
        emb_dim=dim,
        pad_id=pad_id,
        trained_vocab=n_trained,
        oov_count=n_pieces - n_trained,
    )
    return table, summary


def zz_word2vec_embedder(
    input_df,

    # model param 
    vector_size: int = 128,
    window: int = 5,
    min_count: int = 2,
    epochs: int = 10 ,

    # reproducibility
    seed: int = 42
):
    """
    One-call wrapper: train Word2Vec on the SentencePiece pieces in
    input_df["sp_ids"] and build the embedding table aligned with the
    SentencePiece ids.

    The work is delegated to train_word2vec and build_embedding_matrix so
    the training settings and the table-building rules exist in one place
    only, instead of being copied here.

    output : (embedding matrix, trained Word2Vec model, metadata dict)
    """
    w2v, sp, pad_id = train_word2vec(
        input_df,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        seed=seed,
    )

    # same seed drives the random rows for pieces Word2Vec did not learn
    E, metadata = build_embedding_matrix(w2v, sp, pad_id, seed=seed)

    return E, w2v, metadata

#### main

In [4]:
# Smoke-test the text pipeline on the first row only.
filtered_raw = pd.read_csv('/Users/javier/VSCODE/local/DSA4213_vsc/final_project/test.csv')
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of filtered_raw (avoids pandas' SettingWithCopyWarning)
debug_test = filtered_raw[['Body','Label']].iloc[:1].copy()
debug_test['Body'] = debug_test['Body'].apply(preprocess_email_text)
print(type(debug_test))
debug_test


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Body,Label
0,status john im not really sure what happened b...,0


In [7]:
## clean the text and keep the result in a variable so later cells can reuse it
# NOTE(review): debug_test['Body'] has already been passed through
# preprocess_email_text in the cell above, so this runs the pipeline a second
# time on cleaned text — confirm the intended input is the raw frame.
raw_df = debug_test

clean_df = preprocess_email_df(raw_df,'Body')



In [None]:
## build vocab 
# vocab builder (ONLY CALL ONCE)
# vocab_builder takes (input_df, vocab_size, model_type); the extra positional
# argument that used to sit between 8000 and 'bpe' raised a TypeError
vocab_builder(clean_df, 8000, 'bpe')

In [10]:
# map vocab to id: reload the cleaned e-mails from disk and encode them
# NOTE(review): no visible cell writes clean_df.csv — confirm where it is
# produced, otherwise this cell fails on a fresh run.
clean_df = pd.read_csv('/Users/javier/VSCODE/local/DSA4213_vsc/final_project/clean_df.csv')

mapped_df = vocab_to_id_mapper(clean_df,256) # this number is the token limit per email


In [None]:
# train Word2Vec on the encoded e-mails; also returns the SentencePiece processor and pad id
w2v_model, subword_processor, pad_id = train_word2vec(mapped_df)

# save the model so later cells can reload it instead of retraining
w2v_model.save('word2vec.model')

In [14]:
# build the SentencePiece-id -> word-embedding matrix
# load saved model
w2v_model = Word2Vec.load("word2vec.model")

# SentencePiece model & pad_id
# NOTE(review): sentencepiece is already imported in the first cell; this repeat is redundant.
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.load("email_sp.model")
pad_id = sp.piece_to_id("<pad>")
# fall back to id 0 when the lookup reports -1
if pad_id == -1:
    pad_id = 0

subword_processor = sp 

embedding_matrix, embedding_summary = build_embedding_matrix(w2v_model,subword_processor,pad_id)

In [16]:
# display the metadata dict returned by build_embedding_matrix
embedding_summary

{'vocab_size': 8000,
 'emb_dim': 128,
 'pad_id': 7,
 'trained_vocab': 7981,
 'oov_count': 19}

In [17]:
# look up the id of the <pad> piece in the SentencePiece model and display it
pad_id = subword_processor.piece_to_id('<pad>')
pad_id

7