In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse
import numpy as np
import warnings
import email # This library is crucial for parsing raw email text files

# Suppress future warnings from BeautifulSoup for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning, module='bs4')

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
def extract_features(data, text_column='body', subject_column='subject',
                     nltk_data_dir="C:\\nltk_data_project"):
    """Build the numeric and text features for a frame of emails.

    The feature logic (regexes, keyword lists, column names and order) is
    deliberately left exactly as it was: the original note on this function
    says it must match the copy in Notebook 1 so both classes get the same
    feature columns. Only the NLTK data-path handling was changed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `text_column` and `subject_column`. It is copied,
        so the caller's frame is not modified.
    text_column : str
        Name of the column holding the email body.
    subject_column : str
        Name of the column holding the email subject.
    nltk_data_dir : str or None
        Extra directory registered in ``nltk.data.path`` so NLTK can find its
        corpora. Defaults to the location that used to be hard-coded. It is
        appended only if not already present, so repeated calls no longer
        grow the list. Pass ``None`` or ``''`` to skip registration.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: num_hyperlinks, num_suspicious_links, num_ip_urls,
        body_char_count, subject_char_count, urgency_keyword_count,
        num_capital_words, special_char_ratio, has_javascript, has_form_tag,
        processed_text.

    Notes
    -----
    NOTE(review): the following quirks are preserved on purpose. Fixing them
    here alone would make this notebook's features diverge from Notebook 1's;
    change both notebooks together.
    * ``num_capital_words`` is computed on ``clean_text`` *after* it has been
      lower-cased, so the ``[A-Z]{3,}`` pattern can never match and the
      feature is always 0.
    * ``has_javascript`` / ``has_form_tag`` look for the literal ``<script>``
      and ``<form>``; tags that carry attributes are not matched.
    * ``num_ip_urls`` only counts an IPv4 address that has a ``/`` on both
      sides, so a URL ending right after the address is not counted.
    """
    df = data.copy()

    # Make sure NLTK can locate its data, registering the directory only once.
    if nltk_data_dir and nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

    # Subject and body joined; missing values become empty strings.
    df['full_text'] = df[subject_column].fillna('') + ' ' + df[text_column].fillna('')

    # Strip HTML markup, then lower-case the remaining text.
    df['clean_text'] = df['full_text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text(separator=' '))
    df['clean_text'] = df['clean_text'].str.lower()

    # --- Link-based features (extracted from the raw, un-stripped text) ---
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    df['urls'] = df['full_text'].apply(lambda x: re.findall(url_pattern, str(x)))
    df['num_hyperlinks'] = df['urls'].apply(len)
    suspicious_domains = [
        'forms.gle', 'weebly.com', 'blogspot.com', 'glitch.me', 'repl.co',
        '.ipfs.', '127.0.0.1', 'bit.ly', 'tinyurl.com', 'dweb.link',
    ]
    df['num_suspicious_links'] = df['urls'].apply(lambda urls: sum(1 for url in urls if any(domain in url for domain in suspicious_domains)))
    df['num_ip_urls'] = df['urls'].apply(lambda urls: sum(1 for url in urls if re.search(r'/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/', url)))

    # --- Length features (missing text counts as length 0) ---
    df['body_char_count'] = df[text_column].str.len().fillna(0)
    df['subject_char_count'] = df[subject_column].str.len().fillna(0)

    # --- Wording / character features ---
    urgency_words = [
        'urgent', 'action required', 'important notice', 'warning', 'verify',
        'suspension', 'deactivation', 'unsuccessful', 'payment', 'invoice',
    ]
    # Counts how many distinct urgency phrases occur (substring match), not total occurrences.
    df['urgency_keyword_count'] = df['full_text'].str.lower().apply(lambda text: sum(1 for word in urgency_words if word in str(text)))
    # NOTE(review): runs on lower-cased text, so this is always 0 (see docstring).
    df['num_capital_words'] = df['clean_text'].apply(lambda text: len(re.findall(r'\b[A-Z]{3,}\b', str(text))))
    # The tiny epsilon avoids division by zero on empty text.
    df['special_char_ratio'] = df['full_text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', str(x))) / (len(str(x)) + 1e-6))

    # --- Markup flags (body only) ---
    df['has_javascript'] = df[text_column].str.contains('<script>', case=False, na=False).astype(int)
    df['has_form_tag'] = df[text_column].str.contains('<form>', case=False, na=False).astype(int)

    # --- Normalised text for TF-IDF ---
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text_for_tfidf(text):
        """Tokenise with a regex, drop stop words / non-alphabetic tokens, lemmatise."""
        tokens = re.findall(r'\b\w+\b', text.lower())  # Use dependency-free regex tokenizer
        lemmas = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
        return ' '.join(lemmas)

    df['processed_text'] = df['clean_text'].apply(preprocess_text_for_tfidf)

    feature_cols = [
        'num_hyperlinks', 'num_suspicious_links', 'num_ip_urls',
        'body_char_count', 'subject_char_count', 'urgency_keyword_count',
        'num_capital_words', 'special_char_ratio', 'has_javascript',
        'has_form_tag', 'processed_text',
    ]
    return df[feature_cols]

def _decode_text_part(part):
    """Return the text carried by one non-multipart MIME part as a ``str``."""
    transfer_encoding = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if transfer_encoding in ('base64', 'quoted-printable', 'x-uuencode', 'uuencode', 'uue', 'x-uue'):
        # Only transfer-encoded payloads need the bytes round trip.
        raw_bytes = part.get_payload(decode=True)
        if not isinstance(raw_bytes, bytes):
            return ''
        # Honour the charset declared by the part; fall back to UTF-8.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return raw_bytes.decode(charset, errors='replace')
        except LookupError:
            # The declared charset is not known to Python.
            return raw_bytes.decode('utf-8', errors='replace')
    # The message was parsed from a str, so an un-encoded payload is already
    # text. Re-encoding it (what decode=True does) would corrupt non-ASCII
    # characters, so it is returned unchanged.
    payload = part.get_payload()
    return payload if isinstance(payload, str) else ''


def parse_enron_email(raw_email):
    """
    Parses a raw email string (from the Enron dataset) into a subject and body.

    Parameters
    ----------
    raw_email : str
        Full raw message text (headers, blank line, body). Any non-string
        value, such as a missing CSV cell, yields ``('', '')``.

    Returns
    -------
    tuple of (str, str)
        ``(subject, body)``. The subject is ``''`` when the header is absent.
        For multipart messages the body is the first ``text/plain`` part, or
        ``''`` if there is none; otherwise it is the message payload.
    """
    if not isinstance(raw_email, str):
        return '', ''

    msg = email.message_from_string(raw_email)
    # str() guards against the header being returned as a Header object.
    subject = str(msg.get('Subject', ''))

    if not msg.is_multipart():
        # If the email is not multipart, the payload is the body.
        return subject, _decode_text_part(msg)

    for part in msg.walk():
        # We are only interested in the first plain text part of the email.
        if part.get_content_type() == 'text/plain':
            return subject, _decode_text_part(part)

    return subject, ''

# Confirmation message so the cell output shows the helper definitions ran.
print("Helper functions defined, including the Enron email parser.")

Helper functions defined, including the Enron email parser.


In [3]:
normal_filepath = 'emails-normal-class.csv'
print(f"Loading and processing NORMAL (Enron) data from '{normal_filepath}'...")

try:
    # This file has a 'file' and 'message' column. We will process the 'message' column.
    normal_df_raw = pd.read_csv(normal_filepath, encoding='utf-8')
    print(f"-> Loaded {len(normal_df_raw)} raw email messages.")

    # Use our special parser to extract 'subject' and 'body' from the 'message' column.
    # The frame is built once from a list of (subject, body) tuples instead of
    # creating one pd.Series per row, which is far slower on a large file and
    # does not yield a two-column frame when the input is empty.
    print("-> Parsing raw email messages into 'subject' and 'body' columns. This may take a moment...")
    parsed_emails = pd.DataFrame(
        [parse_enron_email(raw_message) for raw_message in normal_df_raw['message']],
        columns=['subject', 'body'],
        index=normal_df_raw.index,
    )

    # Now that we have a standard DataFrame, apply our feature engineering function,
    # then add the label for this dataset (0 for normal). assign() returns a new
    # frame, so nothing is written onto a column slice of another frame.
    print("-> Engineering features for normal emails...")
    normal_features = extract_features(parsed_emails).assign(label=0)

    # Define the output filename for our second checkpoint
    output_filename = 'normal_features.csv'

    # Save the processed data to a new CSV file
    normal_features.to_csv(output_filename, index=False, encoding='utf-8')

    print("-" * 50)
    print(f"SUCCESS! Processed {len(normal_features)} normal emails.")
    print(f"Saved the results to '{output_filename}'")
    print("-" * 50)

    # Display the first few rows of the output for verification
    print("\nSample of the saved data:")
    display(normal_features.head())

except FileNotFoundError:
    print(f"FATAL ERROR: The file '{normal_filepath}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the traceback is visible and "Run All" stops here instead of
    # silently continuing with a missing or stale checkpoint file.
    raise

Loading and processing NORMAL (Enron) data from 'emails-normal-class.csv'...
-> Loaded 517401 raw email messages.
-> Parsing raw email messages into 'subject' and 'body' columns. This may take a moment...
-> Engineering features for normal emails...



If you meant to use Beautiful Soup to parse the contents of a file on disk, then something has gone wrong. You should open the file first, using code like this:

    filehandle = open(your filename)

You can then feed the open filehandle into Beautiful Soup instead of using the filename.



    
  df['clean_text'] = df['full_text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text(separator=' '))


--------------------------------------------------
SUCCESS! Processed 517401 normal emails.
Saved the results to 'normal_features.csv'
--------------------------------------------------

Sample of the saved data:


Unnamed: 0,num_hyperlinks,num_suspicious_links,num_ip_urls,body_char_count,subject_char_count,urgency_keyword_count,num_capital_words,special_char_ratio,has_javascript,has_form_tag,processed_text,label
0,0,0,0,23,0,0,0,0.0,0,0,forecast,0
1,0,0,0,786,3,0,0,0.016456,0,0,traveling business meeting take fun trip espec...,0
2,0,0,0,30,8,0,0,0.128205,0,0,test test successful way go,0
3,0,0,0,187,0,0,0,0.026596,0,0,randy send schedule salary level everyone sche...,0
4,0,0,0,35,9,0,0,0.088889,0,0,hello let shoot tuesday,0
