In [8]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse
import numpy as np
import warnings

# Suppress future warnings from BeautifulSoup for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning, module='bs4')

print("Libraries imported successfully.")

Libraries imported successfully.


In [9]:
# --- MODIFIED VERSION ---
def extract_features(data, text_column='body', subject_column='subject'):
    """
    This function takes a DataFrame and engineers a rich set of features.
    """
    df = data.copy()
    
    # Combine subject and body for text analysis
    df['full_text'] = df[subject_column].fillna('') + ' ' + df[text_column].fillna('')
    
    # --- Text Cleaning ---
    df['clean_text'] = df['full_text'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text(separator=' '))
    df['clean_text'] = df['clean_text'].str.lower()
    
    # --- URL and Hyperlink Features ---
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    df['urls'] = df['full_text'].apply(lambda x: re.findall(url_pattern, str(x)))
    df['num_hyperlinks'] = df['urls'].apply(len)
    
    suspicious_domains = ['forms.gle', 'weebly.com', 'blogspot.com', 'glitch.me', 'repl.co', '.ipfs.', '127.0.0.1', 'bit.ly', 'tinyurl.com', 'dweb.link']
    df['num_suspicious_links'] = df['urls'].apply(lambda urls: sum(1 for url in urls if any(domain in url for domain in suspicious_domains)))

    df['num_ip_urls'] = df['urls'].apply(lambda urls: sum(1 for url in urls if re.search(r'/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/', url)))
    
    # --- Stylistic & Structural Features ---
    df['body_char_count'] = df[text_column].str.len().fillna(0)
    df['subject_char_count'] = df[subject_column].str.len().fillna(0)
    
    urgency_words = ['urgent', 'action required', 'important notice', 'warning', 'verify', 'suspension', 'deactivation', 'unsuccessful', 'payment', 'invoice']
    df['urgency_keyword_count'] = df['full_text'].str.lower().apply(lambda text: sum(1 for word in urgency_words if word in str(text)))
    
    df['num_capital_words'] = df['clean_text'].apply(lambda text: len(re.findall(r'\b[A-Z]{3,}\b', str(text))))
    
    df['special_char_ratio'] = df['full_text'].apply(lambda x: len(re.findall(r'[^a-zA-Z0-9\s]', str(x))) / (len(str(x)) + 1e-6))
    
    df['has_javascript'] = df[text_column].str.contains('<script>', case=False, na=False).astype(int)
    df['has_form_tag'] = df[text_column].str.contains('<form>', case=False, na=False).astype(int)

    # --- Preprocessing for TF-IDF Vectorization ---
    # We still need WordNet for the lemmatizer, so the NLTK data setup is not wasted.
    nltk.data.path.append("C:\\nltk_data_project")
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    def preprocess_text_for_tfidf(text):
        # *** THE FIX: Replace NLTK's tokenizer with a simple, dependency-free regex tokenizer ***
        tokens = re.findall(r'\b\w+\b', text.lower())
        # The rest of the function remains the same
        lemmas = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
        return ' '.join(lemmas)

    df['processed_text'] = df['clean_text'].apply(preprocess_text_for_tfidf)
    
    feature_cols = [
        'num_hyperlinks', 'num_suspicious_links', 'num_ip_urls', 'body_char_count',
        'subject_char_count', 'urgency_keyword_count', 'num_capital_words',
        'special_char_ratio', 'has_javascript', 'has_form_tag', 'processed_text'
    ]
    return df[feature_cols]

def load_and_clean_csv(filepath, cols_map):
    """A robust CSV loader."""
    try:
        df = pd.read_csv(
            filepath,
            on_bad_lines='skip',
            encoding='utf-8',
            low_memory=False
        ).rename(columns=cols_map)
        
        for col in cols_map.values():
            if col not in df.columns:
                 raise ValueError(f"Required column '{col}' not found in {filepath} after renaming.")
        return df[[*cols_map.values()]]
    except FileNotFoundError:
        print(f"FATAL ERROR: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        print(f"FATAL ERROR: Could not read '{filepath}'. Reason: {e}")
        return None

print("Helper functions defined with new tokenizer.")

Helper functions defined with new tokenizer.


In [10]:
# --- ADD THIS CRITICAL LINE TO BYPASS ENVIRONMENT ISSUES ---
nltk.data.path.append("C:\\nltk_data_project")
# -----------------------------------------------------------

# Now, the rest of the original code
phishing_filepath = 'CaptstoneProjectData_2025.csv'
print(f"Loading and processing PHISHING data from '{phishing_filepath}'...")

# Use the loader function with the correct column mapping for the phishing CSV
phishing_df_raw = load_and_clean_csv(phishing_filepath, {'Subject': 'subject', 'Body': 'body'})

# Proceed only if the file was loaded successfully
if phishing_df_raw is not None:
    # Apply the feature engineering
    phishing_features = extract_features(phishing_df_raw)
    
    # Add the label for this dataset
    phishing_features['label'] = 1
    
    # Define the output filename
    output_filename = 'phishing_features.csv'
    
    # Save the processed data to a new CSV file
    phishing_features.to_csv(output_filename, index=False, encoding='utf-8')
    
    print("-" * 50)
    print(f"SUCCESS! Processed {len(phishing_features)} phishing emails.")
    print(f"Saved the results to '{output_filename}'")
    print("-" * 50)
    
    # Display the first few rows of the output for verification
    print("\nSample of the saved data:")
    display(phishing_features.head())

Loading and processing PHISHING data from 'CaptstoneProjectData_2025.csv'...
--------------------------------------------------
SUCCESS! Processed 2576 phishing emails.
Saved the results to 'phishing_features.csv'
--------------------------------------------------

Sample of the saved data:


Unnamed: 0,num_hyperlinks,num_suspicious_links,num_ip_urls,body_char_count,subject_char_count,urgency_keyword_count,num_capital_words,special_char_ratio,has_javascript,has_form_tag,processed_text,label
0,2,0,0,962.0,126.0,0,0,0.070707,0,0,review shipment detail shipment notification c...,1
1,3,0,0,1337.0,23.0,0,0,0.256429,0,0,υоur ассоunt іѕ оn hоld votre réponse bien été...,1
2,9,0,0,3264.0,65.0,1,0,0.07958,0,0,completed invoice bestbuy com notice message s...,1
3,1,1,0,545.0,22.0,3,0,0.028169,0,0,uvic important notice uvic account filed list ...,1
4,1,0,0,1284.0,41.0,1,0,0.034691,0,0,suspended incoming message message generated u...,1
