In [None]:
import html
import re
import unicodedata
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
# Silence every warning category so library notices do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Show all columns, and up to 100 characters per cell, when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

In [3]:
# Configuration: random seed and data locations
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # seed NumPy's global RNG

# Both folders are resolved relative to the current working directory.
RAW_DATA_PATH = Path('data/raw')
PREPROCESSED_DATA_PATH = Path('data/preprocessing')
# parents=True: also create a missing `data/` folder instead of raising FileNotFoundError.
PREPROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)


In [4]:
# Collect every *.csv directly inside the raw-data folder, sorted by path
csv_files = sorted(RAW_DATA_PATH.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files:\n")
for csv_path in csv_files:
    print(f"- {csv_path.name}")


Found 6 CSV files:

- CEAS_08.csv
- Enron.csv
- Ling.csv
- Nazario.csv
- Nigerian_Fraud.csv
- SpamAssasin.csv


In [5]:
# Helper to repair Nazario dataset
def load_nazario_fixed(path):
    """Load the Nazario CSV, falling back to a line-by-line repair.

    A regular ``pd.read_csv`` (malformed rows skipped) is tried first. If it
    raises, or the parsed frame lacks any of ``subject``/``body``/``label``,
    the file is re-read as text and repaired by hand: only lines ending in
    ``,0`` or ``,1`` are kept, that trailing field is dropped, and the rest is
    split on the first four commas so that field 4 becomes the subject and
    everything after it the body.

    Every returned row is labelled 1 regardless of the value in the file
    (presumably because this corpus is phishing-only - confirm with the data
    source).

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject``, ``body``, ``label`` (possibly zero rows).

    Raises
    ------
    OSError
        If the file cannot be opened during the manual repair.
    """
    wanted = ["subject", "body", "label"]
    try:
        # `on_bad_lines="skip"` is the supported spelling of the old
        # `error_bad_lines=False`; the old keyword raises TypeError on
        # pandas >= 2.0, which silently forced the lossy repair path below.
        df = pd.read_csv(path, encoding="utf-8", on_bad_lines="skip")
        if set(wanted).issubset(df.columns):
            # .copy() so the label assignment does not write into a slice of `df`.
            df = df[wanted].copy()
            df["label"] = 1
            print(f"Loaded {path} normally.")
            return df
    except Exception as e:
        print(f"Normal CSV read failed: {e}")

    print(f"Nazario file misaligned.Attempting manual repair for {path}")
    rows = []
    with open(path, encoding="utf-8", errors="ignore") as f:
        text = f.read()

    trailing_label = re.compile(r",([01])\s*$")
    for line in text.splitlines():
        line = line.strip()
        # Only lines that end in a 0/1 label field are treated as records.
        if not line or not trailing_label.search(line):
            continue
        line = trailing_label.sub("", line)
        parts = line.split(",", 4)
        if len(parts) >= 4:
            subj = parts[3].strip('" ')
            body = parts[4] if len(parts) > 4 else ""
            rows.append({"subject": subj, "body": body, "label": 1})
    # Explicit columns keep the schema stable even when nothing was recovered.
    df = pd.DataFrame(rows, columns=wanted)
    print(f"Nazario repaired: {len(df)} rows recovered.")
    return df

#Load and Inspect each File
def _read_csv_with_fallback(path):
    """Read a CSV as UTF-8, retrying as Latin-1 when the bytes are not valid UTF-8.

    Malformed rows are skipped. Returns ``(DataFrame, encoding_used)``.
    """
    try:
        return pd.read_csv(path, encoding='utf-8', on_bad_lines='skip'), 'utf-8'
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry cannot fail on decoding.
        return pd.read_csv(path, encoding='latin-1', on_bad_lines='skip'), 'latin-1'


raw_dataframes = []
file_info = []

for i, file_path in enumerate(csv_files, 1):
    print(f"\n{'='*70}")
    print(f"File {i}: {file_path.name}")
    print(f"{'='*70}")
    if "nazario" in file_path.name.lower():
        df = load_nazario_fixed(file_path)
        # load_nazario_fixed always reads the file as UTF-8; set this explicitly so the
        # report never shows a stale value from the previous file (or hits a NameError).
        encoding = 'utf-8'
    else:
        df, encoding = _read_csv_with_fallback(file_path)

    print(f"Encoding: {encoding}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst 2 rows:")
    print(df.head(2))

    # Check for label column (lower-case spelling wins when both exist)
    label_col = next((c for c in ('label', 'Label') if c in df.columns), None)
    if label_col is not None:
        print(f"\nLabel distribution:")
        print(df[label_col].value_counts().sort_index())

    raw_dataframes.append(df)
    file_info.append({
        'filename': file_path.name,
        'rows': len(df),
        'columns': len(df.columns),
        'encoding': encoding
    })

print(f"\n{'='*70}")
print("Summary of all files:")
print(f"{'='*70}")
summary_df = pd.DataFrame(file_info)
print(summary_df)
print(f"\nTotal rows across all files: {summary_df['rows'].sum():,}")



File 1: CEAS_08.csv
Encoding: utf-8
Shape: (39154, 7)
Columns: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls']

First 2 rows:
                             sender                       receiver  \
0  Young Esposito <Young@iworld.de>    user4@gvc.ceas-challenge.cc   
1      Mok <ipline's1983@icable.ph>  user2.2@gvc.ceas-challenge.cc   

                              date                    subject  \
0  Tue, 05 Aug 2008 16:31:02 -0700  Never agree to be a loser   
1  Tue, 05 Aug 2008 18:31:03 -0500     Befriend Jenna Jameson   

                                                                                                  body  \
0  Buck up, your troubles caused by small dimension will soon be over!\nBecome a lover no woman wil...   
1               \nUpgrade your sex and pleasures with these techniques http://www.brightmade.com\n\n\n   

   label  urls  
0      1     1  
1      1     1  

Label distribution:
0    17312
1    21842
Name: label, dtype: int64

File 2:

In [None]:
## Standardize Each File

In [None]:
def _find_column(df, name):
    """Return `name` or its capitalized form, whichever is a column of `df` (else None)."""
    for candidate in (name, name.capitalize()):
        if candidate in df.columns:
            return candidate
    return None


def standardize_dataframe(df, file_name):
    """Project a raw email frame onto the common ``subject``/``body``/``label`` schema.

    Each target column is copied from the lower-case column of the same name,
    or from its capitalized variant (``Subject``/``Body``/``Label``). A missing
    subject or body is reported and filled with ``''`` for every row; a missing
    label is an error.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from one source file.
    file_name : str
        Only used in the printed notice and the error message.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``subject``, ``body``, ``label`` and the index of `df`.

    Raises
    ------
    ValueError
        If `df` has neither a ``label`` nor a ``Label`` column.
    """
    # Start from the source index: on an index-less frame a scalar default creates a
    # zero-row column that turns into NaN as soon as the next Series is assigned.
    standardized = pd.DataFrame(index=df.index)

    # Extract subject and body (optional columns)
    for target in ('subject', 'body'):
        source = _find_column(df, target)
        if source is not None:
            standardized[target] = df[source]
        else:
            standardized[target] = ''
            print(f"{file_name}: No {target} column")

    # Extract label (mandatory column)
    label_source = _find_column(df, 'label')
    if label_source is None:
        raise ValueError(f"No label column found in {file_name}")
    standardized['label'] = df[label_source]

    return standardized

In [None]:
standardized_dfs = []

# Loop names are deliberately distinct from `file_info`: using that name as the loop
# target would overwrite the list itself, and re-running this cell would then fail.
for raw_df, info in zip(raw_dataframes, file_info):
    file_name = info['filename']
    print(f"{file_name}")

    try:
        std_df = standardize_dataframe(raw_df, file_name)
        print(f"Standardized: {len(std_df)} rows, {len(std_df.columns)} columns")
        standardized_dfs.append(std_df)
    except Exception as e:
        # Best effort: report the file that could not be standardized and carry on.
        print(f"Error: {e}")

print(f"\n Successfully standardized {len(standardized_dfs)}/{len(raw_dataframes)} files")

CEAS_08.csv
Standardized: 39154 rows, 3 columns
Enron.csv
Standardized: 29767 rows, 3 columns
Ling.csv
Standardized: 2859 rows, 3 columns
Nazario.csv
Standardized: 1565 rows, 3 columns
Nigerian_Fraud.csv
Standardized: 3332 rows, 3 columns
SpamAssasin.csv
Standardized: 5809 rows, 3 columns

 Successfully standardized 6/6 files


In [None]:
# Stack the per-file frames row-wise and renumber the index from 0
merged_df = pd.concat(standardized_dfs, axis=0, ignore_index=True)

print("Merged dataset:")
print(f"  Shape: {merged_df.shape}")
print(f"  Columns: {list(merged_df.columns)}")


Merged dataset:
  Shape: (82486, 3)
  Columns: ['subject', 'body', 'label']


In [None]:
# Persist the merged (still uncleaned) table next to the other preprocessing outputs
merged_raw_file = PREPROCESSED_DATA_PATH / 'merged_raw.csv'
merged_df.to_csv(merged_raw_file, index=False)
print(f" Saved: {merged_raw_file}")

 Saved: data\preprocessing\merged_raw.csv


In [None]:
## Data Cleaning

In [None]:
# Clean a copy so `merged_df` keeps the untouched merged data
df = merged_df.copy(deep=True)
initial_count = df.shape[0]
print(f"Before cleaning: {initial_count:,} rows\n")

Before cleaning: 82,486 rows



In [None]:
# Handle missing values

In [None]:
# A row without a label cannot be used, so count such rows and drop them
missing_labels = df['label'].isnull().sum()
print(f"Rows with missing labels: {missing_labels}")
if missing_labels > 0:
    df = df.dropna(subset=['label'])
    print(f"Removed {missing_labels} rows")

# A missing subject or body is treated as empty text
for text_col in ('subject', 'body'):
    df[text_col] = df[text_col].fillna('')
print("Filled missing subject/body with empty strings")

print(f"\nRows remaining: {len(df):,}")

Rows with missing labels: 0
Filled missing subject/body with empty strings

Rows remaining: 82,486


In [None]:
# Text columns become plain Python strings
for text_col in ('subject', 'body'):
    df[text_col] = df[text_col].astype(str)

# Labels become integers; keep the sorted distinct values for inspection
df['label'] = df['label'].astype(int)
unique_labels = sorted(df['label'].unique())

# Anything other than 0 or 1 is an invalid label
valid_label = df['label'].isin([0, 1])
invalid = ~valid_label
if invalid.sum() > 0:
    print(f"Found {invalid.sum()} rows with invalid labels")
    print(f"Invalid values: {df.loc[invalid, 'label'].unique()}")
    df = df[valid_label]
    print(f"Removed {invalid.sum()} rows with invalid labels")
else:
    print("All labels are valid (0 or 1)")

print(f"\nRows remaining: {len(df):,}")

All labels are valid (0 or 1)

Rows remaining: 82,486


In [None]:
# Text Cleaning

In [None]:
def clean_text(text):
    """Normalize one raw email field (subject or body) to plain, single-spaced text.

    Steps, in order: decode HTML entities, strip HTML tags, apply Unicode NFKC
    normalization, blank out repeated junk patterns, squeeze repeated
    punctuation and repeated word characters, then collapse whitespace.

    Junk runs are replaced by a single space rather than deleted, so the tokens
    on either side stay separate ("foo=====bar" -> "foo bar",
    "Name      Price" -> "Name Price").

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a non-empty ``str`` yields ``''``.

    Returns
    -------
    str
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Non-strings (None, NaN, numbers) and the empty string all clean to ''.
    if not isinstance(text, str) or not text:
        return ''

    # 1. Decode HTML entities (&lt; &gt; &amp; etc.)
    text = html.unescape(text)

    # 2. Remove HTML tags, separating the text fragments with a space
    text = BeautifulSoup(text, "html.parser").get_text(" ")

    # 3. Normalize Unicode characters (compatibility forms -> canonical ones)
    text = unicodedata.normalize("NFKC", text)

    # 4. Blank out any 1-3 character unit repeated 6+ times in a row,
    #    e.g. >t=t=t=t=t=t= or +=+=+=+=+=+=.
    #    NOTE(review): the unit may be any character, so this also swallows a single
    #    letter, digit or space repeated 6+ times (the zeros of "1000000") before
    #    step 7 can shorten it - confirm that is intended.
    text = re.sub(r'(.{1,3})\1{5,}', ' ', text)

    # 5. Blank out a separator character repeated 3+ times: ===, ---, ___, +++
    text = re.sub(r'([=\-_+*#~|\\/<>])\1{2,}', ' ', text)

    # 6. Squeeze repeated punctuation: ?? -> ?, !! -> !, .... -> ...
    text = re.sub(r'\?{2,}', '?', text)
    text = re.sub(r'!{2,}', '!', text)
    text = re.sub(r'\.{3,}', '...', text)

    # 7. Cap a word character repeated 4+ times at three occurrences
    text = re.sub(r'(\w)\1{3,}', r'\1\1\1', text)

    # 8. Collapse whitespace and trim
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean both text columns in place
for text_col in ('subject', 'body'):
    df[text_col] = df[text_col].apply(clean_text)
print("Text cleaning is done")

# Subject and body joined by one space, trimmed; used below to find duplicates
df['text'] = (df['subject'] + ' ' + df['body']).str.strip()
print("Created combined 'text' column to remove duplicates")

print(f"\nRows remaining: {len(df):,}")

Text cleaning is done
Created combined 'text' column to remove duplicates

Rows remaining: 82,486


In [None]:
# Remove empty/invalid rows

In [None]:
# Rows that carry no text at all (subject and body both empty)
empty_both = df['subject'].eq('') & df['body'].eq('')
print(f"Rows with empty subject AND body: {empty_both.sum()}")
if empty_both.sum() > 0:
    df = df.loc[~empty_both]
    print(f"Removed {empty_both.sum()} rows")

# Rows whose combined text is shorter than 5 characters
very_short = df['text'].str.len().lt(5)
print(f"Rows with text less than 5 characters: {very_short.sum()}")
if very_short.sum() > 0:
    df = df.loc[~very_short]
    print(f"Removed {very_short.sum()} rows")

print(f"\nRows remaining: {len(df):,}")

Rows with empty subject AND body: 0
Rows with text less than 5 characters: 1
Removed 1 rows

Rows remaining: 82,485


In [None]:
# Remove Duplicates

In [None]:
# Flag every repeat of a combined text after its first occurrence
duplicates = df.duplicated(subset=['text'], keep='first')
print(f"Duplicate rows: {duplicates.sum()}")

if duplicates.sum() > 0:
    # Preview the first 100 characters of one duplicated text
    print("\nExample duplicates:")
    dup_text = df.loc[duplicates, 'text'].iloc[0][:100]
    print(f"'{dup_text}...'")

    df = df.loc[~duplicates]
    print(f"\nRemoved {duplicates.sum()} duplicate rows")

print(f"\nRows remaining: {len(df):,}")

Duplicate rows: 592

Example duplicates:
'[UAI] Call for Papers: Special Issue M4M Call for Papers for "Methods for Modalities" A Special Issu...'

Removed 592 duplicate rows

Rows remaining: 81,893


In [None]:
## Final Dataset and Validation

In [None]:
# Keep only the modelling columns (the helper `text` column is dropped) and renumber rows
final_df = df.loc[:, ['subject', 'body', 'label']].copy().reset_index(drop=True)

print(f"Final dataset shape: {final_df.shape}")
print(f"Columns: {list(final_df.columns)}")
print("\nFirst 3 rows:")
display(final_df.head(3))

Final dataset shape: (81893, 3)
Columns: ['subject', 'body', 'label']

First 3 rows:


Unnamed: 0,subject,body,label
0,Never agree to be a loser,"Buck up, your troubles caused by small dimension will soon be over! Become a lover no woman will...",1
1,Befriend Jenna Jameson,Upgrade your sex and pleasures with these techniques http://www.brightmade.com,1
2,CNN.com Daily Top 10,"> >THE DAILY TOP 10 >from CNN.com >Top videos and stories as of: Aug 1, 2008 3:58 PM EDT > TOP 1...",1


In [None]:
## Train/Validation/Test Split

In [None]:
# Reproducibly shuffle the rows before splitting
final_df = final_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Inputs (the two text columns) and target
X, y = final_df[['subject', 'body']], final_df['label']

# Stage 1: keep 70% for training, hold out 30%, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

# Stage 2: split the hold-out in half -> 15% validation, 15% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE
)

print("Data Split Summary:")
print("=" * 70)

splits = [
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
]

# Per split: size, share of the full dataset, and class balance
for name, X_split, y_split in splits:
    total = len(y_split)
    phishing = y_split.eq(1).sum()
    legit = y_split.eq(0).sum()
    pct = total / len(final_df) * 100

    print(f"\n{name} set:")
    print(f"Total:     {total:,} ({pct:.1f}%)")
    print(f"Phishing:  {phishing:,} ({phishing/total*100:.2f}%)")
    print(f"Legitimate:{legit:,} ({legit/total*100:.2f}%)")

print("\n" + "=" * 70)

Data Split Summary:

Train set:
Total:     57,325 (70.0%)
Phishing:  29,841 (52.06%)
Legitimate:27,484 (47.94%)

Validation set:
Total:     12,284 (15.0%)
Phishing:  6,395 (52.06%)
Legitimate:5,889 (47.94%)

Test set:
Total:     12,284 (15.0%)
Phishing:  6,394 (52.05%)
Legitimate:5,890 (47.95%)



In [None]:
# Create output directory (parents=True also creates a missing parent folder such as `data/`)
PREPROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Save full dataset
full_clean_file = PREPROCESSED_DATA_PATH / 'phishing_full_clean.csv'
final_df.to_csv(full_clean_file, index=False)
print(f"Saved: {full_clean_file}")

# Re-attach the labels to each feature split; `.values` assigns by position,
# so the split's own row index is irrelevant.
train_df = X_train.assign(label=y_train.values)
val_df = X_val.assign(label=y_val.values)
test_df = X_test.assign(label=y_test.values)

# Save train / validation / test splits
for split_df, split_file in ((train_df, 'train.csv'), (val_df, 'val.csv'), (test_df, 'test.csv')):
    target = PREPROCESSED_DATA_PATH / split_file
    split_df.to_csv(target, index=False)
    print(f"Saved: {target}")

print(f"\n{'='*70}")
print("All files saved successfully!")
print(f"{'='*70}")

Saved: data\preprocessing\phishing_full_clean.csv
Saved: data\preprocessing\train.csv
Saved: data\preprocessing\val.csv
Saved: data\preprocessing\test.csv

All files saved successfully!
