# Data Import

In [None]:
import kagglehub

# Fetch the latest version of the phishing-email dataset through kagglehub
# and keep the location it returns for the loading step.
DATASET_HANDLE = "naserabdullahalam/phishing-email-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1


# Data Prep

In [None]:
import pandas as pd

In [None]:
import os

# Build every file name from `path` (the directory returned by
# kagglehub.dataset_download) instead of a hard-coded cache location, so the
# cell keeps working when the dataset version or the cache directory changes.
df_ceas = pd.read_csv(os.path.join(path, 'CEAS_08.csv'))
df_enron = pd.read_csv(os.path.join(path, 'Enron.csv'))
df_ling = pd.read_csv(os.path.join(path, 'Ling.csv'))
df_nazario = pd.read_csv(os.path.join(path, 'Nazario.csv'))
df_nigerian = pd.read_csv(os.path.join(path, 'Nigerian_Fraud.csv'))
df_sa = pd.read_csv(os.path.join(path, 'SpamAssasin.csv'))

# Compiled dataset of all text features merged into a single column;
# the whole table has only two columns: combined text & label.
df_all = pd.read_csv(os.path.join(path, 'phishing_email.csv'))


In [None]:
# Remove rows whose email body is missing from each source frame; the
# combined frame drops any row containing a missing value.
df_ceas, df_enron, df_ling, df_nazario, df_nigerian, df_sa = (
    frame.dropna(subset=['body'])
    for frame in (df_ceas, df_enron, df_ling, df_nazario, df_nigerian, df_sa)
)
df_all = df_all.dropna()

Add a binary `urls` column (1 if the body contains an `http://` or `https://` link, otherwise 0) to the frames that do not already have one

In [None]:
import re

In [None]:
def urls_column(df):
    """Return ``df`` with a binary ``urls`` flag column, adding it when absent.

    The flag is 1 when the email body contains an ``http://`` or ``https://``
    URL and 0 otherwise. The input frame itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to hold the email text in a ``body`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` unchanged when it already has a ``urls`` column, or when it has
        no ``body`` column (an error message is printed in that case).
        Otherwise a new frame with null-``body`` rows removed and the ``urls``
        column appended.
    """
    if 'urls' in df.columns:
        return df

    if 'body' not in df.columns:
        # Best-effort behaviour: report the problem and hand the frame back.
        print("Error: 'body' column not found in the DataFrame.")
        return df

    # Null bodies are dropped so they are not flagged via the string "nan".
    with_body = df.dropna(subset=['body'])
    has_url = with_body['body'].astype(str).str.contains(r'https?://\S+', regex=True)

    # assign() builds a new frame, which avoids the SettingWithCopyWarning that
    # column assignment on the result of dropna() can trigger.
    return with_body.assign(urls=has_url.astype(int))

In [None]:
# Pass every source frame through urls_column and rebind each name to the result.
df_ceas, df_enron, df_ling, df_nazario, df_nigerian, df_sa = [
    urls_column(frame)
    for frame in (df_ceas, df_enron, df_ling, df_nazario, df_nigerian, df_sa)
]

Drop the `sender`, `receiver` and `date` columns from the frames that have them, so the frames are uniform for the aggregation

In [None]:
# Drop the sender/receiver/date metadata from these four frames.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
df_ceas = df_ceas.drop(columns=['sender', 'receiver', 'date'], errors='ignore')
df_nazario = df_nazario.drop(columns=['sender', 'receiver', 'date'], errors='ignore')
df_nigerian = df_nigerian.drop(columns=['sender', 'receiver', 'date'], errors='ignore')
df_sa = df_sa.drop(columns=['sender', 'receiver', 'date'], errors='ignore')

Aggregating the dataframes for training

In [None]:
# Stack the six source frames into one training frame. ignore_index=True
# gives the result a fresh 0..n-1 index; otherwise each frame's own row labels
# are kept and repeat across sources, which makes label-based lookups ambiguous.
df_agg = pd.concat(
    [df_ceas, df_enron, df_ling, df_nazario, df_nigerian, df_sa],
    ignore_index=True,
)
# Last expression of the cell, so the preview is actually displayed.
df_agg.head()

Casting the text columns (`subject`, `body`) to strings so they are compatible with the text vectorizer

In [None]:
# Replace missing text with an empty string before casting. A bare astype(str)
# turns NaN into the literal text "nan", which the TF-IDF step would then
# treat as a real word. Only `body` had its nulls dropped earlier, so
# `subject` can still contain NaN here.
df_agg['subject'] = df_agg['subject'].fillna('').astype(str)
df_agg['body'] = df_agg['body'].fillna('').astype(str)

# Model Training

In [None]:
#importing libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier


In [None]:
from scipy.sparse import hstack

# One TF-IDF vectorizer per text field. Reusing a single instance for both
# fit_transform calls refits it on the body text and discards the subject
# vocabulary, so subjects could no longer be transformed for new emails.
subject_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
body_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_subject = subject_vectorizer.fit_transform(df_agg['subject'])  # Subject text -> TF-IDF
X_body = body_vectorizer.fit_transform(df_agg['body'])  # Body text -> TF-IDF

# Backward compatibility: `vectorizer` previously ended up fitted on the body text.
vectorizer = body_vectorizer

# NOTE(review): both vectorizers are fitted on the full frame before the
# train/test split, so vocabulary and IDF weights are influenced by test rows.
# Consider fitting on the training rows only.

# Single-column dense array holding the `urls` flag; hstack accepts it
# alongside the sparse TF-IDF blocks.
X_urls = np.array(df_agg[['urls']])

# Column-wise stack: subject features, body features, then the url flag.
X = hstack((X_subject, X_body, X_urls))
y = df_agg['label']  # Target labels (0 = legitimate, 1 = phishing)

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Some models are commented out due to limited computing resources.
# To run the remaining models, uncomment the corresponding lines.

# Define models. A fixed random_state is passed to the boosted models so
# repeated runs are pinned to the same seed. SVC is left without one because
# its random_state only matters when probability estimates are enabled.
models = {
    #"Naïve Bayes": MultinomialNB(),
    #"Logistic Regression": LogisticRegression(max_iter=1000,solver='saga', penalty='l2'),
    #"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=6, scale_pos_weight=2, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, learning_rate=0.1, max_depth=6, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', C=1.0, class_weight='balanced'),
    "AdaBoost": AdaBoostClassifier(n_estimators=300, learning_rate=0.1, random_state=42),
}

# Fit each configured classifier on the training split, predict the held-out
# split, and print its accuracy and per-class report.
for name, model in models.items():
    # fit() returns the fitted estimator, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    print(f"\n🔹 Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


🔹 Model: XGBoost
Accuracy: 0.9696914590531612
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97      7898
           1       0.95      0.99      0.97      8599

    accuracy                           0.97     16497
   macro avg       0.97      0.97      0.97     16497
weighted avg       0.97      0.97      0.97     16497





[LightGBM] [Info] Number of positive: 34291, number of negative: 31697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 8.042274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 794941
[LightGBM] [Info] Number of data points in the train set: 65988, number of used features: 7135
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.519655 -> initscore=0.078661
[LightGBM] [Info] Start training from score 0.078661





🔹 Model: LightGBM
Accuracy: 0.9741165060313997
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.97      7898
           1       0.96      0.99      0.98      8599

    accuracy                           0.97     16497
   macro avg       0.98      0.97      0.97     16497
weighted avg       0.97      0.97      0.97     16497


🔹 Model: Support Vector Machine
Accuracy: 0.9850881978541554
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      7898
           1       0.98      0.99      0.99      8599

    accuracy                           0.99     16497
   macro avg       0.99      0.98      0.99     16497
weighted avg       0.99      0.99      0.99     16497


🔹 Model: AdaBoost
Accuracy: 0.7770503727950536
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.57      0.71      7898
       