# Data Loading

In [None]:
import kagglehub

# Fetch the latest version of the Kaggle phishing-email dataset. The returned
# value is the local path to the downloaded files, reused by later cells.
path = kagglehub.dataset_download(
    "naserabdullahalam/phishing-email-dataset"
)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'phishing-email-dataset' dataset.
Path to dataset files: /kaggle/input/phishing-email-dataset


# Data Cleaning

In [None]:
import pandas as pd
import os

# Read CEAS_08.csv from the downloaded dataset directory, then preview the
# first rows to confirm the columns parsed as expected.
ceas_csv_path = os.path.join(path, 'CEAS_08.csv')
df = pd.read_csv(ceas_csv_path)
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


In [None]:
# Count rows per value of the target column to check class balance.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,21842
0,17312


In [None]:
# List the column names currently present in the frame.
df.columns

Index(['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls'], dtype='object')

In [None]:
# Drop the 'receiver' and 'date' columns. The result is rebound to `df`
# instead of mutating in place, and errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError once the columns are gone.
df = df.drop(columns=['receiver', 'date'], errors='ignore')

In [None]:
# Preview the first rows again to inspect the remaining columns.
df.head()

Unnamed: 0,sender,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [None]:
# Separate the candidate input columns from the target label.
# NOTE(review): no later cell visible in this notebook reads X or y; the
# model is trained on TF-IDF features of the cleaned text instead.
X = df.loc[:, ['sender', 'subject', 'body', 'urls']]
y = df.loc[:, 'label']

# Feature Engineering / Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by clean_text: tokenizer data, the
# stopword lists, and WordNet for the lemmatizer.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Built once here so clean_text can reuse them on every call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw email field into a space-joined string of lemmas.

    Steps, in order: lowercase, strip URLs, drop every character that is not
    an ASCII letter or whitespace, tokenize, remove English stopwords, and
    lemmatize each remaining token.

    Args:
        text: Raw subject/body value. Non-string values are coerced with
            ``str``; missing values (None / NaN) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or "" when the
        input is missing.
    """
    # Missing cells arrive as NaN; str(NaN) would otherwise inject a spurious
    # "nan" token into every email that has no subject/body.
    if text is None or pd.isna(text):
        return ""

    text = str(text).lower()                               # lowercase
    # The dot is escaped so only a literal "www." prefix counts as a URL;
    # unescaped, it also swallowed ordinary words that merely start with "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)           # remove urls
    text = re.sub(r'[^a-zA-Z\s]', '', text)                # remove numbers/punctuations
    tokens = word_tokenize(text)                           # tokenize
    tokens = [w for w in tokens if w not in stop_words]    # remove stopwords
    tokens = [lemmatizer.lemmatize(w) for w in tokens]     # lemmatize
    return " ".join(tokens)

# Build a cleaned counterpart for each free-text column, then attach them to
# the frame (body first, then subject, matching the original column order).
cleaned_columns = {
    'clean_body': df['body'].apply(clean_text),
    'clean_subject': df['subject'].apply(clean_text),
}
for column_name, cleaned_values in cleaned_columns.items():
    df[column_name] = cleaned_values

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# One document per email: cleaned subject followed by cleaned body.
email_text = df['clean_subject'] + " " + df['clean_body']

# Split the documents BEFORE vectorizing so the held-out emails never
# influence the learned vocabulary or IDF weights (train/test leakage).
text_train, text_test, y_train, y_test = train_test_split(
    email_text, df['label'], test_size=0.2, random_state=42
)

# Fit TF-IDF on the training documents only, then reuse that fitted
# vocabulary to encode the test documents.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training features.
# NOTE(review): max_iter=1000 caps the solver's iterations; presumably chosen
# to avoid convergence warnings — confirm.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Testing

In [None]:
# Score the fitted model on both splits; a large gap between the two numbers
# would point to overfitting.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.9954027392012259
Test Accuracy: 0.9943813050695952


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out split and print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3490
           1       0.99      1.00      0.99      4341

    accuracy                           0.99      7831
   macro avg       0.99      0.99      0.99      7831
weighted avg       0.99      0.99      0.99      7831



In [None]:
# Example emails
emails = [
    "Limited Time Offer – 70% Discount Just for You! Congratulations! You have been selected to receive an exclusive 70% discount on our premium skincare products. This offer is valid for the next 24 hours only. Don’t miss out on glowing skin at the lowest price ever! Click here to claim your discount: http://best-skincare-deals.com/promo Hurry, stocks are running out fast!",
    "Subject: Action Required – Update Your Preferences\n\nDear user, we noticed unusual activity on your account. To continue enjoying our services without interruption, please review and update your account preferences at your convenience. Visit: http://example-services.com/update-preferences\n\nThank you for your attention."
]

# Apply the same clean_text preprocessing the training documents received.
# The vectorizer was fitted on cleaned text, so raw input (URLs, digits,
# unlemmatized words) would be encoded differently from the training data.
cleaned_emails = [clean_text(email) for email in emails]

# Convert using SAME vectorizer used before
X_new = vectorizer.transform(cleaned_emails)

# Predict
predictions = model.predict(X_new)

# Show a short preview of each original (uncleaned) email with its verdict.
for email, label in zip(emails, predictions):
    print("Email:", email[:60] + "...")
    print("Prediction:", "Phishing 🚨" if label == 1 else "Safe ✅")
    print("------")

Email: Limited Time Offer – 70% Discount Just for You! Congratulati...
Prediction: Phishing 🚨
------
Email: Subject: Action Required – Update Your Preferences

Dear use...
Prediction: Safe ✅
------


# Saving the Model

In [None]:
import joblib

# Save the trained model
joblib.dump(model, "phishing_model.pkl")

# Save the fitted vectorizer; new emails must be encoded with it (after the
# same clean_text preprocessing) before being passed to the model.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']