<a href="https://colab.research.google.com/github/ds-upin/Email-Classifier/blob/main/Mail_Classification(naive_bayes).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# The training data needs to be diverse across many domains; roughly 50k mails might be enough.

In [4]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the two Kaggle spam/ham CSV files as pandas DataFrames.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "venky73/spam-mails-dataset",
    "spam_ham_dataset.csv"
)
df2 = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "meruvulikith/190k-spam-ham-email-dataset-for-classification",
    "spam_Emails_data.csv"
)

# Keep only the two columns used downstream. `.copy()` makes each selection an
# independent frame, so the label assignment that follows does not write into
# a slice of the loaded frame (the pattern behind SettingWithCopyWarning).
df = df[['label', 'text']].copy()
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df2 = df2[['label', 'text']].copy()
# The second file spells its labels with a capital first letter.
df2['label'] = df2['label'].map({'Ham': 0, 'Spam': 1})

# Stack both corpora into one frame with a fresh 0..n-1 index.
df = pd.concat([df, df2], ignore_index=True)

# `.map` turns any value missing from the dictionaries above into NaN; fail
# here with a clear message instead of later, inside the split or the fit.
if df['label'].isna().any():
    raise ValueError(
        "Some rows have a label that could not be mapped to 0 (ham) or 1 (spam)."
    )

Using Colab cache for faster access to the 'spam-mails-dataset' dataset.
Using Colab cache for faster access to the '190k-spam-ham-email-dataset-for-classification' dataset.


In [6]:
def clean_text(text):
    """Normalise one raw e-mail body before it is vectorised.

    The text is lower-cased, every whitespace-delimited token that starts
    with "http" is replaced by " <URL> ", and every run of digits is
    replaced by " <NUM> ".

    Args:
        text: The message body. Non-string values are converted with
            ``str``. ``None`` and float NaN (a missing cell) are treated as
            an empty message.

    Returns:
        str: The normalised text ('' for a missing message).
    """
    # A missing cell would otherwise be stringified to the literal word
    # "none" / "nan" and be treated as message content.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # URLs are replaced before digits so that numbers inside a link are
    # swallowed by the <URL> placeholder rather than becoming <NUM>.
    text = re.sub(r'http\S+', ' <URL> ', text)
    text = re.sub(r'\d+', ' <NUM> ', text)
    return text

# Run every message body through clean_text, overwriting the raw column.
df['text'] = df['text'].apply(clean_text)

# Features are the cleaned message bodies; targets are the numeric labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:

# TF-IDF over unigrams and bigrams, capped at 5000 features. Terms seen in
# fewer than 3 documents or in more than 90% of them are dropped, as are
# English stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    max_features=5000,
)

# Vectoriser and classifier are chained so that fit/predict take raw strings.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('clf', MultinomialNB()),
])
pipeline.fit(X_train, y_train)

# Score the held-out split.
y_pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report_text)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9400326592136666

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.96      0.94     21167
           1       0.95      0.92      0.94     18638

    accuracy                           0.94     39805
   macro avg       0.94      0.94      0.94     39805
weighted avg       0.94      0.94      0.94     39805

Confusion Matrix:
 [[20244   923]
 [ 1464 17174]]


In [11]:
# Two hand-written sample mails used as a quick smoke test of the fitted model.
spam_random_mail = [ """
Earn a gift card by providing feedback on our React and Next.js SDKs which we're looking to enhance next year.
We also highlight our n8n integration, a couple of new MediaFlows use cases, and our new HDR video support.
Happy Holidays and we look forward to connecting in 2026!
""",

"""Dear Customer,

Congratulations! 🎉
You have been selected as the winner of a $5,000 cash reward in our international promotion.

To receive your prize, you must verify your account immediately.
Failure to act within 24 hours will result in cancellation of your reward.

👉 Click here to claim your prize now:
http://secure-verification-reward.com/claim

This is a limited-time offer. Act now to avoid losing your money.

Best regards,
Customer Rewards Team"""]

# The pipeline was fitted on text that had been passed through clean_text
# (URLs -> <URL>, digit runs -> <NUM>), so new mails must get the same
# treatment; otherwise links and numbers are tokenised differently than
# during training.
cleaned_mails = [clean_text(mail) for mail in spam_random_mail]
predictions = pipeline.predict(cleaned_mails)
print(['Spam' if i==1 else 'Ham' for i in predictions])


['Spam', 'Spam']


In [9]:
import joblib

# Persist the fitted vectoriser + classifier pipeline to a single file.
# joblib.dump stays the last expression so the cell still displays its
# return value.
MODEL_PATH = 'spam_classifier.joblib'
joblib.dump(pipeline, MODEL_PATH)


['spam_classifier.joblib']