<a href="https://colab.research.google.com/github/dini-5002/Spam_email/blob/main/email_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , confusion_matrix
from sklearn.metrics import classification_report

In [None]:
import nltk
import os
import re
from nltk.corpus import stopwords
from collections import Counter
# Download the NLTK "stopwords" corpus that stopwords.words() reads on the next line.
nltk.download('stopwords')
# Stored as a set because preprocess_text does one membership check per word.
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

def preprocess_text(text, stopwords=None):
    """Normalize raw email text into a single-space-separated string of tokens.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is deleted (so "don't" becomes "dont"), and the
    remaining whitespace-separated words are re-joined with single spaces
    after stopwords have been dropped.

    Args:
        text: The raw text to clean (a ``str``).
        stopwords: Optional collection of lowercase words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used, which keeps
            the behaviour identical for existing one-argument callers.

    Returns:
        The cleaned text; an empty string if no word survives.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Lowercase, then delete everything except letters, digits and whitespace.
    # Punctuation is removed (not replaced by a space), so adjacent fragments merge.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    # str.split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so a separate collapse-and-strip pass is not needed.
    return " ".join(word for word in text.split() if word not in stopwords)

In [None]:

# Root folder that contains the `train-mails` and `test-mails` sub-folders.
# The default is the original Google Drive location; set the EMAIL_DATA_DIR
# environment variable to point the notebook at a different copy of the data
# without editing this cell.
EMAIL_DATA_DIR = os.environ.get('EMAIL_DATA_DIR', '/content/drive/MyDrive/tobeusedtrain_test_mails')

train_email_directory = os.path.join(EMAIL_DATA_DIR, 'train-mails')

test_email_directory = os.path.join(EMAIL_DATA_DIR, 'test-mails')


In [None]:
# Build the training set: one cleaned text and one 0/1 label per .txt file.
train_email_contents = []
train_email_labels = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of train_data identical on every run and on every filesystem.
for filename in sorted(os.listdir(train_email_directory)):
    # Only .txt files are treated as emails; anything else in the folder is skipped.
    if not filename.endswith(".txt"):
        continue

    # Files are read as latin1 text and cleaned with preprocess_text before storing.
    with open(os.path.join(train_email_directory, filename), 'r', encoding='latin1') as file:
        train_email_contents.append(preprocess_text(file.read()))

    # The label comes from the file name: "spm" anywhere in it means spam (1), otherwise 0.
    train_email_labels.append(1 if "spm" in filename.lower() else 0)


train_data = pd.DataFrame({'content': train_email_contents, 'label': train_email_labels})


In [None]:
# Peek at the last rows of the training frame to sanity-check the cleaned text and labels.
train_data.tail()

Unnamed: 0,content,label
697,subject read dream member team earn 6 figure i...,1
698,subject major acquisition company mark industr...,1
699,subject financial freedom sleep dear achiever ...,1
700,subject search credibility dream member team e...,1
701,subject id aa29536 julius ling ohiostate edu t...,0


In [None]:
# Build the test set: one cleaned text and one 0/1 label per .txt file.
test_email_contents = []
test_email_labels = []


# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of test_data identical on every run and on every filesystem.
for filename in sorted(os.listdir(test_email_directory)):
    # Only .txt files are treated as emails; anything else in the folder is skipped.
    if not filename.endswith(".txt"):
        continue

    # Files are read as latin1 text and cleaned with preprocess_text before storing.
    with open(os.path.join(test_email_directory, filename), 'r', encoding='latin1') as file:
        test_email_contents.append(preprocess_text(file.read()))

    # The label comes from the file name: "spm" anywhere in it means spam (1), otherwise 0.
    test_email_labels.append(1 if "spm" in filename.lower() else 0)

test_data = pd.DataFrame({'content': test_email_contents, 'label': test_email_labels})


# Targets are the plain Python label lists built while the emails were loaded.
y_train = train_email_labels
y_test = test_email_labels

# TF-IDF vectorizer: it is fitted on the training texts only, and the fitted
# vectorizer is then applied unchanged to the test texts.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(train_data['content'])
# test_data['content'] holds the same strings as test_email_contents.
X_test_features = feature_extraction.transform(test_data['content'])

In [None]:
# Dimensions of the training TF-IDF matrix produced by fit_transform.
X_train_features.shape

(702, 20159)

In [None]:
# Dimensions of the test TF-IDF matrix produced by the already-fitted vectorizer.
X_test_features.shape

(260, 20159)

In [None]:
# Naive Bayes: multinomial model with library-default settings,
# trained on the TF-IDF features of the training emails.
nvb_model = MultinomialNB()
nvb_model.fit(X=X_train_features, y=y_train)


In [None]:
# Score the Naive Bayes model on the test emails.
# y_test is the same list object as test_email_labels.
y_test_pred = nvb_model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 98.46%


In [None]:
from sklearn.svm import SVC

In [None]:
# SVM: support-vector classifier with library-default settings,
# trained on the TF-IDF features of the training emails.
svm_model = SVC()
svm_model.fit(X=X_train_features, y=y_train)



In [None]:

# Score the SVM on the test emails (overwrites y_test_pred / test_accuracy).
y_test_pred = svm_model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 97.31%


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Random_Forest
# A random forest is stochastic (bootstrap sampling of rows and random feature
# selection at each split). Fixing random_state makes repeated runs on the same
# training matrix build the same forest, so the accuracy reported below is
# reproducible instead of changing from run to run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_features, y_train)

In [None]:
# Score the random forest on the test emails (overwrites y_test_pred / test_accuracy).
y_test_pred = rf_model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 96.92%


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# KNN: k-nearest-neighbours classifier with library-default settings,
# trained on the TF-IDF features of the training emails.
knn_model = KNeighborsClassifier()
knn_model.fit(X=X_train_features, y=y_train)

In [None]:
# Score the KNN model on the test emails (overwrites y_test_pred / test_accuracy).
y_test_pred = knn_model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 96.92%


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression: library-default settings,
# trained on the TF-IDF features of the training emails.
lr_model = LogisticRegression()
lr_model.fit(X=X_train_features, y=y_train)

In [None]:
# Score the logistic-regression model on the test emails (overwrites y_test_pred / test_accuracy).
y_test_pred = lr_model.predict(X_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 97.69%


In [None]:
import pickle

In [None]:
# Persist the fitted Naive Bayes classifier together with the fitted TF-IDF
# vectorizer as one (model, vectorizer) tuple, so both can be restored with a
# single pickle.load. Only unpickle this file if it comes from a trusted source.
saved_artifacts = (nvb_model, feature_extraction)
with open('spam_model_nvb2.pkl', 'wb') as pkl_out:
    pickle.dump(saved_artifacts, pkl_out)