<a href="https://colab.research.google.com/github/guilhermelaviola/EmailSpamDetector/blob/main/EmailSpamDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Importing all the necessary libraries:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Importing the .csv dataset. The relative path means 'emails.csv' must be in
# the notebook's current working directory:
df = pd.read_csv('emails.csv')
# Previewing the first 10 rows:
df.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [3]:
# Displaying the dataset dimensions as a (rows, columns) tuple:
df.shape

(5728, 2)

In [4]:
# Displaying the dataset column labels:
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
# Removing fully duplicated rows. The result is re-assigned rather than using
# inplace = True, so the frame object shown by earlier cells is never mutated
# behind the reader's back:
df = df.drop_duplicates()
# Printing the (rows, columns) shape that remains after de-duplication:
print(df.shape)

(5695, 2)


In [6]:
# Counting the missing (null) values found in each column:
null_counts = df.isnull().sum()
print(null_counts)

text    0
spam    0
dtype: int64


In [7]:
# Downloading the NLTK stopwords corpus, which the text-cleaning step reads
# through nltk.corpus.stopwords:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Translation table that deletes every character in string.punctuation:
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Lazily-filled cache so the NLTK stopword list is loaded only once instead of
# once per word:
_STOPWORD_CACHE = {}

def process(text, stop_words = None):
  """Clean a text and return its tokens.

  The cleaning removes every punctuation character listed in
  string.punctuation, splits the remaining text on whitespace and drops the
  words whose lowercase form is a stopword. Kept tokens retain their
  original casing.

  Args:
    text: The string to tokenize.
    stop_words: Optional container of lowercase words to discard (anything
      supporting the `in` operator; a set is fastest). When None, the NLTK
      English stopword list is used.

  Returns:
    A list with the remaining tokens, in their original order.
  """
  if stop_words is None:
    # Load the corpus on first use only; a frozenset gives constant-time
    # membership tests in the filter below.
    if 'english' not in _STOPWORD_CACHE:
      _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOPWORD_CACHE['english']

  no_punctuation = text.translate(_PUNCTUATION_TABLE)
  return [word for word in no_punctuation.split() if word.lower() not in stop_words]
# Previewing the tokenization on the first five emails:
sample_emails = df['text'].head()
sample_emails.apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [9]:
# Converting the text into a matrix of token counts. The fitted vectorizer is
# kept in its own variable so the learned vocabulary can be reused later to
# encode new emails with vectorizer.transform(...).
# NOTE(review): the vocabulary is fitted on the whole dataset, i.e. before the
# train/test split, so words that only occur in test emails still shape the
# feature space - consider fitting on the training split only.
vectorizer = CountVectorizer(analyzer = process)
message = vectorizer.fit_transform(df['text'])

In [11]:
# Holding out 20% of the emails for testing and keeping 80% for training;
# the fixed random_state makes the split reproducible:
labels = df['spam']
x_train, x_test, y_train, y_test = train_test_split(
    message,
    labels,
    test_size = 0.20,
    random_state = 0,
)

# Printing the shape of the full token-count matrix:
print(message.shape)

(5695, 37229)


In [13]:
# Building a Multinomial Naive Bayes classifier and fitting it on the
# training split (the trailing ';' keeps the cell output empty):
classifier = MultinomialNB()
classifier.fit(x_train, y_train);

In [14]:
# Comparing the classifier's predictions on the training split with the
# actual labels:
train_predictions = classifier.predict(x_train)
train_labels = y_train.values
print(train_predictions)
print(train_labels)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [16]:
# Evaluating the Naive Bayes classifier on the training split: classification
# report, confusion matrix and accuracy score.
prediction = classifier.predict(x_train)
train_report = classification_report(y_train, prediction)
train_confusion = confusion_matrix(y_train, prediction)
train_accuracy = accuracy_score(y_train, prediction)

print(train_report)
print()
print('Confusion Matrix: \n', train_confusion)
print('Accuracy: \n', train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3445   12]
 [   1 1098]]
Accuracy: 
 0.9971466198419666


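The model reaches **99.71%** accuracy on the training data it was fitted on, so this figure does not yet measure how well it handles unseen emails.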


In [17]:
# Testing the model on the test dataset by comparing its predictions with
# the actual labels:
test_predictions = classifier.predict(x_test)
test_labels = y_test.values

# Predicted labels first, then the actual values:
print(test_predictions)
print(test_labels)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [19]:
# Evaluating the model on the held-out test split: classification report,
# confusion matrix and accuracy score.
prediction = classifier.predict(x_test)
test_report = classification_report(y_test, prediction)
test_confusion = confusion_matrix(y_test, prediction)
test_accuracy = accuracy_score(y_test, prediction)

print(test_report)
print()
print('Confusion Matrix: \n', test_confusion)
print('Accuracy: \n', test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[862   8]
 [  1 268]]
Accuracy: 
 0.9920983318700615


On the held-out test set, the classifier labelled emails as spam or not spam with an accuracy of **99.2%**!