In [8]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [3]:
df = pd.read_csv('emails.csv')

In [4]:
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [5]:
df.shape

(5728, 2)

# Download Stopwords Package

In [6]:
# download the stopwords package
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Shiva
[nltk_data]     Chaitanya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
def process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean
# to show the tokenization
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

## Convert into Matrix of Tokens

In [12]:
samples=df['text'].sample(50)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
message = CountVectorizer(analyzer=process).fit_transform(samples)
message

<50x2791 sparse matrix of type '<class 'numpy.int64'>'
	with 5250 stored elements in Compressed Sparse Row format>

## Train Test Splitting

In [16]:
#split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, df['spam'].sample(50), test_size=0.20, random_state=0)
# To see the shape of the data
print(message.shape)

(50, 2791)


## Creating and Training the Multinomial Naive Bayes Classifier

In [17]:
# create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)

In [18]:
print(classifier.predict(xtrain))
print(ytrain.values)

[0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0]
[0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0]


## Model Evaluation

In [19]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00         6

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Confusion Matrix: 
 [[34  0]
 [ 0  6]]
Accuracy: 
 1.0


In [20]:
#print the predictions
print(classifier.predict(xtest))
#print the actual values
print(ytest.values)

[0 1 0 0 0 0 0 1 1 1]
[0 1 0 0 0 0 0 0 1 0]


In [21]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.50      1.00      0.67         2

    accuracy                           0.80        10
   macro avg       0.75      0.88      0.76        10
weighted avg       0.90      0.80      0.82        10


Confusion Matrix: 
 [[6 2]
 [0 2]]
Accuracy: 
 0.8


The classifier accurately identified the email messages as spam or not spam with 99.2 % accuracy on the test data.

# Deploying the Project

In [23]:
import pickle
with open('emailspam.pkl', 'wb') as file:
    pickle.dump(classifier, file)