## Importing all the necessary libraries.

In [29]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fetch NLTK's stop-word corpus, which stopwords.words('english') reads during text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\funde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the dataset.

In [30]:
# Load the labelled emails into a DataFrame and preview the first rows.
emails_path = "data/emails.csv"
df = pd.read_csv(emails_path)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [31]:
# (rows, columns) of the loaded DataFrame.
df.shape

(5728, 2)

## Data cleaning.

### Checking for null values.

In [32]:
# Count missing values per column. Both counts come out as zero,
# so there are no rows to drop or impute.
null_counts = df.isnull().sum()
print(null_counts)

text    0
spam    0
dtype: int64


### Text cleaning: strip punctuation and stop words, returning a list of tokens.

In [40]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-built cache of NLTK's English stop words (filled on first use).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def process(uncleanText, stop_words=None):
    """Tokenize raw text: drop punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    uncleanText : str
        The raw text to tokenize.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
        The stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        # Calling stopwords.words('english') per token is very slow, and
        # searching the returned list is linear; a cached set avoids both.
        stop_words = _english_stopwords()
    noPunctuation = uncleanText.translate(_PUNCTUATION_TABLE)
    return [word for word in noPunctuation.split() if word.lower() not in stop_words]
# Sanity check: tokenize only the first five emails and display the result.
sample_texts = df['text'].head()
sample_texts.apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

### Vectorize the emails: store the bag-of-words count matrix in 'message'.

In [41]:
# Build the bag-of-words count matrix, tokenizing each email with process().
# The fitted vectorizer is kept (rather than discarded) so that unseen emails
# can later be encoded with vectorizer.transform(...) using the same vocabulary.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split, so test-set words shape the feature space; consider
# fitting on the training texts only.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['text'])

## Train-test split, in 80 : 20 ratio.

In [42]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    message,
    df['spam'],
    test_size=0.20,
    random_state=0,
)

## Train a multinomial Naive Bayes classifier on the training data.

In [43]:
# Fit multinomial Naive Bayes on the training counts and labels.
classifier = MultinomialNB()
classifier.fit(X_train, y_train);  # trailing ';' keeps the estimator repr out of the cell output

In [44]:
# Score the model on the same data it was fitted on.
y_pred = classifier.predict(X_train)

train_report = classification_report(y_train, y_pred)
train_confusion = confusion_matrix(y_train, y_pred)
train_accuracy = accuracy_score(y_train, y_pred)

print(train_report)
print()
print("Confusion Matrix: \n", train_confusion)
print("Accuracy: \n", train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3475
           1       0.99      1.00      1.00      1107

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582


Confusion Matrix: 
 [[3466    9]
 [   2 1105]]
Accuracy: 
 0.9975993016150153


## Test the classifier using testing data.

### Accuracy: 98.69%

In [45]:
# Score the model on the held-out test split.
y_pred = classifier.predict(X_test)

test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print(test_report)
print()
print("Confusion Matrix: \n", test_confusion)
print("Accuracy: \n", test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       885
           1       0.95      0.99      0.97       261

    accuracy                           0.99      1146
   macro avg       0.97      0.99      0.98      1146
weighted avg       0.99      0.99      0.99      1146


Confusion Matrix: 
 [[872  13]
 [  2 259]]
Accuracy: 
 0.9869109947643979
