# Description: This program detects if an email is spam (1) or not (0)

In [5]:
# Import libraries
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Load the e-mail dataset from a CSV file (path is relative to the notebook).
# As the outputs below show, it has a 'text' column with the raw message and a
# 'spam' label column (1 = spam, 0 = not spam).
data1 = pd.read_csv('../dados/emails.csv') # DataFrame

In [6]:
# Preview the first five rows (five is the default for head())
data1.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [8]:
# Shape of the raw DataFrame as (number of rows, number of columns)
data1.shape

(5728, 2)

In [9]:
# List the column names of the DataFrame
data1.columns

Index(['text', 'spam'], dtype='object')

In [10]:
# Remove exact duplicate rows (keeping the first occurrence of each).
# Reassigning the result instead of using inplace=True avoids mutating the
# DataFrame behind the reader's back and keeps the step chainable.
data1 = data1.drop_duplicates()

In [11]:
# Shape after dropping duplicates, as (number of rows, number of columns)
data1.shape

(5695, 2)

In [12]:
# Count the missing values (NaN / None) in each column
data1.isna().sum()

text    0
spam    0
dtype: int64

In [13]:
# Download the NLTK stop-word corpus; process_text reads it through
# stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\carlos
[nltk_data]     daniel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [16]:
def process_text(text):
    """Tokenize a message into words with punctuation and stop words removed.

    Steps:
      1. Drop every character found in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Discard words whose lowercase form is an NLTK English stop word.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.

    Notes
    -----
    Requires the NLTK ``stopwords`` corpus to be available
    (``nltk.download('stopwords')``).
    """
    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the comprehension rebuilt the whole
    # list and scanned it linearly for every single word of the message.
    english_stopwords = set(stopwords.words('english'))

    # 1 remove punctuation (character by character)
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # 2 remove stopwords (the comparison is case-insensitive; kept words are unchanged)
    clear_words = [word for word in nopunc.split() if word.lower() not in english_stopwords]

    # 3 return a list of clean text words
    return clear_words

In [17]:
# Preview the tokenization of the first five messages: each row becomes the
# list of tokens returned by process_text (no lemmatization is applied)
data1['text'].head(5).apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [19]:
# Example: bag-of-words counts for two toy messages
message4 = 'hello word hello hello word play'
message5 = 'test test test test one hello'
print(message4)
print()

# Convert the text to a matrix of token counts.
# Each document is passed as a plain string: process_text filters punctuation
# character by character, so wrapping a message in a list would make it skip
# that step and only work by accident.
bow4 = CountVectorizer(analyzer=process_text).fit_transform([message4, message5])
print(bow4)
print()

print(bow4.shape)

hello word hello hello word play

  (0, 0)	3
  (0, 4)	2
  (0, 2)	1
  (1, 0)	1
  (1, 3)	4
  (1, 1)	1

(2, 5)


In [20]:
# Convert the collection of e-mail texts to a sparse matrix of token counts.
# The fitted vectorizer is kept so that new messages can later be encoded with
# vectorizer.transform(...) using the same vocabulary.
# NOTE(review): the vocabulary is learned from all rows, before the train/test
# split below, so words that only occur in test messages leak into the feature
# space; fit on the training texts only if a strict hold-out is required.
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(data1['text'])

In [21]:
# Split the data into 80% training and 20% testing.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    messages_bow, data1['spam'], test_size=0.20, random_state=0
)

In [22]:
# Shape of the bag-of-words matrix: (number of messages, number of distinct tokens)
messages_bow.shape

(5695, 37229)

In [23]:
# Create the Multinomial Naive Bayes classifier and train it on the
# token-count features of the training split
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)

In [24]:
# Print the labels the classifier predicts for the training samples
print(classifier.predict(X_train))

# Print the actual labels of the training samples, for a visual comparison
print(Y_train.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [25]:
# Evaluate the model on the training data set.
# These scores are optimistic because the model was fitted on these samples;
# the test-set evaluation is the better measure of generalization.
pred = classifier.predict(X_train)
print(classification_report(Y_train, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(Y_train, pred))
print()
print('Accuracy: ', accuracy_score(Y_train, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3445   12]
 [   1 1098]]

Accuracy:  0.9971466198419666


In [26]:
# Print the labels the classifier predicts for the held-out test samples
print(classifier.predict(X_test))

# Print the actual labels of the test samples, for a visual comparison
print(Y_test.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [27]:
# Evaluate the model on the test data set (samples not used for fitting the classifier)
pred = classifier.predict(X_test)
print(classification_report(Y_test, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(Y_test, pred))
print()
print('Accuracy: ', accuracy_score(Y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[862   8]
 [  1 268]]

Accuracy:  0.9920983318700615
