In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv


In [2]:
# Load the spam/ham e-mail CSV into a DataFrame named `emails`.
# The bare last expression renders the frame as the cell output.
emails = pd.read_csv(
    "/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv"
)
emails

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [3]:
# Count the missing values in each column (all zeros means nothing needs imputing).
emails.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [4]:
# Remove the columns the model does not need: "Unnamed: 0" (a leftover numeric id column)
# and "label_num" (in the preview above it is 0 for ham and 1 for spam, i.e. a copy of `label`).
emails = emails.drop(columns=["Unnamed: 0", "label_num"])
emails

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...


In [5]:
# Encode the string label as integer category codes.
# The displayed result shows the mapping ham -> 0, spam -> 1.
emails["label"] = emails["label"].astype('category').cat.codes
emails.head()

Unnamed: 0,label,text
0,0,Subject: enron methanol ; meter # : 988291\r\n...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,1,"Subject: photoshop , windows , office . cheap ..."
4,0,Subject: re : indian springs\r\nthis deal is t...


In [6]:
# X holds the raw e-mail text (each message starts with its "Subject:" line);
# y holds the encoded spam/ham label.
X = emails["text"]
y = emails["label"]
print(X, y, sep="\n")

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label, Length: 5171, dtype: int8


In [7]:
# Hold out part of the data for evaluation. No test_size is given, so scikit-learn's
# default split fraction applies; random_state=1 pins the shuffle so the split is reproducible.
# train_test_split is imported in the first cell together with the other scikit-learn imports.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Peek at the first five training documents (the original row labels show the shuffle).
X_train.iloc[:5]

1       Subject: hpl nom for january 9 , 2001\r\n( see...
4984    Subject: paliourg less expensive charset = iso...
3276    Subject: soldout viagra for only 0 . 78 $ per ...
2727    Subject: buyback deals - - january 2000\r\natt...
4227    Subject: accounting arrangement at meter 692 -...
Name: text, dtype: object

In [9]:
# Build a bag-of-words encoder: CountVectorizer turns a collection of text documents into
# a matrix of token counts, and stop_words='english' drops common English stop words.
# The vocabulary is learned from the training split only (fit returns the fitted vectorizer).
vect = CountVectorizer(stop_words='english').fit(X_train)
# Show the stop-word set the vectorizer is using.
vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [10]:
# Encode both splits as sparse token-count matrices using the already-fitted vectorizer,
# then print the test matrix as (row, vocabulary index) -> count entries.
X_train_transformed, X_test_transformed = (
    vect.transform(split) for split in (X_train, X_test)
)
print(X_test_transformed)

  (0, 8353)	1
  (0, 8877)	1
  (0, 8921)	1
  (0, 8925)	1
  (0, 9062)	1
  (0, 10108)	1
  (0, 11276)	1
  (0, 12129)	1
  (0, 12821)	1
  (0, 12878)	1
  (0, 12973)	1
  (0, 12982)	1
  (0, 13158)	1
  (0, 13704)	1
  (0, 13854)	1
  (0, 14413)	1
  (0, 14453)	1
  (0, 15865)	1
  (0, 16684)	1
  (0, 16981)	1
  (0, 17467)	1
  (0, 17604)	1
  (0, 18025)	1
  (0, 18038)	1
  (0, 18210)	1
  :	:
  (1291, 40778)	1
  (1291, 41366)	1
  (1292, 1)	1
  (1292, 50)	1
  (1292, 111)	1
  (1292, 981)	2
  (1292, 1448)	1
  (1292, 2246)	1
  (1292, 5936)	2
  (1292, 12973)	2
  (1292, 15294)	1
  (1292, 15815)	1
  (1292, 17568)	2
  (1292, 18272)	3
  (1292, 25995)	2
  (1292, 26422)	2
  (1292, 26667)	1
  (1292, 26851)	1
  (1292, 27154)	3
  (1292, 27653)	2
  (1292, 27655)	1
  (1292, 30023)	2
  (1292, 30210)	1
  (1292, 32599)	2
  (1292, 36329)	1


In [11]:
# Create, train and evaluate several Naive Bayes models on the count-vectorized e-mails.
bernoulliclassifier = BernoulliNB()
# NOTE(review): this GaussianNB instance is created but never fitted or evaluated in this cell.
# GaussianNB presumably needs a dense array rather than the sparse CountVectorizer output
# (e.g. via .toarray()) - confirm before adding it to the comparison.
gaussianclassifier = GaussianNB()
multinomialclassifier = MultinomialNB()

# Fit each model on the training split.
bernoulliclassifier.fit(X_train_transformed, y_train)
multinomialclassifier.fit(X_train_transformed, y_train)

# Predict the held-out test split with each fitted model.
y_prediction_classificationbernoulli = bernoulliclassifier.predict(X_test_transformed)
y_prediction_classificationmultinomial = multinomialclassifier.predict(X_test_transformed)

# Printed heading -> test-set predictions, in the order the results are reported.
predictions_by_model = {
    "Bernoulli: ": y_prediction_classificationbernoulli,
    "Multinomial: ": y_prediction_classificationmultinomial,
}

# Report every confusion matrix first, then every accuracy score, each followed by a blank line.
for metric in (metrics.confusion_matrix, metrics.accuracy_score):
    for model_label, y_predicted in predictions_by_model.items():
        print(model_label)
        print(metric(y_test, y_predicted))
        print()

# The multinomial Naive Bayes model is much better suited to this data: its accuracy is far
# higher (about 98 % versus about 83 % for Bernoulli), so it separates spam from ham reliably.

Bernoulli: 
[[899  15]
 [200 179]]

Multinomial: 
[[903  11]
 [ 12 367]]

Bernoulli: 
0.8337200309358082

Multinomial: 
0.9822119102861562

