In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)

In [4]:
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,"On Thu, 08-Aug-2002 at 11:31:01 +0100, John P....",0,00319.9fdb50d80e1f34e30b93dd401c644f3d
5792,"On Sat, 3 Aug 2002, kevin lyda wrote:\n\n\n\n>...",0,00047.e67d0d53cbd3ffafe303cf4bd4e03d66
5793,"hm, looks good:\n\nhttp://books.slashdot.org/a...",0,00031.7caef7fe7af2114d0e4bf6aa0faf3a03
5794,"At 6:21 PM +0000 on 8/9/02, Russell Turpin wro...",0,00996.5b140bb40fe22ba082fb18bbaca76ffd
5795,use Perl Daily Newsletter\n\n\n\nIn this issue...,0,00679.4cece88c654b4e5936921c5d4072797d


In [5]:
data.shape

(5796, 3)

In [6]:
vectorizer = CountVectorizer(stop_words='english')

In [7]:
# vectorizer.vocabulary_

In [8]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [9]:
vectorizer.vocabulary_

{'multi': 63629,
 'message': 61680,
 'mime': 62140,
 'format': 42435,
 '_nextpart_000_01bc2b74': 15036,
 '89d1ccc0': 12882,
 'content': 30249,
 'type': 89371,
 'multipart': 63651,
 'alternative': 18820,
 'boundary': 24670,
 '_nextpart_84815c5abaf209ef376268c8': 15161,
 'text': 86991,
 'plain': 71912,
 'charset': 27796,
 'windows': 95603,
 '1252': 2025,
 'transfer': 88556,
 'encoding': 38472,
 'quoted': 75615,
 'printable': 73192,
 'dear': 32719,
 'sir': 82261,
 'madam': 60221,
 'wishing': 95695,
 'wonderful': 95974,
 'day': 32411,
 '2e': 4981,
 'offer': 67158,
 'save': 80370,
 'money': 62840,
 'time': 87625,
 'shopping': 81849,
 'convenience': 30352,
 'home': 48006,
 'office': 67173,
 'window': 95598,
 'new': 64988,
 'way': 94335,
 'online': 67665,
 'super': 85106,
 'store': 84463,
 'offers': 67168,
 '1500': 2341,
 'quality': 75457,
 'products': 73349,
 '15': 2339,
 'plus': 72122,
 'categories': 26974,
 'shop': 81840,
 '20': 3486,
 'following': 42233,
 'coupons': 30671,
 'numbers': 662

In [10]:
all_features.shape

(5796, 102694)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY, test_size=0.3, random_state=50)

In [12]:
X_train.shape

(4057, 102694)

In [13]:
X_test.shape

(1739, 102694)

In [14]:
classifier = MultinomialNB()

In [15]:
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Challenge:** Calculate the following for the test dataset: <br>
The number of documents classified correctly. <br>
The number of documents classified incorrectly. <br>
The accuracy of the model. <br>

In [16]:
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [17]:
print(f'{nr_correct} documents classfied correctly')

1656 documents classfied correctly


In [18]:
nr_incorrect = y_test.size - nr_correct

In [19]:
print(f'Number of documents incorrectly classified is {nr_incorrect}')

Number of documents incorrectly classified is 83


In [20]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 95.23%


In [21]:
classifier.score(X_test, y_test)

0.9522714203565268

**Challenge:** For the testing dataset calculate the recall, precision and f1 score. Google for the scikit learn documentation on this topic to work it out. 

In [22]:
recall_score(y_test, classifier.predict(X_test))

0.8623188405797102

In [23]:
precision_score(y_test, classifier.predict(X_test))

0.9855072463768116

In [24]:
f1_score(y_test, classifier.predict(X_test))

0.9198067632850242

In [25]:
example = ['Hello Gautam Kumar,Are you looking for something in our Vacuums & Irons store? If so, you might be interested in these items. ',
           'need a mortgage? Reply to arrange a call with a specialist and get a quote',
           'Could you please help me with the project for tomorrow?',
           'Premium offer extended Get 40% off You only have until Wednesday to save 40% on the best way to get organiz',
           'EC-Council Gautam,We are looking forward to seeing you at EC-Council’s BIG Cyber Event !Tefore. Malicious hackers have attacked relentlessly and with increased sophistication to test cyber resilience in a distributed ecosystem. As a result, we are staring at a dramatic surge in the need for skilled cybersecurity professionals across the globe. Join us to understand how a whole new approach to cybersecurity skilling will be needed for organizations to thrive in a post-COVID world. For individuals planning a career in cyber, the event promises to unveil what the future of cybersecurity jobs looks like and how new career paths can be forged for a challenging phase ahead of us'
          ]

In [26]:
doc_term_matrix = vectorizer.transform(example)

In [27]:
classifier.predict(doc_term_matrix)

array([0, 1, 0, 1, 0])

## Here Actually 3rd Mail is Non-Spam and rest is SPAM