In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [3]:
# Relative path of the JSON data file that is read into `data` with pd.read_json.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [4]:
# Load the JSON file into a DataFrame; the previews in this notebook show the
# columns MESSAGE, CATEGORY and FILE_NAME.
data = pd.read_json(DATA_JSON_FILE)

In [5]:
# Preview the last rows of the frame (DataFrame.tail shows 5 rows by default).
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5794,Marc Perkel wrote:\n\n> But - MY POINT IS - ca...,0,01299.21bf6f0946fe21adc3e99db3e541ee57
5795,Update of /cvsroot/spamassassin/spamassassin/l...,0,01336.03d61f76f58b98d6c4c0f4f303994db4
5796,"As list administrator, your authorization is r...",0,00490.9c5dd006a16b1e30c9162ba4b4b75ea8
5797,Ciaran Mac Lochlainn stated the following:\n\n...,0,00231.8096ae53e70b1b72b5935b12b823597b
5798,"Hi, \n\nI'm looking to build a completely sile...",0,00030.cc523265aefc37ee6ce3015d8ff6aa24


In [6]:
# Order the rows by index label. The sorted frame is reassigned instead of
# using inplace=True, which pandas discourages and which blocks method chaining.
data = data.sort_index()

In [7]:
# Preview the first rows of the index-sorted frame (DataFrame.head shows 5 rows by default).
data.head()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,This is a Multipart MIME message. Since your m...,1,00260.c75ce8b8d8bfc55723426979d260bf61
1,\n\nDear Sir or Madam\n\n\n\nIn the past you h...,1,00242.e030c8b1f053037aeffb062f3a34b523
2,ATTN:\n\n\n\nI am Edward Mulete JR. the son of...,1,00267.ef433fb350170f28a1567cbc24900e53
3,<html>\n\n<head>\n\n<title>Toy</title>\n\n</he...,1,00496.1a37de098f6c8847c3c7839d73cc7106
4,"Suppliers of Computers, Printers, etc. & Consu...",1,00031.a78bb452b3a7376202b5e62a81530449


In [8]:
# Bag-of-words token counter; stop_words='english' makes it ignore
# scikit-learn's built-in list of English stop words.
vectorizer = CountVectorizer(stop_words='english')

In [9]:
# Learn the vocabulary from every message and, in the same step, build the
# document-term count matrix (one row per message, one column per token).
all_features = vectorizer.fit_transform(data.MESSAGE)

In [36]:
# Show the sparse matrix summary: shape, dtype and number of stored (non-zero) entries.
all_features

<5799x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 704684 stored elements in Compressed Sparse Row format>

In [11]:
# Mapping from each token to its column index in the document-term matrix.
# NOTE(review): this displays the whole mapping (one entry per matrix column,
# i.e. 102694 per the shape shown for all_features) — consider previewing only
# a small slice to keep the notebook output readable.
vectorizer.vocabulary_

{'multipart': 63651,
 'mime': 62140,
 'message': 61680,
 'mail': 60312,
 'reader': 76618,
 'does': 34891,
 'understand': 90570,
 'format': 42435,
 'legible': 57979,
 'xx9adb9a99': 97843,
 '1a5b9adbxx': 2959,
 'content': 30249,
 'type': 89371,
 'text': 86991,
 'plain': 71912,
 'charset': 27796,
 'iso': 53094,
 '8859': 12828,
 'transfer': 88556,
 'encoding': 38472,
 '7bit': 11595,
 'k1': 55275,
 'erotikverlag': 39094,
 'sucht': 84940,
 'pornodarsteller': 72499,
 'wenn': 95168,
 'du': 36383,
 'spaß': 83480,
 'sex': 81443,
 'hast': 46778,
 'und': 90516,
 'nebenbei': 64713,
 'geld': 44218,
 'verdienen': 92420,
 'willst': 95562,
 'bewirb': 23197,
 'auch': 20808,
 'dich': 33934,
 'wir': 95668,
 'suchen': 84938,
 'natürliche': 64487,
 'frauen': 42729,
 'männer': 64094,
 'jeden': 54081,
 'alters': 18823,
 'mollig': 62809,
 'kein': 55788,
 'problem': 73275,
 'infos': 51894,
 'unter': 90983,
 '0930': 867,
 '830': 12260,
 '13': 2107,
 '22': 3997,
 'aus': 20887,
 'österreich': 102517,
 '01': 309,
 

In [12]:
# Hold out 30% of the documents as the test set. The fixed random_state makes
# the shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data.CATEGORY,
    test_size=0.3,
    random_state=88,
)

In [38]:
# (number of training documents, number of vocabulary columns)
X_train.shape

(4059, 102694)

In [40]:
# Show the sparse training matrix summary: shape, dtype and number of stored entries.
X_train

<4059x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 485758 stored elements in Compressed Sparse Row format>

In [15]:
# Multinomial naive Bayes classifier with scikit-learn's default hyperparameters.
classifier = MultinomialNB()

In [16]:
# Train on the training split. fit() returns the estimator itself, which is
# why the cell displays the MultinomialNB() repr.
classifier.fit(X_train, y_train)

MultinomialNB()

**Challenge:** Calculate the following for the test dataset:

- the number of documents classified correctly
- the number of documents classified incorrectly
- the accuracy of the model

In [17]:
# Predicted class label for every document in the held-out test split.
prediction = classifier.predict(X_test)

In [18]:
# Comparing the true labels with the predictions element-wise yields booleans;
# summing them counts the documents whose predicted label matches the truth.
correct_docs = (y_test == prediction).sum()
print('Docs classified correctly', correct_docs)

Dcos classified correctly 1648


In [19]:
# Count the test documents whose predicted label differs from the true label
# (equivalently: test-set size minus the number of correct predictions).
numdocs_wrong = (y_test != prediction).sum()
print('Docs classified incorrectly', numdocs_wrong)


Docs classified incorrectly 92


In [20]:
# Accuracy = correctly classified documents / total number of test documents.
n_test_docs = X_test.shape[0]
accuracy = correct_docs / n_test_docs
print(f'the accuracy of the model is {accuracy:.2%} ')

the accuracy of the model is 94.71% 


In [21]:
# Alternatively, let the estimator do the work: score() returns the mean
# accuracy of its predictions for X_test against the true labels y_test.

classifier.score(X_test, y_test)

0.9471264367816092

In [22]:
# Recall for the positive class (label 1 by default): of the documents that
# truly are class 1, the fraction the classifier predicted as class 1.
recall_score(y_test, prediction)

0.8504504504504504

In [23]:
# Precision for the positive class (label 1 by default): of the documents
# predicted as class 1, the fraction that truly are class 1. Reuses the stored
# `prediction` array instead of running the classifier on X_test again.
precision_score(y_test, prediction)

0.9812889812889813

In [24]:
# F1 score for the positive class (label 1 by default): the harmonic mean of
# precision and recall. Reuses the stored `prediction` array instead of
# running the classifier on X_test again.
f1_score(y_test, prediction)

0.9111969111969112

In [25]:
# Hand-written messages used to spot-check the trained classifier.
example = [
    'get the viagra for free now!',
    'need a mortgage? reply to arrenge a call with a specialist and get a quote',
    'Could you please help me with the project for tomorrow?',
    'Hello Jonathan, how about a game of golf tomorrow?',
    'Ski jumping is a winter sport in which competitors aim to achieve the longest jump',
]

In [26]:
# Vectorize the examples with the already-fitted vocabulary (transform, not
# fit_transform), so the columns line up with the matrix the classifier was trained on.
vectorizer.transform(example)

<5x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [27]:
# Keep the examples' document-term matrix so it can be passed to the classifier.
doc_term_matrix = vectorizer.transform(example)

In [28]:
# Predict a class label for each example message (one array entry per message).
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0])