In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)

In [4]:
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,"On Tue, 23 Jul 2002, Jim Whitehead wrote:\n\n\...",0,00846.a603afcbab0796cbe45d3be854562598
5792,wintermute wrote:\n\n>>Anyone know where in Ir...,0,00424.e39b1db8cf5575572abb4482fd3fced3
5793,"hey,\n\n\n\nAFAIK it isn't hard at all to crea...",0,00374.957837ba252b473057711e3c4dbd1e26
5794,"Evidently, I have permission to pass this alon...",0,01101.ff91c2c8fb18ed6e300ed2ac699f8ae4
5795,"On Mon, 19 Aug 2002 16:44:30 +0200, Matthias S...",0,01266.94f7e1cce0ec1935ca75d232e4dc684c


In [5]:
data.shape

(5796, 3)

In [6]:
vectorizer = CountVectorizer(stop_words='english')

In [7]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [8]:
all_features.shape

(5796, 102694)

In [9]:
vectorizer.vocabulary_

{'html': 48472,
 'head': 47011,
 'meta': 61701,
 'http': 48497,
 'equiv': 38991,
 '3d': 6385,
 'content': 30249,
 'language': 57500,
 'en': 38432,
 'generator': 44258,
 'microsoft': 62003,
 'frontpage': 43005,
 'progid': 73406,
 'editor': 37557,
 'document': 34867,
 'type': 89371,
 'text': 86991,
 'charset': 27796,
 '3dwindows': 7297,
 '1252': 2025,
 'title': 87725,
 'norton': 65833,
 'ad': 16973,
 'body': 24390,
 'table': 86120,
 'border': 24581,
 'width': 95488,
 '489': 8330,
 'height': 47157,
 '54': 9185,
 'bgcolor': 23297,
 'ffffff': 41323,
 'tr': 88442,
 'td': 86548,
 '100': 1496,
 '48': 8295,
 'align': 18634,
 'center': 27405,
 'font': 42257,
 'face': 40498,
 'impact': 51290,
 'color': 29367,
 'cc0000': 27108,
 'size': 82347,
 'control': 30335,
 'computer': 29755,
 'line': 58568,
 'software': 83094,
 'cellspacing': 27383,
 'cellpadding': 27375,
 'bgco': 23295,
 'lor': 59273,
 '990033': 13977,
 'bordercolor': 24584,
 'style': 84723,
 'collapse': 29305,
 'collaps': 29304,
 'borderc

In [10]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY, test_size=0.3, random_state=88)

In [11]:
X_train.shape

(4057, 102694)

In [12]:
X_test.shape

(1739, 102694)

In [13]:
classifier = MultinomialNB()

In [14]:
classifier.fit(X_train, y_train)

MultinomialNB()

In [15]:
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [16]:
print(f'{nr_correct} documents classified correctly.')

1654 documents classified correctly.


In [17]:
nr_incorrect = y_test.size - nr_correct

In [18]:
print(f"Number of documents incorrectly classified is {nr_incorrect}")

Number of documents incorrectly classified is 85


In [19]:
fraction_wrong = nr_incorrect / (nr_incorrect + nr_correct)
print (f"The testing accuracy of the model is {1-fraction_wrong:.2%}")

The testing accuracy of the model is 95.11%


In [20]:
classifier.score(X_test, y_test)

0.9511213341000575

In [21]:
recall_score(y_test, classifier.predict(X_test))

0.855595667870036

In [22]:
precision_score(y_test, classifier.predict(X_test))

0.9895615866388309

In [23]:
f1_score(y_test, classifier.predict(X_test))

0.9177153920619555

In [24]:
example = ['get viagra for free now',
          'need a mortgage? We can arrange a call for a quote',
          'Can you help me with the project for tomorrow? Thanks.',
          'Hi Jonathan, what about playing tennis this afternoon?',
          'Systems engineering is an interdisciplinary field of engineering and engineering management that focuses on how to design, integrate, and manage complex systems over their life cycles. At its core, systems engineering utilizes systems thinking principles to organize this body of knowledge. The individual outcome of such efforts, an engineered system, can be defined as a combination of components that work in synergy to collectively perform a useful function.']

In [25]:
doc_term_matrix = vectorizer.transform(example)

In [26]:
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0], dtype=int64)