In [27]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
# Relative path of the JSON file holding the email text data loaded below.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
# Load the email dataset from the JSON file into a DataFrame.
data = pd.read_json(DATA_JSON_FILE)

In [6]:
# (rows, columns) of the loaded frame.
data.shape

(5796, 3)

In [7]:
# Put the rows in ascending index order. The sorted frame is rebound to `data`
# rather than sorting with inplace=True, so this cell produces its result from
# its input instead of mutating an object other cells have already displayed.
data = data.sort_index()

In [8]:
# Show the last five rows (the default for tail()).
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,http://news.bbc.co.uk/1/hi/england/2515127.stm...,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,0,"> >-- be careful when using this one.) Also, t...",01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,0,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",01398.169b51731fe569f42169ae8f948ec676
5794,0,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
5795,0,"Hi there,\n\n\n\nNow this is probably of no us...",01400.f897f0931e461e7b2e964d28e927c35e


In [10]:
# Bag-of-words counter that drops scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

In [11]:
# Learn the vocabulary from every message and build the sparse document-term
# count matrix (one row per message, one column per vocabulary word).
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split further down, so words that only occur in test messages
# still shape the feature space. Consider fitting on the training messages only.
all_features = vectorizer.fit_transform(data.MESSAGE)

In [12]:
# (number of documents, vocabulary size).
all_features.shape

(5796, 102694)

In [55]:
# Hold out 30% of the documents for testing; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['CATEGORY'],
    test_size=0.3,
    random_state=88,
)

In [56]:
# Multinomial Naive Bayes with default hyper-parameters.
classifier = MultinomialNB()

In [57]:
# Train on the training split only; the test split is kept for evaluation.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
# Compare true and predicted labels element-wise and count the matches.
nr_correct = y_test.eq(classifier.predict(X_test)).sum()
print(f'{nr_correct} documents classfied correctly')

1641 documents classfied correctly


In [59]:
# Every test document that was not counted as correct is a misclassification.
nr_incorrect = len(y_test) - nr_correct
print(f'Number of documents incorrectly classified is {nr_incorrect}')

Number of documents incorrectly classified is 98


In [60]:
# Error rate = misclassified / total test documents; accuracy is its complement.
nr_total = nr_correct + nr_incorrect
fraction_wrong = nr_incorrect / nr_total
accuracy = 1 - fraction_wrong
print(f'The (testing) accuracy of the model is {accuracy:.2%}')

The (testing) accuracy of the model is 94.36%


In [61]:
# Built-in mean accuracy on the test split; cross-checks the manual calculation above.
classifier.score(X_test, y_test)

0.9436457734330075

In [28]:
# Recall for the positive class (label 1, scikit-learn's default pos_label):
# the share of actual label-1 test documents that the model predicts as 1.
recall_score(y_test, classifier.predict(X_test))

0.8303249097472925

In [29]:
# Precision for the positive class (label 1): the share of test documents
# predicted as 1 whose true label is 1.
precision_score(y_test, classifier.predict(X_test))

0.9913793103448276

In [30]:
# F1 for the positive class (label 1): harmonic mean of precision and recall.
f1_score(y_test, classifier.predict(X_test))

0.9037328094302555

## Testing on example emails

In [62]:
# Hand-written sample messages used as a quick sanity check of the trained model.
example = [
    'get viagra for free!!!!!',
    'need a mortgage? Reply to arrange a call with our prince.',
    'Give me a call for the project',
    'Hello john, wanna play a match of fifa??',
    (
        'Ice skating is the self-propulsion of a person across a sheet of ice, '
        'using metal-bladed ice skates to glide on the ice surface.'
    ),
]

In [63]:
# Encode the examples with the already-fitted vocabulary (transform, not
# fit_transform), so the columns line up with the features the classifier
# was trained on.
doc_term_matrix = vectorizer.transform(example)

In [64]:
# Predicted label for each example, in the same order as the list.
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0], dtype=int64)