In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)

In [4]:
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [5]:
data.sort_index(inplace = True) #order data by index

In [6]:
vectorize = CountVectorizer(stop_words = 'english')

In [7]:
all_features = vectorize.fit_transform(data.MESSAGE) # gives sparse matrix

In [8]:
all_features.shape

(5796, 102688)

In [9]:
vectorize.vocabulary_

{'dear': 32719,
 'homeowner': 48032,
 'rates': 76346,
 'lowest': 59361,
 'point': 72293,
 '40': 7824,
 'years': 98500,
 'help': 47198,
 'best': 23129,
 'rate': 76343,
 'situation': 82313,
 'matching': 60926,
 'needs': 64746,
 'hundreds': 48605,
 'lenders': 58017,
 'home': 48004,
 'improvement': 51397,
 'refinance': 77070,
 'second': 80964,
 'mortgage': 63022,
 'equity': 38990,
 'loans': 59054,
 'perfect': 70474,
 'credit': 30975,
 'service': 81355,
 '100': 1496,
 'free': 42771,
 'owners': 68711,
 'new': 64984,
 'buyers': 25617,
 'obligation': 66809,
 'just': 55046,
 'quick': 75543,
 'simple': 82167,
 'form': 42423,
 'jump': 54997,
 'start': 84130,
 'future': 43328,
 'plans': 71935,
 'today': 88034,
 'visit': 92916,
 'http': 48495,
 '61': 10092,
 '145': 2275,
 '116': 1873,
 '186': 2748,
 'user0201': 91334,
 'index': 51637,
 'asp': 20429,
 'afft': 17606,
 'qm10': 75104,
 'unsubscribe': 90950,
 'light': 58468,
 'watch': 94276,
 'attention': 20740,
 'computer': 29755,
 'users': 91362,
 'sp

In [10]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY, test_size = 0.3, random_state = 88)

In [11]:
X_train.shape

(4057, 102688)

In [12]:
X_test.shape

(1739, 102688)

In [13]:
classifier = MultinomialNB()

In [14]:
classifier.fit(X_train,y_train)

MultinomialNB()

In [15]:
y_pred = classifier.predict(X_test)

In [16]:
true_pos = (y_pred == 1) & (y_test == 1)
true_neg = (y_pred == 0) & (y_test == 0)
false_pos = (y_pred == 1) & (y_test == 0)
false_neg = (y_pred == 0) & (y_test == 1)
correct = true_neg.sum() + true_pos.sum()
print('The nr of documents classified correctly: ', correct)
incorrect = len(y_pred) - correct
print('The nr of documents classified incorrectly: ', incorrect)

The nr of documents classified correctly:  1659
The nr of documents classified incorrectly:  80


In [17]:
accuracy = correct / len(y_test)
print('The testing accuracy is: {:.2%}'.format(accuracy))

The testing accuracy is: 95.40%


In [18]:
classifier.score(X_test,y_test)

0.9539965497412306

In [19]:
# recall_score = true_pos.sum() / (true_pos.sum() + false_neg.sum())
# precision_score = true_pos.sum() / (true_pos.sum() + false_pos.sum())
# f1_score = 2 * ((precision_score*recall_score)/(precision_score+recall_score))
print('The recall score is: ', recall_score(y_test,y_pred))
print('The precision score is: ', precision_score(y_test,y_pred))
print('The F-Score is: ', f1_score(y_test,y_pred))

The recall score is:  0.8646209386281588
The precision score is:  0.9896694214876033
The F-Score is:  0.9229287090558767


## Test the classifier with an example for fun

In [20]:
example = ['get viagra for free now!',
           'need a morgage? Reply to arrange a call with a specialist and get a quote',
           'Could you please help me with the project for tomorrow?',
          'Hello Jonathan, how about a gamme of golf tomorrow?',
          'Ski jumping is a winter sport in which competitors aim to achieve the farthest jump after sliding down on their skis from a specially designed curved ramp. Along with jump length, competitor\'s aerial style and other factors also affect the final score.Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.[1]']

In [21]:
doc_term_matrix = vectorize.transform(example)

In [22]:
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0])