In [36]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score,f1_score

In [4]:
# Location of the JSON file holding the e-mail text data, relative to the
# notebook's working directory.
DATA_JSON_FILE = "SpamData/01_Processing/email-text-data.json"

In [5]:
# Read the JSON file named by DATA_JSON_FILE into a DataFrame.
data = pd.read_json(path_or_buf=DATA_JSON_FILE)

In [8]:
# Order the rows by their index labels. Rebinding to the sorted copy (rather
# than sorting with inplace=True) keeps the cell free of hidden in-place
# mutation and makes it safe to re-run.
data = data.sort_index()

In [12]:
# Peek at the last five rows of the index-sorted frame.
data.tail(n=5)

Unnamed: 0,MESSAGE,CATEGORY,DOC_ID,FILE_NAME
02496.aae0c81581895acfe65323f344340856,Man killed 'trying to surf' on Tube train \n\n...,0,3349,02496.aae0c81581895acfe65323f344340856
02497.60497db0a06c2132ec2374b2898084d3,"Hi Gianni,\n\n\n\nA very good resource for thi...",0,2238,02497.60497db0a06c2132ec2374b2898084d3
02498.09835f512f156da210efb99fcc523e21,Gianni Ponzi wrote:\n\n> I have a prob when tr...,0,2429,02498.09835f512f156da210efb99fcc523e21
02499.b4af165650f138b10f9941f6cc5bce3c,Neale Pickett <neale@woozle.org> writes:\n\n\n...,0,3016,02499.b4af165650f138b10f9941f6cc5bce3c
02500.05b3496ce7bca306bed0805425ec8621,"\n\nHi,\n\n\n\nI think you need to give us a l...",0,3282,02500.05b3496ce7bca306bed0805425ec8621


In [13]:
# Token-count vectorizer configured to drop scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(
    stop_words='english',
)

In [15]:
# Learn the vocabulary from the MESSAGE column and build the document-term
# matrix in a single pass.
# NOTE(review): the vocabulary is fit on every message, including the ones
# later held out by train_test_split; consider fitting on the training
# messages only.
all_features = vectorizer.fit_transform(data['MESSAGE'])

In [16]:
# Dimensions of the document-term matrix: (documents, vocabulary tokens).
n_documents, n_tokens = all_features.shape
(n_documents, n_tokens)

(5796, 102694)

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data.CATEGORY,
    test_size=0.3,
    random_state=88,
)

In [24]:
# Fit a multinomial Naive Bayes model (default hyper-parameters) on the
# training split; fit() returns the estimator itself, so the chained call
# leaves `classifier` bound to the trained model.
classifier = MultinomialNB().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Number of labels in the test split, i.e. how many e-mails get classified.
total_emails = len(y_test)
print('Nr. of  emails sorted:', total_emails)

Nr. of  emails sorted: 1739


In [27]:
# Count the test e-mails whose predicted label equals the true label.
test_predictions = classifier.predict(X_test)
is_correct = y_test == test_predictions
nr_correct = is_correct.sum()
print('Nr. of correct emails sorted:', nr_correct)

Nr of correct emails sorted:  1653


In [29]:
# Count the test e-mails whose predicted label differs from the true label.
nr_incorrect = (y_test != classifier.predict(X_test)).sum()
# The label previously said "correct" (copy-paste slip) although the value
# printed is the number of misclassified e-mails.
print('Nr. of incorrect emails sorted:', nr_incorrect)

Nr of correct emails sorted: 86


In [35]:
# Fraction of test e-mails classified correctly, rendered as a percentage
# with two decimals.
test_accuracy = classifier.score(X_test, y_test)
print(f'The (testing) accurracy of the model is {test_accuracy:.2%}')

The (testing) accurracy of the model is 95.05%


In [37]:
# Recall of the classifier's predictions on the test split.
recall_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.8605504587155963

In [38]:
# Precision of the classifier's predictions on the test split.
precision_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.9791231732776617

In [39]:
# F1 score of the classifier's predictions on the test split.
f1_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.916015625

In [40]:
# Three hand-written messages used to try the trained classifier on text it
# has not seen.
example = [
    'Get viagra for free now!',
    'Replay to get a call with a specialist and win millions for free',
    'Hello John how about a game of chess tomorrow morning?',
]

In [48]:
# Encode the example messages with the already-fitted vocabulary (transform
# only, no re-fitting) so the columns line up with the training features.
doc_term_matrix = vectorizer.transform(raw_documents=example)
# Show the matrix dimensions as the cell output.
doc_term_matrix.shape

(3, 102694)

In [44]:
# Predicted CATEGORY label for each example message.
classifier.predict(X=doc_term_matrix)

array([1, 1, 0])