In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

import pickle

In [3]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [4]:
data = pd.read_json(DATA_JSON_FILE)

In [5]:
data.shape

(5796, 3)

In [None]:
data.sort_index(inplace=True)

In [None]:
data.tail()

In [6]:
vectorizer = CountVectorizer(stop_words='english')

In [12]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [None]:
all_features.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY, 
                                                    test_size=0.3, random_state=88)

In [None]:
classifier = MultinomialNB()

In [None]:
classifier.fit(X_train, y_train)

In [None]:
nr_correct = (y_test == classifier.predict(X_test)).sum()
print(f'{nr_correct} documents classfied correctly')

In [None]:
nr_incorrect = y_test.size - nr_correct
print(f'Number of documents incorrectly classified is {nr_incorrect}')

In [None]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

In [None]:
classifier.score(X_test, y_test)

In [None]:
recall_score(y_test, classifier.predict(X_test))

In [None]:
precision_score(y_test, classifier.predict(X_test))

In [None]:
f1_score(y_test, classifier.predict(X_test))

## Testing random email

In [None]:
example = ['get viagra for free!!!!!',
           'need a mortgage? Reply to arrange a call with our prince.',
           'Give me a call for the project',
           'Hello john, wanna play a match of fifa??',
           'Ice skating is the self-propulsion of a person across a sheet of ice, using metal-bladed ice skates to glide on the ice surface.']

In [None]:
doc_term_matrix = vectorizer.transform(example)

In [None]:
classifier.predict(doc_term_matrix)

### Saving

In [7]:
file = open('spam_email.pkl', 'wb')
# dump information to that file
pickle.dump(classifier, file)

NameError: name 'classifier' is not defined

### Loading model and making random predictions

In [9]:
model = pickle.load(open('model/spam_email.pkl', 'rb'))

In [13]:
ex = ['Get free trail!!!!']
ex_mat = vectorizer.transform(ex)
a = model.predict(ex_mat)

In [14]:
ex_mat

<1x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [15]:
a[0]

1