# Spam Text Message Classification 
## using NLP and Multinomial Naive Bayes Classifier

The objective is to train a model that can automatically detect spam messages.<br>
We will rely on the following empirical observations:
- messages containing words like 'free', 'win', 'winner', 'cash', 'prize' and the like are usually spam
- spam messages tend to contain words written in all capitals
- spam messages also tend to use a lot of exclamation marks

The __multinomial Naive Bayes__ classifier is suitable for classification with discrete features (e.g., word counts for text classification). <br>

We will use a library called __spaCy__ to process the language data.

## Prerequisites

In [None]:
#!pip install -U spacy

In [None]:
!python -m spacy validate

In [None]:
!python -m spacy info --markdown

In [None]:
#!pip install wordcloud

In [None]:
# import important modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from string import punctuation 
import seaborn as sns
import re

In [None]:
import os
import sys
from tqdm import tqdm

In [None]:
# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix

In [None]:
import spacy
from spacy import displacy

In [None]:
# !python -m spacy download da_core_news_sm

In [None]:
from wordcloud import WordCloud, STOPWORDS

## Step 1: Data Collection

In [None]:
# The data set is a pre-processed, tab-separated table with two columns: a label and a message.
# The file has no header row (header=None), so the column names are supplied explicitly via `names`.
# Spam Data Set: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
# NOTE(review): the path below is absolute and machine-specific - point it to your local copy of the file.
data = pd.read_table('/Users/tdi/Documents/Teaching/DS/Data/SMSSpamCollection.tsv', sep='\t', header=None, names=['label', 'message'])
data.shape

In [None]:
# show top five rows
data.head()

## Step 2: Data Exploration

In [None]:
# evaluate class distribution
data["label"].value_counts()

In [None]:
data[data['label'].notnull()]

In [None]:
# check for missing values
data.isnull().sum()

In [None]:
# get ham
ham = data[(data['label'] == 'ham')]
ham

In [None]:
# get spams
spam = data[(data['label'] == 'spam')]

In [None]:
cnt = data.groupby(['label']).count()
cnt

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
stopwords = set(STOP_WORDS)

In [None]:
stopwords

In [None]:
# word cloud for words
def plot_word_cloud(words):
    """Build a word cloud from a text and display it with matplotlib.

    Parameters
    ----------
    words : str
        The text whose words are drawn in the cloud.

    The module-level ``stopwords`` set is passed to ``WordCloud`` so those
    words are left out of the picture. Nothing is returned; the figure is
    shown as a side effect.
    """
    # Collect the WordCloud settings in one place
    cloud_options = {
        "width": 600,
        "height": 600,
        "background_color": "white",
        "stopwords": stopwords,
        "min_font_size": 10,
    }
    cloud_image = WordCloud(**cloud_options).generate(words)

    # Draw the generated image on a square figure without axes or padding
    plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()

In [None]:
hamstr = ' '.join(ham["message"].astype(str))

In [None]:
# len(hamstr) would count characters, not words; split on whitespace to count words
print("ham words {}".format(len(hamstr.split())))

In [None]:
spamstr = ' '.join(spam["message"].astype(str))

In [None]:
# len(spamstr) would count characters, not words; split on whitespace to count words
print("spam words {}".format(len(spamstr.split())))

In [None]:
plot_word_cloud(spamstr)

In [None]:
plot_word_cloud(hamstr)

## Step 3: Data Preprocessing

In [None]:
# Encode the labels as numerical values: map 'ham' to 0 and 'spam' to 1
# NOTE(review): this overwrites the 'label' column in place. Re-running the cell on
# already-mapped labels presumably yields NaN (0/1 are not keys of the mapping) -
# reload the data before running it again; confirm.
data['label'] = data.label.map({'ham':0, 'spam':1})
data.head() 

In [None]:
#!python -m spacy validate

In [None]:
#!python -m spacy download en_core_web_md
#!python -m spacy download da_core_news_md

In [None]:
# Load English tokenizer
nlp = spacy.load("en_core_web_md")

In [None]:
def clean(text):
    """Normalize a raw message and return its lemmatized content words.

    The text is processed in this order:

    1. whitespace is collapsed and the text is lowercased;
    2. URLs are replaced by the word ``link`` (before punctuation is removed,
       otherwise the URL would already be broken into pieces);
    3. contractions (``i'm``, ``n't``, ``'s``, ``'d``, ``'ll``) are expanded
       while the apostrophes are still present;
    4. every remaining non-alphanumeric character becomes a space;
    5. common SMS abbreviations are expanded, matched as whole words only so
       that e.g. ``ur`` inside ``your`` or ``hours`` is left alone;
    6. standalone numbers are removed;
    7. the module-level spaCy pipeline ``nlp`` tokenizes the text, and stop
       words, punctuation and whitespace tokens are dropped.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces. An empty
        string is returned for non-string input (e.g. a missing value).
    """
    # Missing values in a pandas column are not strings; treat them as empty
    if not isinstance(text, str):
        return ""

    # Remove extra spaces and set in lowercase
    text = " ".join(text.split()).lower()

    # Replace links first: once punctuation is stripped, "http://..." is no
    # longer a single run of non-space characters
    text = re.sub(r"http\S+", " link ", text)

    # Expand contractions while the apostrophe is still in the text
    # (the text is lowercase at this point, hence "i'm" and not "I'm")
    text = re.sub(r"\bi'm\b", "i am", text)
    text = re.sub(r"n't\b", " not ", text)
    text = re.sub(r"'s\b", " ", text)
    text = re.sub(r"'d\b", " would ", text)
    text = re.sub(r"'ll\b", " will ", text)

    # Drop everything that is not a letter or a digit
    text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # Expand SMS abbreviations; \b keeps the match to whole words and also
    # covers abbreviations at the very start or end of the message
    text = re.sub(r"\bur\b", " your ", text)
    text = re.sub(r"\bu\b", " you ", text)
    text = re.sub(r"\bnd\b", " and ", text)
    text = re.sub(r"\btkts\b", " tickets ", text)
    text = re.sub(r"\bc\b", " can ", text)
    text = re.sub(r"\be g\b", " eg ", text)

    # Remove standalone numbers, including one at the end of the message
    text = re.sub(r"\b\d+\b", " ", text)

    # Tokenize with spaCy and keep the lemma of every content token.
    # Iterating the doc visits the same tokens as iterating its sentences.
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    # Return the cleaned text as one space-separated string
    return " ".join(lemmas)


In [None]:
data['clean'] = data['message'].apply(clean)

In [None]:
data.sample(5)

## Step 4: Train a Model

In [None]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

In [None]:
# Split the cleaned messages (features) and the labels (targets) into a training and a test part.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=1 makes the shuffling reproducible.
X_train, X_test, y_train, y_test = train_test_split(data['clean'], data['label'], random_state=1)

In [None]:
print('Number of rows in the total set: {}'.format(data.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

In [None]:
# Create an instance of CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
vector

In [None]:
# Fit and transform the text into vectors  and return them in a matrix
X_train_vector = vector.fit_transform(X_train)
X_train_vector.shape

In [None]:
X_train_vector

In [None]:
# Transform test data and return the matrix 
# Note we are not fitting the test data into the CountVectorizer()
X_test_vector = vector.transform(X_test)
X_test_vector.shape

In [None]:
# Call Multinominal Naive Bayes and train the model
from sklearn.naive_bayes import MultinomialNB
myNB = MultinomialNB()
myNB.fit(X_train_vector, y_train)

In [None]:
# Test on the test data, try prediction
predictions = myNB.predict(X_test_vector)

In [None]:
# Evaluate the predictions against the true test labels with four standard metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each entry pairs the printed caption with the function that computes the score
for title, scorer in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(title, format(scorer(y_test, predictions)))

In [None]:
# calculate confusion matrix to further evaluate the accuracy of the prediction
cmat = confusion_matrix(y_test, predictions)
cmat

In [None]:
target_names = ['Ham', 'Spam']

In [None]:
# Visualize the confusion matrix as an annotated heatmap using the seaborn library
sns.set()
# cmat was computed as confusion_matrix(y_test, predictions); it is transposed here so that
# the heatmap rows (y axis) show the predicted class and the columns (x axis) the actual class,
# matching the axis labels set below. fmt='d' prints the counts as integers.
# NOTE(review): this relies on scikit-learn putting the true labels on the rows of cmat - confirm.
sns.heatmap(cmat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('actual')
plt.ylabel('predicted');
plt.show()

## Step 4: Validate with New Data

In [None]:
# The vectorizer was fitted on cleaned text (data['clean']), so new messages must go
# through the same clean() preprocessing before they are vectorized
my_bad_data = vector.transform([clean("We offer very low prices")])
my_good_data = vector.transform([clean("Today is a good day")])

In [None]:
prediction1 = myNB.predict(my_bad_data)
prediction2 = myNB.predict(my_good_data)

In [None]:
prediction1[0]

In [None]:
prediction2[0]

In [None]:
# print accuracy evaluation report
report = classification_report(y_test, predictions)
print(report)

## Step 5: Save the Model

In [None]:
# Save the trained model in a file
import os
import joblib 
filename = '../models/spam-detection-model.pkl'
# Make sure the target folder exists, otherwise writing the file fails
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(myNB, filename)

In [None]:
# Save the fitted vectorizer as well: it is needed to transform new messages
# into the same feature space the model was trained on
import os
vectorfile = '../preprocessing/count_vectorizer.pkl'
# Make sure the target folder exists, otherwise writing the file fails
os.makedirs(os.path.dirname(vectorfile), exist_ok=True)
joblib.dump(vector, vectorfile)

In [None]:
# another time later...

In [None]:
# load the model from a file
model = joblib.load(filename)

In [None]:
# load the vectorizer from a file
vector = joblib.load(vectorfile)

In [None]:
# Reuse the loaded model: preprocess the new message with clean(), exactly as the
# training data was, then vectorize it and predict its class
new_data = vector.transform([clean('Is it a spam?')])
result = model.predict(new_data)
print(result)

## <span style="color:red">Task</span>
Repeat the training, testing and validation using the Danish spaCy model and a Danish validation text.
Which of the two cases gave better results?