In [1]:
# Import modules necessary for the spam filter

import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import os
import warnings
import pickle
from tqdm import tqdm

warnings.filterwarnings("ignore", category=DeprecationWarning)
tqdm.pandas()

In [2]:
# Specify data directory and path path

data_dir = os.path.join(os.getcwd(),'Data')
data_path = os.path.join(data_dir,'SMSSpamCollection.txt')

In [3]:
df_raw = pd.read_csv(data_path, delimiter = '\t', header = None)
df_raw.columns = ['label', 'text']
df_raw.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Define cleaning modules and cleaning functions

import re
import nltk
import string
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from contractions import CONTRACTION_MAP

# Import nltk resources
resources = ["wordnet", "stopwords", "punkt", \
             "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)

# Create stopwords list        
STOPWORDS = set(stopwords.words('english'))

# Define lemmatizing functions
def lemmatize_doc(document):
    """ 
    Conduct pre-processing, tag words then returns sentence with lemmatized words
    """
    
    # Create an empty list of lemmatized tokens
    lemmatized_list = []
    
    # Tokenize the sentences
    tokenized_sent = sent_tokenize(document)
    
    # Iterate over sentences to conduct lemmatization
    for sentence in tokenized_sent:
        
        # Tokenize the words in the sentence
        tokenized_word = word_tokenize(sentence)
        
        # Tag the pos of the tokens
        tagged_token = pos_tag(tokenized_word)
        
        # Initialize a empty list of lemmatized words
        root = []

        # Create Lemmatizer object
        lemma = WordNetLemmatizer()

        # iterate over the tagged sentences to 
        for token in tagged_token:

            # assign tag and actual word of the token
            tag = token[1][0]
            word = token[0]

            # Lemmatize the token based on tags
            if tag.startswith('J'):
                root.append(lemma.lemmatize(word, wordnet.ADJ))
            elif tag.startswith('V'):
                root.append(lemma.lemmatize(word, wordnet.VERB))
            elif tag.startswith('N'):
                root.append(lemma.lemmatize(word, wordnet.NOUN))
            elif tag.startswith('R'):
                root.append(lemma.lemmatize(word, wordnet.ADV))
            else:          
                root.append(word)

        # Add the lemmatized word into our list
        lemmatized_list.extend(root)
        
    return " ".join(lemmatized_list)

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions form to create cohenrent extractions
    """
    
    # Substitute quotation marks with apostrophes
    text = re.sub("’", "'", text)
    
    # define the contraction pattern with custom contraction mappings
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    
    # Define function to expand contraction matches
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

# Define main text cleaning function
def clean_text(text):
    """
    Return a processed version of the text given
    """
    # Turn all text into lower case
    text = text.lower()
    
    # Expand all contractions
    text = expand_contractions(text)
    
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    
    # Remove all punctuations
    #punctuations = '''!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'''
    text = ' '.join(word.strip(string.punctuation) for word in text.split())
    
    # Remove numerics
    text = re.sub(r'\d+', '', text)
    
    # Lemmatize text
    text = lemmatize_doc(text)
    
    # Removing Extra spaces if any
    text = re.sub(r'[\s]+', ' ', text)
    
    # Convert 
    return text

[nltk_data] Downloading package wordnet to C:\Users\Zach
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Zach
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zach Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\Zach Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


In [5]:
df_processed = df_raw.text.progress_apply(clean_text)

100%|█████████████████████████████████████████████████████████████████████████████| 5572/5572 [00:10<00:00, 517.75it/s]


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, fbeta_score, f1_score
from sklearn.metrics import confusion_matrix

from scipy.sparse import coo_matrix, hstack

In [7]:
def get_score_figures(clf, Xtrain, Xtest, ytrain, ytest, best_beta):
    # Derive scores
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    f1 = f1_score(ytest, clf.predict(Xtest), average = 'macro')
    fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)
    probs = clf.predict_proba(Xtest)[:, 1]
    auc = roc_auc_score(ytest, probs)
    print("Accuracy on training data: {:2f}".format(training_accuracy))
    print("Accuracy on test data:     {:2f}".format(test_accuracy))
    print("AUC-ROC score     {:2f}".format(auc))
    print("F1 score     {:2f}".format(f1))
    print("Fbeta score for beta = {:2f} is {:2f}".format(round(best_beta,3), fbeta))

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class MetaDataExtractor(BaseEstimator, TransformerMixin):
    """Takes in text series, outputs meta-features
    """

    def __init__(self):
        pass

    def extract_meta_data(self, message):
        """
        This function preprocess one text message, creating a list of metadata.
        The message argument is a message.
        """

        # Replace email addresses with 'EmAd'
        message = re.sub(r'[^\s]+@.[^\s]+', '{EmAd}', message)

        # Replace URLs with 'Url'
        message = re.sub(r'http[^\s]+', '{Url}', message)

        # Replace money symbols with 'MoSy'
        message = re.sub(r'£|\$', '{MoSy}', message)

        # Replace 10 or 11 digit phone numbers
        message = re.sub(r'0?(\d{10,}?)','{PhNu}', message)

        # Derive tokens
        token = nltk.word_tokenize(message)

        # Derive number of tokens
        n_token = len(token)

        # Derive the average length of a token
        avg_len = np.mean([len(word) for word in message.split()])

        # Derive the number of numerics
        n_num = len([tok for tok in message if tok.isdigit() or tok == '{PhNu}'])

        # Derive if the message has numerics
        has_num = np.where(n_num > 0,1,0)

        # Derive the number of uppercased words
        n_uppers = len([word for word in message if word.isupper()])

        # Derive the number of English stop words
        n_stops = len([word for word in message if word in stopwords.words('english')])

        # Derive the symbol columns
        has_email = np.where('{EmAd}' in message,1, 0)
        has_money = np.where('{MoSy}' in message,1, 0)
        has_phone = np.where('{PhNu}' in message,1,0)
        has_url = np.where('{Url}' in message,1,0)

        return n_token, avg_len, n_num, has_num, n_uppers, n_stops, has_email, has_money, has_phone, has_url
    
    def transform(self, messages, y=None):
        """
        Tranform the meta-data features extracted and convert into dataframe format
        """
        return messages.apply(self.extract_meta_data).progress_apply(pd.Series)

    def fit(self, messages, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

In [9]:
vec = CountVectorizer(stop_words = 'english', 
                        ngram_range = (1, 2), 
                        token_pattern=r'\b\w+\b',
                        min_df = 1)
vec = vec.fit(df_processed)

In [10]:
filename = 'best_vec.pkl'
with open(filename, 'wb') as f:
    pickle.dump(vec, f)

In [11]:
sparse_feat = vec.transform(df_processed)

dense_feat = MetaDataExtractor().fit_transform(df_raw.text)

100%|████████████████████████████████████████████████████████████████████████████| 5572/5572 [00:01<00:00, 4943.91it/s]


In [12]:
dense_feat = coo_matrix(MinMaxScaler().fit_transform(dense_feat))

In [13]:
X = hstack([sparse_feat, dense_feat.astype(float)])
y = (df_raw.label == 'spam').values.astype(np.int)
indices = df_raw.index

# Split train and test set
Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

In [14]:
clf = MultinomialNB(alpha = 5)
clf.fit(Xtrain, ytrain)

MultinomialNB(alpha=5)

In [15]:
get_score_figures(clf, Xtrain, Xtest, ytrain, ytest, 0.1)

Accuracy on training data: 0.991474
Accuracy on test data:     0.990135
AUC-ROC score     0.978032
F1 score     0.978263
Fbeta score for beta = 0.100000 is 0.985435


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, clf.predict(Xtest), labels=None, sample_weight=None)

array([[964,   2],
       [  9, 140]], dtype=int64)

In [17]:
filename = 'best_model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(clf, f)

In [18]:
with open(filename, 'rb') as f:
    clf_loaded = pickle.load(f)

In [19]:
clf_loaded

MultinomialNB(alpha=5)