In [1]:
# Import modules necessary for the spam filter

import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import coo_matrix, hstack

In [2]:
# Locate the data directory: a 'Data' folder that sits beside the current working directory

parent_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(parent_dir, 'Data')

In [3]:
# Restore objects that were persisted earlier with IPython's %store magic
# (they must already have been stored, otherwise the names stay undefined).
#   best_model - used in classify_spam via best_model.predict(...) on the feature frame
#   best_vec   - used in classify_spam via best_vec.predict_proba(...) on the message text

%store -r best_model
%store -r best_vec

In [4]:
# Build the spam classifier. The caller supplies the messages to classify.

def _mask_entities(text):
    """Replace e-mail addresses, URLs, money symbols and long digit runs with placeholder tags."""
    # The substitutions run in this fixed order; each one sees the previous one's output.
    # E-mail addresses -> '{EmAd}'
    text = re.sub(r'[^\s]+@.[^\s]+', '{EmAd}', text)
    # 'http' followed by non-whitespace characters -> '{Url}'
    text = re.sub(r'http[^\s]+', '{Url}', text)
    # Pound and dollar signs -> '{MoSy}'
    text = re.sub(r'£|\$', '{MoSy}', text)
    # 10 consecutive digits, optionally preceded by a 0 -> '{PhNu}'
    # (the quantifier is lazy, so longer digit runs are consumed 10 digits at a time)
    text = re.sub(r'0?(\d{10,}?)', '{PhNu}', text)
    return text


def classify_spam(message):
    """
    Classify a batch of messages with the stored classifier.

    Parameters
    ----------
    message : str or iterable of str
        The messages to classify. A single string is treated as a batch of one.

    Returns
    -------
    array-like
        The output of ``best_model.predict`` for the batch, one entry per message
        (``spam_filter`` treats 0 as ham and anything else as spam). An empty
        batch yields an empty numpy array without calling the model.

    Notes
    -----
    Relies on ``best_vec`` and ``best_model`` being present in the notebook
    namespace (restored with ``%store -r``).
    """
    # A bare string cannot be turned into a one-column DataFrame without an
    # index, so treat it as a batch containing one message.
    if isinstance(message, str):
        message = [message]

    ## Step 1: Preprocessing the message
    # Mask entities on plain strings. Writing into the rows yielded by
    # DataFrame.iterrows() is not guaranteed to modify the frame (the rows may be
    # copies), so the substitutions are applied before any frame is built.
    texts = [_mask_entities(text) for text in message]
    if not texts:
        # Nothing to score
        return np.array([])

    # Derive tokens
    tokens_per_msg = [nltk.word_tokenize(text) for text in texts]

    # Build the stop-word lookup once instead of re-reading the list for every token
    stop_words = set(stopwords.words('english'))

    # Number of tokens that are all digits or exactly equal to '{PhNu}'.
    # NOTE(review): if the tokenizer splits '{' / '}' into separate tokens, the
    # '{PhNu}' comparison never matches - confirm against the training features.
    n_num = [
        sum(1 for tok in tokens if tok.isdigit() or tok == '{PhNu}')
        for tokens in tokens_per_msg
    ]

    # Column order is kept exactly as before: the model receives these columns
    # followed by the probability columns, in this order.
    feature_order = ['n_token', 'avg_wlen', 'n_num', 'has_num', 'n_uppers', 'n_stops',
                     'has_email', 'has_money', 'has_phone', 'has_url']
    features = pd.DataFrame({
        # Number of tokens
        'n_token': [len(tokens) for tokens in tokens_per_msg],
        # Average token length; 0.0 for a message with no tokens (np.mean([]) would be NaN)
        'avg_wlen': [
            float(np.mean([len(tok) for tok in tokens])) if tokens else 0.0
            for tokens in tokens_per_msg
        ],
        'n_num': n_num,
        # 1 if the message has at least one numeric token, else 0
        'has_num': [int(count > 0) for count in n_num],
        # Number of fully upper-cased tokens
        'n_uppers': [sum(1 for tok in tokens if tok.isupper()) for tokens in tokens_per_msg],
        # Number of English stop words (case-sensitive match, as before)
        'n_stops': [sum(1 for tok in tokens if tok in stop_words) for tokens in tokens_per_msg],
        # Placeholder-presence flags, checked on the masked text
        'has_email': [int('{EmAd}' in text) for text in texts],
        'has_money': [int('{MoSy}' in text) for text in texts],
        'has_phone': [int('{PhNu}' in text) for text in texts],
        # NOTE(review): matches 'Url' without braces, unlike the other flags. Left
        # unchanged so the feature stays aligned with what the stored model
        # presumably saw in training - confirm against the training notebook.
        'has_url': [int('Url' in text) for text in texts],
    }, columns=feature_order)

    ## Step 2: Creating a prediction based on the message
    # Text-model probabilities become extra feature columns (named 0, 1, ...)
    text_col = pd.Series(texts, name='text')
    Xproba = pd.DataFrame(best_vec.predict_proba(text_col))
    X = pd.merge(features, Xproba, left_index=True, right_index=True)

    # Predict the result
    y_pred = best_model.predict(X)
    return y_pred

In [15]:
# A sample message batch: three SMS texts used to try out the filter.
# Note: a backslash at the end of a line inside a string literal continues the
# literal on the next line, so that line's leading spaces become part of the message.

message = ["You have still not claimed the compensation you are due for the accident you had. \
            To start the process please reply YES. To opt out text STOP", 
           "Don't ever buy me curry noodles again I hate them theyre too spicey AN they make me cry",
          "Welcome to South Korea! A local call or a call to Can is $1.00/min, $0.50/sms and $0.50/MB. \
           For more information visit //frdm.mobi/wrldroam or call +16477001611--"]

In [6]:
# Construct our spam_filter bot

def spam_filter(message):
    """
    Print a ham/spam verdict for every message in the batch.

    Each prediction from classify_spam is mapped to 'ham' when it equals 0 and
    to 'spam' otherwise; one line is printed per message, numbered from 1.
    Nothing is returned.
    """
    verdicts = ['ham' if flag == 0 else 'spam' for flag in classify_spam(message)]
    for position, verdict in enumerate(verdicts, start=1):
        headline = 'The algorithm thinks your message number {:d}'.format(position)
        print(headline + ' is {:s}'.format(verdict))

In [17]:
# Try spam_filter on the sample batch; it prints one verdict line per message

spam_filter(message)

The algorithm thinks your message number 1 is spam
The algorithm thinks your message number 2 is ham
The algorithm thinks your message number 3 is spam


In [8]:
# Load the original tab-separated SMS dataset so the filter can be timed on it.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if the
# row count looks short compared with the raw file, check whether quoting needs disabling.

raw_path = os.path.join(data_dir, 'SMSSpamCollection.txt')
df_raw = pd.read_csv(raw_path, sep='\t', header=None)
df_raw.columns = ['label', 'text']
df_raw.shape

(5572, 2)

In [9]:
# Use the first 100 messages of the dataset as one batch

message_100_test = df_raw['text'].iloc[:100].tolist()

In [10]:
%%time

# Time one spam_filter call on the 100-message batch.
# %%time has to stay on the first line of the cell to act as a cell magic.

spam_filter(message_100_test)

The algorithm thinks your message number 1 is ham
The algorithm thinks your message number 2 is ham
The algorithm thinks your message number 3 is spam
The algorithm thinks your message number 4 is ham
The algorithm thinks your message number 5 is ham
The algorithm thinks your message number 6 is spam
The algorithm thinks your message number 7 is ham
The algorithm thinks your message number 8 is ham
The algorithm thinks your message number 9 is spam
The algorithm thinks your message number 10 is spam
The algorithm thinks your message number 11 is ham
The algorithm thinks your message number 12 is spam
The algorithm thinks your message number 13 is spam
The algorithm thinks your message number 14 is ham
The algorithm thinks your message number 15 is ham
The algorithm thinks your message number 16 is spam
The algorithm thinks your message number 17 is ham
The algorithm thinks your message number 18 is ham
The algorithm thinks your message number 19 is ham
The algorithm thinks your message

In [13]:
# A single real-world promotional SMS, wrapped in a list so it forms a batch of one.
# The literal is split with implicit string concatenation; the resulting text is
# unchanged. The backslash before "Reply" is written as an explicit '\\' because
# '\R' is not a valid escape sequence (it only worked by falling back to a literal
# backslash, and newer Python versions warn about it).
# NOTE(review): that backslash looks like a leftover line-continuation marker -
# confirm whether it belongs in the message at all.
new_message = [
    "Hi Yaru, this week in Fido XTRA, Fido gives back to 3 amazing causes! YOU CHOOSE, FIDO DONATES! 💗 Tell us which cause you'd like Fido to support, in Fido XTRA: fidoapp://screen/more/FidoXTRA  "
    "Keep an eye on your app and make sure your notifications are ON so you won't miss out on the best of Fido XTRA. "
    "Questions? fido.ca/contactus \\Reply STOP to opt out of Fido text msgs."
]

In [14]:
# Classify the one-message batch; spam_filter prints the verdict
spam_filter(new_message)

The algorithm thinks your message number 1 is ham
