# Spam message filter by using Natural Language Processing


In [None]:
import sys 
import nltk
import sklearn
import pandas
import numpy

In [7]:
import pandas as pd 
import numpy as np 

df = pd.read_table('SMSSpamCollection',header = None);

In [8]:
df.head

<bound method NDFrame.head of          0                                                  1
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [9]:
classes = df[0]

In [11]:
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

## Pre-processing

In [15]:
# Encoding the labels with .
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder ()
Y=encoder.fit_transform(classes)
Y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [17]:
# store the text messages for training .

text_messages = df[1]
text_messages.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

In [18]:
# Regular expression makes the dataframe much more readable and much more trainable. 

# replace emails with 'email' . 

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [19]:

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [23]:
processed= processed.str.lower()
processed[56]

'congrats numbr year special cinema pass for numbr is yours call numbr now c suprman v matrixnumbr starwarsnumbr etc all numbr free bxnumbr ipnumbr numbrwe numbrpm dont miss out'

In [24]:
# remove stop words which dont add the useful information to the message 

from nltk.corpus import stopwords 

In [25]:
stop_words= set(stopwords.words('english'))
processed =processed .apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))

In [26]:
processed

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry numbr wkly comp win fa cup final tk...
3                     u dun say early hor u c already say
4                  nah think goes usf lives around though
                              ...                        
5567    numbrnd time tried numbr contact u u moneysymb...
5568                          ü b going esplanade fr home
5569                                pity mood suggestions
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

In [27]:
ps =nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term)for term in x.split()))

In [28]:
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

In [29]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [30]:
# Total numbers and most frequent numbers
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [31]:
word_features =list(all_words.keys())[:1500]

In [32]:
word_features

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'oni',
 'free',
 'entri',
 'numbr',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkt',
 'numbrst',
 'may',
 'text',
 'receiv',
 'question',
 'std',
 'txt',
 'rate',
 'c',
 'appli',
 'numbrovernumbr',
 'dun',
 'say',
 'earli',
 'hor',
 'alreadi',
 'nah',
 'think',
 'goe',
 'usf',
 'live',
 'around',
 'though',
 'freemsg',
 'hey',
 'darl',
 'week',
 'word',
 'back',
 'like',
 'fun',
 'still',
 'tb',
 'xxx',
 'chg',
 'send',
 'moneysymbnumbr',
 'rcv',
 'even',
 'brother',
 'speak',
 'treat',
 'aid',
 'patent',
 'per',
 'request',
 'mell',
 'oru',
 'minnaminungint',
 'nurungu',
 'vettam',
 'set',
 'callertun',
 'caller',
 'press',
 'copi',
 'friend',
 'winner',
 'valu',
 'network',
 'custom',
 'select',
 'receivea',
 'prize',
 'reward',
 'claim',
 'call',
 'code',
 'klnumbr',
 'valid',
 'hour',
 'mobil',


In [35]:
def find_features(message):
    words=word_tokenize(message)
    features = {}
    
    for word in word_features : 
        features[word] = (word in words)
        
    return features


In [36]:
features = find_features(processed[0])

In [37]:
features

{'go': True,
 'jurong': True,
 'point': True,
 'crazi': True,
 'avail': True,
 'bugi': True,
 'n': True,
 'great': True,
 'world': True,
 'la': True,
 'e': True,
 'buffet': True,
 'cine': True,
 'got': True,
 'amor': True,
 'wat': True,
 'ok': False,
 'lar': False,
 'joke': False,
 'wif': False,
 'u': False,
 'oni': False,
 'free': False,
 'entri': False,
 'numbr': False,
 'wkli': False,
 'comp': False,
 'win': False,
 'fa': False,
 'cup': False,
 'final': False,
 'tkt': False,
 'numbrst': False,
 'may': False,
 'text': False,
 'receiv': False,
 'question': False,
 'std': False,
 'txt': False,
 'rate': False,
 'c': False,
 'appli': False,
 'numbrovernumbr': False,
 'dun': False,
 'say': False,
 'earli': False,
 'hor': False,
 'alreadi': False,
 'nah': False,
 'think': False,
 'goe': False,
 'usf': False,
 'live': False,
 'around': False,
 'though': False,
 'freemsg': False,
 'hey': False,
 'darl': False,
 'week': False,
 'word': False,
 'back': False,
 'like': False,
 'fun': False,
 's

In [48]:

# Do it for all
messages = list(zip(processed, Y))


seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

## Train - test split  

In [57]:
from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [50]:
print(len(training))
print(len(testing))

4179
1393


### Model is created by using the SVC which belongs to sklearn library. 

In [52]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.42067480258436


## For different classifier methods , accuracy is listed.

In [53]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 93.03661162957646
Decision Tree Accuracy: 96.69777458722182
Random Forest Accuracy: 98.1335247666906
Logistic Regression Accuracy: 98.49246231155779
SGD Classifier Accuracy: 98.20531227566404
Naive Bayes Accuracy: 98.27709978463747
SVM Linear Accuracy: 98.42067480258436
