In [1]:
# Importing the libraries
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [2]:
# Loading the dataset
df = pd.read_table('SMSSPamCollection', header = None, encoding = 'utf-8')

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [5]:
classes = df[0]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
Y

array([0, 0, 1, ..., 0, 0, 0])

In [7]:
texts = df[1]
texts[:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

In [8]:
# Using regular expressions for replacing the email ids, web addresses, money symbols, phone nos and other nos

processed = texts.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [9]:
# Removing punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replacing whitespace between words with single space
processed = processed.str.replace(r'\s+', ' ')

# Removing leading & trailing whitespaces
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [10]:
# Changing the words to lower case
processed = processed.str.lower()
processed

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [11]:
# Removing stop words from text messages
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [12]:
# Removing word stems using Porter stemmer
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [13]:
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

## Feature engineering

In [14]:
# create bag-of-words model
from nltk.tokenize import word_tokenize

all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [15]:
len(all_words)

6579

In [16]:
# Most common 15 words
all_words.most_common(15)

[('numbr', 2648),
 ('u', 1207),
 ('call', 674),
 ('go', 456),
 ('get', 451),
 ('ur', 391),
 ('gt', 318),
 ('lt', 316),
 ('come', 304),
 ('moneysymbnumbr', 303),
 ('ok', 293),
 ('free', 284),
 ('day', 276),
 ('know', 275),
 ('love', 266)]

In [17]:
word_features = list(all_words.keys())[:]
word_features

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'oni',
 'free',
 'entri',
 'numbr',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkt',
 'numbrst',
 'may',
 'text',
 'receiv',
 'question',
 'std',
 'txt',
 'rate',
 'c',
 'appli',
 'numbrovernumbr',
 'dun',
 'say',
 'earli',
 'hor',
 'alreadi',
 'nah',
 'think',
 'goe',
 'usf',
 'live',
 'around',
 'though',
 'freemsg',
 'hey',
 'darl',
 'week',
 'word',
 'back',
 'like',
 'fun',
 'still',
 'tb',
 'xxx',
 'chg',
 'send',
 'moneysymbnumbr',
 'rcv',
 'even',
 'brother',
 'speak',
 'treat',
 'aid',
 'patent',
 'per',
 'request',
 'mell',
 'oru',
 'minnaminungint',
 'nurungu',
 'vettam',
 'set',
 'callertun',
 'caller',
 'press',
 'copi',
 'friend',
 'winner',
 'valu',
 'network',
 'custom',
 'select',
 'receivea',
 'prize',
 'reward',
 'claim',
 'call',
 'code',
 'klnumbr',
 'valid',
 'hour',
 'mobil',


In [18]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [19]:
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [21]:
messages = list(zip(processed, Y))

featuresets = [(find_features(text), label) for (text, label) in messages]

In [22]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=42)

In [24]:
print(len(training))
print(len(testing))

4179
1393


In [30]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))
model.train(training)

accuracy = nltk.classify.accuracy(model, testing) * 100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.34888729361091


In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [32]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "Naive Bayes", "SVM Linear"]

In [33]:
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

In [34]:
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 93.53912419239052
Decision Tree Accuracy: 97.5592246949031
Random Forest Accuracy: 98.20531227566404
Logistic Regression Accuracy: 98.49246231155779
Naive Bayes Accuracy: 97.98994974874373
SVM Linear Accuracy: 98.34888729361091


In [35]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.34888729361091


In [36]:
txt_features, labels = list(zip(*testing))

prediction = nltk_ensemble.classify_many(txt_features)

In [37]:
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1207
           1       1.00      0.88      0.93       186

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,0
actual,spam,23,163
