In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

In [2]:
import pandas as pd
import numpy as np

# Load the data of sms messages
df = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')

In [3]:
# Print useful informations about the data set
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Check the class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the data

In [5]:
# convert the class labels to binary values, 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])
print(classes[:10])

[0 0 1 0 0 1 0 0 1 1]
0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object


In [6]:
# Store the sms message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
# Use regular exp to replace email address, url, phone no, other number, symbols

# Replace email addressess with "emailaddr"
processed = text_messages.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$','emailaddr')

# Replace web address with "webaddress"
processed = text_messages.str.replace(r'#\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))#iS$','webaddress')

# Replace money symbols with "moneysymb"
processed = text_messages.str.replace(r'|\$','moneysymb')

# Replace phonenumber with "phonenumber"
processed = text_messages.str.replace(r'^([\+][0-9]{1,3}([ \.\-])?)?([\(]{1}[0-9]{3}[\)])?([0-9A-Z \.\-]{1,32})((x|ext|extension)?[0-9]{1,4}?)$','phonenumber')

# Replace normal numbers with "numbr"
processed = text_messages.str.replace(r'\d+(\.\d+)?','numbr')


In [8]:
# Remove punctuations
processed = processed.str.replace(r'[^\w\d\s]',' ')

# Remove whitespace with single space
processed = processed.str.replace(r'\s+',' ')

# Remve leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$','')

In [9]:
# change the world to lower case - Hello, HELLO, hello are all same
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [10]:
# Remove stop words from text messages

from nltk.corpus import stopwords

stops_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stops_words))

In [11]:
# Remove the word stems using a Porter Stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [12]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u numbr pound...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [13]:
from nltk.tokenize import word_tokenize

# Creating a bag of words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)        


In [None]:
# Print the total number of words and 15 most common words
print("Number of words: {}".format(len(all_words)))
print("Most common words: {}".format(all_words.most_common(50)))

In [None]:
# Use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]
print(word_features)

In [32]:
# define a find_features functions
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        if word in words:
            features[word] = 1
        else:
            features[word] = 0
        
    return features

# Lets see some results
print(processed[4])
features = find_features(processed[4])
print(features)
for key, value in features.items():
    if value == {True}:
        print(key)

nah think goe usf live around though
{'go': 0, 'jurong': 0, 'point': 0, 'crazi': 0, 'avail': 0, 'bugi': 0, 'n': 0, 'great': 0, 'world': 0, 'la': 0, 'e': 0, 'buffet': 0, 'cine': 0, 'got': 0, 'amor': 0, 'wat': 0, 'ok': 0, 'lar': 0, 'joke': 0, 'wif': 0, 'u': 0, 'oni': 0, 'free': 0, 'entri': 0, 'numbr': 0, 'wkli': 0, 'comp': 0, 'win': 0, 'fa': 0, 'cup': 0, 'final': 0, 'tkt': 0, 'numbrst': 0, 'may': 0, 'text': 0, 'receiv': 0, 'question': 0, 'std': 0, 'txt': 0, 'rate': 0, 'c': 0, 'appli': 0, 'numbrovernumbr': 0, 'dun': 0, 'say': 0, 'earli': 0, 'hor': 0, 'alreadi': 0, 'nah': 1, 'think': 1, 'goe': 1, 'usf': 1, 'live': 1, 'around': 1, 'though': 1, 'freemsg': 0, 'hey': 0, 'darl': 0, 'week': 0, 'word': 0, 'back': 0, 'like': 0, 'fun': 0, 'still': 0, 'tb': 0, 'xxx': 0, 'chg': 0, 'send': 0, 'rcv': 0, 'even': 0, 'brother': 0, 'speak': 0, 'treat': 0, 'aid': 0, 'patent': 0, 'per': 0, 'request': 0, 'mell': 0, 'oru': 0, 'minnaminungint': 0, 'nurungu': 0, 'vettam': 0, 'set': 0, 'callertun': 0, 'caller': 0

In [33]:
# find features set for all messages
messages = zip(processed, Y)

# define seed for reproducability
seed = 1
np.random.seed = seed
#np.random.shuffle(messages)

# Call find_features for all SMS messages
featuresets = [(find_features(text), label) for (text,label) in messages]

#print(featuresets[0])

#from sklearn.feature_extraction import DictVectorizer

#vec = DictVectorizer()
#X = vec.fit_transform([item[0] for item in featuresets])
#Y = [item[1] for item in featuresets]

In [34]:
# Split training and testing datasets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [35]:
print(len(training))
print(len(testing))

4179
1393


In [None]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform([item[0] for item in ])

## 3. Scikit-Learn Classifier with NLTK

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [41]:
# Define model to train
names = ['K Nearest Neightbours', 'Decision Tree', 'Random Forest',
        'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier( max_iter= 100),
    MultinomialNB(),
    SVC(kernel='linear')
]

# Return dictionary
models = zip(names, classifier)

In [42]:
# Wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    #print(model)
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy: {}'.format(name,accuracy))

K Nearest Neightbours: Accuracy: 93.96984924623115
Decision Tree: Accuracy: 97.84637473079684




Random Forest: Accuracy: 98.49246231155779




Logistic Regression: Accuracy: 98.85139985642498
SGD Classifier: Accuracy: 98.42067480258436
Naive Bayes: Accuracy: 98.63603732950466
SVM Linear: Accuracy: 98.92318736539842


In [48]:
# Build a ensemble method - Voting classifier (Used to combine all classification algorithm used above)

from sklearn.ensemble import VotingClassifier

# Define model to train
names = ['K Nearest Neightbours', 'Decision Tree', 'Random Forest',
        'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier( max_iter= 100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(names, classifier))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print("Ensemble method accuracy: {}".format(accuracy))


Ensemble method accuracy: 98.85139985642498


In [50]:
# Make the class label predictions for testing set

# Un-zipping file
txt_features, labels = zip(*testing)

predictions = nltk_ensemble.classify_many(txt_features)

In [51]:
# Print the confusion matrix and a classification report
print(classification_report(labels, predictions))

pd.DataFrame(
    confusion_matrix(labels,predictions),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.99      0.92      0.96       185

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,1
actual,spam,15,170
