# Using NLTK with Sklearn to classify SPAM and HAM messages in a SMS

In [1]:
import pandas as pd
import numpy as np
import nltk

### 1. Load the Dataset

In [2]:

df = pd.read_table('SMSSpamcollection',header=None,encoding='utf-8')

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Print useful information about the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [5]:
df.size

11144

In [6]:
#Check class distribution
classes = df[0]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

##### This is a skewed distribution

### 2. Preprocessing the Data

In [7]:
# convert class labels to binary values 
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

In [8]:
print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [9]:
print(classes[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object


In [10]:
#Store the raw data messages
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [11]:
#From REGEX Lib.com

In [12]:
# Regex to replace emails, urls , phone nos., other numbers , money sybols
#Replace email id with emailaddr

In [13]:
# Replace email id with 'emailaddr'
processed = text_messages.str.replace('^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$', 'emailaddr')

#Replace urls with 'webaddress'
processed = processed.str.replace('^(http(s?)\:\/\/)*[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$','webaddr')

#Replace money symbols with 'moneysymb'
processed = processed.str.replace(r'€|\$','moneysymb')

# Replace 10 digit phone number with 'phonenumbr'
processed = processed.str.replace('^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$','phonenumbr')

#replace normal numbers with numbr
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [14]:
#Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]',' ')

In [15]:
#Replace white space between words 
processed = processed.str.replace(r'\s+', ' ')

In [16]:
#remove leading and trailing spaces
processed = processed.str.replace(r'^\s+|\s+?$','')

In [17]:
#Change words to lower cases
processed = processed.str.lower()

In [18]:
print(processed[:10])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


In [19]:
#Remove stop words from messages
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [20]:
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [21]:
print(processed[:10])

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
5    freemsg hey darling numbr week word back like ...
6       even brother like speak treat like aids patent
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile numbr months u r entitled update latest...
Name: 1, dtype: object


In [22]:
#Stemming

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [23]:
print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea num...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


In [24]:
# Featuring Engineering - Generating features
#Words in each text messages are going to be our features

#!. tokenize

from nltk.tokenize import word_tokenize
#creates bag of words model

all_words = []
for messages in processed:
    words = word_tokenize(messages)
    for w in words:
        all_words.append(w)

In [25]:
all_words = nltk.FreqDist(all_words)

In [26]:
print('Count of all words: {}'.format(len(all_words)))

Count of all words: 6572


In [27]:
print('Most common words: {}'.format(all_words.most_common(15)))

Most common words: [('numbr', 2959), ('u', 1207), ('call', 679), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('free', 284), ('day', 276), ('know', 275), ('ok', 274), ('love', 266), ('like', 261)]


In [28]:
#Use 1500 most common words are features
word_features = list(all_words.keys())[:1500]

In [29]:
def find_features(messages):
    words = word_tokenize(messages)
    features = {}
    for word in word_features:
        features[word] = (w in words)
        
    return features

In [30]:
features = find_features('Hi numbr h r u')
for key, value in features.items():
    if value == True:
        print(key)

In [31]:
#Find features for all messages
messages = zip(processed,Y)

seed = 1
np.random.seed = seed
#np.random.shuffle(messages)

featuressets = [(find_features(text), label) for (text,label) in messages]

In [32]:
#Split trg and testing datasets using sklearn
from sklearn.model_selection import train_test_split

training,testing = train_test_split(featuressets, test_size=0.25, random_state = seed)

print('Training : {}'.format(len(training)))
print('Testing: {}'.format(len(testing)))

Training : 4179
Testing: 1393


### Scikit Learn classifiers with nltk


In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [34]:
names = ['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regr','SGD Classifier','Multinomial NB','SVC']

In [39]:
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    SVC(kernel='linear')
]

models = list(zip(names,classifiers))



In [40]:
for x, y in models:
    print( x,y)

K Nearest Neighbors KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
Decision Tree DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Random Forest RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Logisti

In [41]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing ) * 100
    print('{} : Accuracy {}'.format(name,accuracy))

K Nearest Neighbors : Accuracy 86.71931083991386
Decision Tree : Accuracy 86.71931083991386




Random Forest : Accuracy 86.71931083991386




Logistic Regr : Accuracy 86.71931083991386




SGD Classifier : Accuracy 86.71931083991386
Multinomial NB : Accuracy 86.71931083991386


In [42]:
#ensemble voting classifierr
from sklearn.ensemble import VotingClassifier

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting='hard', n_jobs=-1))

nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing ) * 100



In [43]:
#Make predictions
txt_features, labels = zip(*testing) #This is unzipping
prediction = nltk_ensemble.classify_many(txt_features)

In [None]:
#print report
print(classification_report(labels,prediction))
pd.DataFrame(
    confusion_matrix(labels,prediction),
    index = [['actual', 'actual', ['ham','spam']]],
    columns = [['predicted','predicted', ['ham','spam']]]
)