In [27]:
import sys #module allows us to operate on underlying interpreter (Python interpreter is a bytecode interpreter: its input is instruction sets called bytecode)
import nltk #module allows to work with human language data
import sklearn #module allows us to work with machine learning library for python
import pandas #data analysis library whic provides easy-to-use data structures for data analysis.
import numpy #library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
NLTK: 3.4
Scikit-learn: 0.20.1
Pandas: 0.23.4
Numpy: 1.15.4


# Load the DataSet

In [28]:
import pandas as pd
import numpy as np

# Load the Dataset of SMS Messages
# read_table(...) - Read general delimited file into DataFrame
df = pd.read_table("SMSSpamCollection", header = None, encoding='utf-8')

In [29]:
#Print Info about the Data Set
print(df.info()) # full summary of the datafram.
print(df.head()) # returns the first n rows for the object based on position. It is useful for quickly testing if your object has the right type of data in it

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [30]:
#Check Class distribution, to check how many spam or ham messages we have
classes = df[0] #First column of the dataset
print(classes.value_counts()) #resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default.

ham     4825
spam     747
Name: 0, dtype: int64


## Preprocess the Data

In [31]:
#Convert class labels to classification values(binary values, 0=ham, 1=spam) since ham and spam doesnt make sense to machine learning algorithms.
#we will use sklearn encoder to convert.

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() # used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.
Y = encoder.fit_transform(classes) # Fit label encoder and return encoded labels
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [32]:
# Store SMS Message Data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [33]:
#use Regular expression to replace email address, urls, phone numbers, other numbers, symbols
#replace email addresses with 'emailaddr'
processed = text_messages.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress');
# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [34]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [11]:
#change words to lower case - Hello, HELLO, hello are all same word
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [35]:
#Corpus is a large collection of texts.
#nltk.corpus : The modules in this package provide functions that can be used to read corpus files in a variety of formats
from nltk.corpus import stopwords

#Stopwords : Words in English which doesn't add any useful information to the message, 
#natural language words which have very little meaning, such as "and", "the", "a", "an", and similar words
#Set() creates an interable set.
stop_words = set(stopwords.words('english'))

#lambda operator or lambda function is used for creating small, one-time and anonymous function objects in Python
#Append to the string all of teh words as long as they are not included in stop_words
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))


In [34]:
#Stem the word, extract the words,
#A stem is a part of a word, a stem is a form to which affixes can be attached
#For Ex : English word friendships contains the stem friend, to which the derivational suffix -ship is attached to form a new stem friendship, to which the inflectional suffix -s is attached.
#remove word stems using a porter stemmer(Algorithm or process for removing the commoner morphological(relating to form) and inflexional( involving a change in the form of a word) endings from words in English)
porter_stemmer = nltk.PorterStemmer()
#stem(term) : Strip affixes from the term and return the stem.
processed = processed.apply(lambda x:' '.join(porter_stemmer.stem(term) for term in x.split()))
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl numbr week word back like fun...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil numbr month u r entitl updat latest colo...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash numbr numbr numbr pound txt...
12      urgent numbr week free membership moneysymbnum...
13      search right word thank breather promi wont ta...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [36]:
# Now we can move on to generating features.
# Feature Engineering is the process of using domain knowledge of data which we have gained by going through these data, 
# to create features for machine learning algorithms
# so here in that date the words in the text message is going to be our features and for this purpose it is neccessary to tokenize each word
from nltk.tokenize import word_tokenize #A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses).

#Extracting all the words as tokens bag of words model
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

# FreqDist() class is used to encode “frequency distributions”, which count the number of times that each outcome of an experiment occurs.
all_words = nltk.FreqDist(all_words)

In [37]:
# Print total number of words and also 15 most common words
print('number of words: {}'.format(len(all_words)))
print('most common words: {}'.format(all_words.most_common(15)))

number of words: 10097
most common words: [('numbr', 2642), ('I', 2022), ('u', 821), ('call', 405), ('U', 386), ('get', 338), ('gt', 318), ('lt', 316), ('moneysymbnumbr', 303), ('ur', 298), ('You', 290), ('know', 255), ('go', 250), ('like', 235), ('got', 217)]


In [60]:
# Use 2000 most common words as features
word_features = list(all_words.keys())[:1500]

In [61]:
# Define a function that will determine which of these 2000 word features are contained in each review 
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

# Example
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

Go
jurong
point
crazy
Available
bugis
n
great
world
la
e
buffet
Cine
got
amore
wat


In [62]:
# Find Features for all the messages
messages = list(zip(processed, Y)) # Note : Y is spam or not spam, binary class label

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [63]:
# Split training and testing data sets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [64]:
# Print length of training and testing the data
print('Training: {}'.format(len(training)))
print('Testing: {}'.format(len(testing)))

Training: 4179
Testing: 1393


In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [66]:
# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
print(models)

[('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')), ('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')), ('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_sta

In [67]:
# wrap models in NLTK 
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy: {}'.format(name, accuracy))

K Nearest Neighbors: Accuracy: 93.75448671931083
Decision Tree: Accuracy: 97.5592246949031




Random Forest: Accuracy: 98.42067480258436




Logistic Regression: Accuracy: 98.77961234745155




SGD Classifier: Accuracy: 97.91816223977028
Naive Bayes: Accuracy: 98.42067480258436
SVM Linear: Accuracy: 98.34888729361091


In [68]:
# Ensemble methods for Voting classifier 
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.34888729361091


In [69]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [70]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1192
           1       0.99      0.94      0.96       201

   micro avg       0.99      0.99      0.99      1393
   macro avg       0.99      0.97      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1190,2
actual,spam,13,188
