In [0]:
import sys
import nltk
import sklearn
import pandas
import numpy

In [2]:
print("Python: {}".format(sys.version))
print("NLTK: {}".format(nltk.__version__))
print("Sklearn: {}".format(sklearn.__version__))
print("pandas: {}".format(pandas.__version__))
print("numpy: {}".format(numpy.__version__))

Python: 3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]
NLTK: 3.2.5
Sklearn: 0.22.1
pandas: 0.25.3
numpy: 1.17.5


## 1. Load the Dataset

In [0]:
import pandas as pd
import numpy as np

# load the sms
df = pd.read_table('SMSSpamCollection', header=None, encoding='utf-8')

In [5]:
# print useful data
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
# check the class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the data

In [7]:
# convert class labels to binary values 0=ham, 1=spam
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [8]:
#store the sms message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [0]:
# use regular expression to replace email, urls,

# replace email address with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}{/\S*}?$', 'webaddress')

# replace money symbols with 'moneysymb'
processed = processed.str.replace(r'�|\$','moneysymb')

# replcae 10 digit phone numbers with 'phone number'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

# replace normal numbers with numbr
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [0]:
# replace punc
processed = processed.str.replace(r'[\n\d\s]', ' ')

#replace whitespace
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [11]:
# changing the words to lowercase
processed = processed.str.lower()
print(processed[:10])

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in numbr a wkly comp to win fa cup ...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
5    freemsg hey there darling it's been numbr week...
6    even my brother is not like to speak with me. ...
7    as per your request 'melle melle (oru minnamin...
8    winner!! as a valued network customer you have...
9    had your mobile numbr months or more? u r enti...
Name: 1, dtype: object


In [18]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# remove stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [0]:
# remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [0]:
from nltk.tokenize import word_tokenize

# creating a bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [20]:
# print total no of words and print most common
print("Number of words:: {}".format(len(all_words)))
print("Most common words:: {}".format(all_words.most_common(15)))

Number of words:: 8525
Most common words:: [('.', 4944), ('numbr', 2280), (',', 1979), ('?', 1550), ('!', 1397), ('...', 1261), ('u', 1111), ('&', 922), (';', 768), (':', 744), ('call', 644), ('i', 626), (')', 499), ('get', 445), ('go', 443)]


In [0]:
# use the first 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [22]:
# define find_features function
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
,
crazy..
avail
bugi
n
great
world
la
e
buffet
...
cine
got
amor
wat


In [0]:
# find features for all messages
messages = zip(processed, Y)

# define a seed for reproducability
seed = 1
np.random.seed = seed
#np.random.shuffle(messages)

# call find_features for each sms messages
featuresets = [(find_features(text), label) for (text, label) in messages]

In [0]:
# split training and testing data sets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state= seed)


In [25]:
print('Training {}'.format(len(training)))
print('Testing {}'.format(len(testing)))

Training 4179
Testing 1393


## 3. Scikit-Learn Classifiers with NLTK

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [0]:
# define the model to train
names = ['K Nearest Neighbors', 'Decision Tree',
        'Random Forest', 'Logistic Regression',
        'SGD Classifier', 'Naive Bayes', 
        'SVN Linear']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]
models = zip(names, classifier)

In [30]:
list(models)

[('K Nearest Neighbors',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform')),
 ('Decision Tree',
  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=None, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=None, splitter='best')),
 ('Random Forest',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
     

In [31]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for (name, model) in zip(names, classifier):
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy: {}'.format(name, accuracy))


K Nearest Neighbors: Accuracy: 93.25197415649676
Decision Tree: Accuracy: 97.4156496769562
Random Forest: Accuracy: 98.63603732950466
Logistic Regression: Accuracy: 98.63603732950466
SGD Classifier: Accuracy: 98.27709978463747
Naive Bayes: Accuracy: 98.42067480258436
SVN Linear: Accuracy: 98.56424982053123


In [0]:
# ensemble method - Voting Classifier
from sklearn.ensemble import VotingClassifier

models = zip(names, classifier)

In [34]:
nltk_ensamble = SklearnClassifier(VotingClassifier(estimators=list(zip(names, classifier)), voting='hard', n_jobs=-1))
nltk_ensamble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensamble, testing) * 100
print("Ensemble Method Accuracy: {}".format(accuracy))

Ensemble Method Accuracy: 98.56424982053123
