In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

# 1. Load the Dataset

In [2]:
import pandas as pd
import numpy as np
 
# load dataset of sms messages

df =pd.read_csv('SPAM text message 20170820 - Data.csv', header=None, encoding='utf-8')


In [3]:
# Print useful information from dataset
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
 # check class distribution
classes = df[0]
print(classes.value_counts())    

ham     4825
spam     747
Name: 0, dtype: int64


 # 2. Preprocess the Data

In [5]:
# Convert class labels into binary 0=ham , 1=spam
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [6]:
# Store SMS Data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
# use regular expression to replace email addressses, phone numbers, numbers, other numbers, symbols

# email addresses with 'emailadr'

processed = text_messages.str.replace(r'^.+@[^\.], *\,[a-z]{2}$', 'emailaddr')

# replace urls with 'webaddress'

processed = processed.str.replace(r' ^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\$+)7$', 'webaddress')


# replace money symbols with 'moneysymb'
processed = processed.str.replace(r' £|\$', 'moneysymb')

# replace 10 digits phone numbers with simply 'phonenumber'

processed = processed.str.replace(r'^\(?[\d]{3}\)?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

# replace number with 'number'

processed = processed.str.replace(r'\d+(\.\d+)?', 'number')



In [8]:
# remove punctuation

processed= processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace btween term with single space

processed = processed.str.replace(r' \s+', ' ')

# removing leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+7$', '')


In [9]:
# change the word hello,HELLO, Hello all are same words

processed= processed.str.lower()

print(processed[:10])

0    go until jurong point crazy available only in ...
1                             ok lar joking wif u oni 
2    free entry in number a wkly comp to win fa cup...
3         u dun say so early hor u c already then say 
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been number wee...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile number months or more u r enti...
Name: 1, dtype: object


In [10]:
# remove stop words from the text messages.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '. join(term for term in x.split() if term not in stop_words))

In [11]:
# remove words stem usign Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '. join(ps.stem(term) for term in x.split()))

In [12]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl number week word back like fu...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receiveamone...
9       mobil number month u r entitl updat latest col...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash number number number pound ...
12      urgent number week free membership ourmoneysym...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [13]:
from nltk.tokenize import word_tokenize

#creatig a bag-of-words model
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [14]:

# print the total numbers of words and 15 most common words
print('Number of words: {}'.format(len(all_words)))
print ('Most common words: {}'. format(all_words.most_common(15)))

Number of words: 6613
Most common words: [('number', 2769), ('u', 1203), ('call', 675), ('go', 456), ('get', 452), ('ur', 383), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261)]


In [15]:
# use the 1500 most common words as feature
word_features = list(all_words.keys())[:1500]

In [26]:
# define find_features functions
def  find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
        return features
# lets see an example
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)
        

go


In [27]:
features

{'go': True}