In [1]:
#import required libraries

import pandas as pd
import string
from nltk.corpus import stopwords

In [2]:
#Get the spam data collection 

data = pd.read_csv('SpamCollection',sep='\t',names = ['response','message'])

In [3]:
#display first 5 rows of data

data.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#view more information about the data

data.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
#describe the data after grouping it according to response

data.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [6]:
#Verify length of the messages and also add it as a new column 

data['length'] = data['message'].apply(len)

In [7]:
#view first 5 messages with length

data.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [8]:
#define a function to get rid of stopwords present in the messages

def text_process(row):
    no_punct = [char for char in row if char not in string.punctuation]
    no_punct = ''.join(no_punct)
    return [word for word in no_punct.split() if word.lower() not in stopwords.words('english')]

In [9]:
#verify the function is working

data.head()['message'].apply(text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [10]:
#start text processing with vectorizer 

from sklearn.feature_extraction.text import CountVectorizer

In [11]:
#use bag of words by applying the function and fit the data into it

bag = CountVectorizer(analyzer = text_process).fit(data['message'])

In [12]:
#print length of bag of words stored in the vocabulary_ attribute

len(bag.vocabulary_)

11425

In [13]:
#store bag of words using transform

bag_cv = bag.transform(data['message'])

In [14]:
#apply tfidf transformer and fit the bag of words into it (transformed version)

from sklearn.feature_extraction.text import TfidfTransformer
bag_tfidf = TfidfTransformer().fit(bag_cv)

In [15]:
#print shape of the tfidf 

bag_transformed = bag_tfidf.transform(bag_cv)
print(bag_transformed.shape)

(5572, 11425)


In [16]:
#choose naive Bayes model to detect the spam and fit the tfidf data into it

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(bag_transformed,data['response'])

In [18]:
#check model for the predicted and expected value say for message#2 and message#5

message1 = data['message'][2]
bag_for_message = bag.transform([message1])
tfidf = bag_tfidf.transform(bag_for_message)

print('predicted',model.predict(tfidf[0]))
print('expected',data['response'][2])

predicted ['spam']
expected spam
