In [1]:
#importing the required packages 
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer 

In [2]:
#loading the data set 
email_data = pd.read_csv("sms_raw_NB.csv")

In [4]:
#looking at the data 
email_data

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or �10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [5]:
#removing all the stop words from the data 
from nltk.corpus import stopwords

In [6]:
stop_words = set(stopwords.words('english'))

In [8]:
#cleaning the data
import re
def cleaning_text(i):
    i = re.sub("[^A-Za-z" "]+"," ",i).lower()
    i = re.sub("[0-9" "]+"," ",i)
    w = []
    for word in i.split(" "):
        if len(word)>3:
            w.append(word)
    return (" ".join(w))

In [33]:
email_data.text = email_data.text.apply(cleaning_text)

In [34]:
# removing empty rows
email_data = email_data.loc[email_data.text != " ",:]

In [35]:
#looking at the data after cleaning 
email_data

Unnamed: 0,type,text
0,ham,hope having good week just checking
1,ham,give back thanks
2,ham,also doing only have
3,spam,complimentary star ibiza holiday cash needs yo...
4,spam,okmail dear dave this your final notice collec...
...,...,...
5554,ham,great role model giving much really wish each ...
5555,ham,awesome remember last time somebody high first...
5556,spam,your prize will another customer polo suite lo...
5557,spam,jsco energy high know where channel leadership...


In [36]:
# splitting the data into train and test data sets 
from sklearn.model_selection import train_test_split

email_train, email_test = train_test_split(email_data, test_size = 0.2)


In [37]:
# creation of matrix for the entire document 
def split_into_words(i):
    return [word for word in i.split(" ")]

In [38]:
# Bag of Words
emails_bow = CountVectorizer(analyzer = split_into_words).fit(email_data.text)

In [39]:
#BOW for all messages
all_emails_matrix = emails_bow.transform(email_data.text)

In [40]:
# BOW For training messages
train_emails_matrix = emails_bow.transform(email_train.text)


In [41]:
# BOW For testing messages
test_emails_matrix = emails_bow.transform(email_test.text)


In [42]:
# Term weighting and normalizing on ALL emails
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)


In [43]:
# Preparing TFIDF for train emails
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
train_tfidf.shape # (row, column)


(4447, 6661)

In [44]:
# Preparing TFIDF for test emails
test_tfidf = tfidf_transformer.transform(test_emails_matrix)
test_tfidf.shape #  (row, column)


(1112, 6661)

In [45]:
# Preparing a naive bayes model on training data set 

from sklearn.naive_bayes import MultinomialNB as MB

In [46]:
# Multinomial Naive Bayes
classifier_mb = MB(alpha=1)
classifier_mb.fit(train_tfidf, email_train.type)


MultinomialNB(alpha=1)

In [47]:
# Evaluation on Test Data
test_pred_m = classifier_mb.predict(test_tfidf)
accuracy_test_m = np.mean(test_pred_m == email_test.type)
accuracy_test_m

0.9550359712230215

In [48]:
from sklearn.metrics import accuracy_score
accuracy_score(test_pred_m, email_test.type) 

pd.crosstab(test_pred_m, email_test.type)


type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,950,49
spam,1,112


In [49]:
# Training Data accuracy
train_pred_m = classifier_mb.predict(train_tfidf)
accuracy_train_m = np.mean(train_pred_m == email_train.type)
accuracy_train_m

0.969417584888689