# Spam Text Classification

In [1]:
"""
cd .\02spam_msg_classification\
jupyter nbconvert --to markdown spam.ipynb --output README.md

"""
import pandas as pd
import numpy as np
import nltk


## Load, Explore, and Clean Data

[dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [3]:
# Read the raw SMS dataset; the path is relative to the notebook's working directory.
DATA_PATH = 'spam.csv'
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three trailing "Unnamed" columns so that only v1 and v2 remain.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Give the two remaining columns descriptive names (v1 -> label, v2 -> messages).
column_names = {'v1': 'label', 'v2': 'messages'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the target as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
df["label"] = df["label"].replace(label_codes)
df.head()


Unnamed: 0,label,messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Length of each message in characters (len() of a string counts characters,
# not words), stored in the "Count" column.
df['Count'] = df['messages'].str.len()
df.head()


Unnamed: 0,label,messages,Count
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [8]:
# Total ham(0) and spam(1) messages
df['label'].value_counts()


0    4825
1     747
Name: label, dtype: int64

## Preprocess Messages

In [9]:
# Show the raw text of the messages labelled 0, 1 and 2 as a tuple.
tuple(df['messages'].loc[[0, 1, 2]])

('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")

In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re


In [11]:
ps = PorterStemmer()

In [12]:
# Number of messages in the dataset.
n = len(df['messages'])
n

5572

In [13]:
# Build `corpus`: one cleaned, stemmed string per message. Every intermediate
# stage is printed for the first message only, as a worked example.
#
# The patterns are raw strings so that `\b` reaches the regex engine as a word
# boundary. In a plain string literal Python converts `\b` into a backspace
# character, so the e-mail and phone patterns could never match.
email_re = re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b')
url_re = re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)')
money_re = re.compile(r'£|\$')
# No leading `\b` here: a word boundary directly in front of the non-word `+`
# would only match when the number is glued to a preceding word character.
phone_re = re.compile(r'(\+\d{1,2}\s?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})\b')
number_re = re.compile(r'\d+(\.\d+)?')
punctuation_re = re.compile(r'[^\w\s]')
whitespace_re = re.compile(r'\s+')

# Loop-invariant: load the stop-word list once instead of once per message.
stop_words = set(stopwords.words('english'))

corpus = []
for i, msg in enumerate(df['messages']):
    show = i == 0  # trace the pipeline for the first message only
    if show:
        print("Original",msg)

    # Replace structured tokens with placeholders. Order matters: e-mails go
    # before URLs (the URL pattern would otherwise consume the domain part) and
    # phone numbers go before the generic number pattern.
    msg = email_re.sub('emailaddr', msg)
    msg = url_re.sub('httpaddr', msg)
    msg = money_re.sub('moneysymb', msg)
    msg = phone_re.sub('phonenumbr', msg)
    msg = number_re.sub('numbr', msg)
    # Strip punctuation, then collapse runs of whitespace to a single space.
    msg = punctuation_re.sub('', msg)
    msg = whitespace_re.sub(' ', msg)
    if show:
        print("After Regular Expression: ", msg)

    # Lower case
    msg = msg.lower()
    if show:
        print("After Lower case: ", msg)

    # Tokenize
    words = word_tokenize(msg)
    if show:
        print("After Tokenize M: ",words)

    # Remove stop words
    words = [w for w in words if w not in stop_words]
    if show:
        print("After Remove Stop Words: ",words)

    # Stemming
    stemmed_words = [ps.stem(w) for w in words]
    if show:
        print("After Stemming: ",stemmed_words)

    # Join the stems back into one space-separated string
    sen = ' '.join(stemmed_words)
    if show:
        print("Final: \n",sen)

    corpus.append(sen)
		

Original Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
After Regular Expression:  Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat
After Lower case:  go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
After Tokenize M:  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
After Remove Stop Words:  ['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']
After Stemming:  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']
Final: 
 go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [14]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri numbr wkli comp win fa cup final tkt numbrst may numbr text fa numbr receiv entri questionstd txt ratetc appli numbrovernumbr',
 'u dun say earli hor u c alreadi say',
 'nah dont think goe usf live around though']

In [15]:
# Patterns are compiled once, when this cell runs. They are raw strings so that
# `\b` reaches the regex engine as a word boundary; in a plain string literal
# Python converts `\b` into a backspace character and the pattern never matches.
_EMAIL_RE = re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b')
_URL_RE = re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)')
_MONEY_RE = re.compile(r'£|\$')
# No leading `\b` here: a word boundary directly in front of the non-word `+`
# would only match when the number is glued to a preceding word character.
_PHONE_RE = re.compile(r'(\+\d{1,2}\s?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})\b')
_NUMBER_RE = re.compile(r'\d+(\.\d+)?')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Loaded once instead of on every call.
_STOP_WORDS = frozenset(stopwords.words('english'))


def text_process(msg):
    """Clean one raw message and return it as a string of stemmed tokens.

    Steps, in order:
      1. Replace e-mail addresses, URLs, the currency symbols pound and
         dollar, phone numbers and remaining numbers with the placeholders
         'emailaddr', 'httpaddr', 'moneysymb', 'phonenumbr' and 'numbr'.
      2. Remove punctuation and collapse whitespace runs to one space.
      3. Lower-case, tokenize, drop English stop words, and stem each token
         with the module-level stemmer `ps`.

    Parameters
    ----------
    msg : str
        The raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # E-mails go before URLs (the URL pattern would otherwise consume the
    # domain part) and phone numbers go before the generic number pattern.
    msg = _EMAIL_RE.sub('emailaddr', msg)
    msg = _URL_RE.sub('httpaddr', msg)
    msg = _MONEY_RE.sub('moneysymb', msg)
    msg = _PHONE_RE.sub('phonenumbr', msg)
    msg = _NUMBER_RE.sub('numbr', msg)
    msg = _PUNCTUATION_RE.sub('', msg)
    msg = _WHITESPACE_RE.sub(' ', msg)

    # Lower-case before tokenizing so stop-word lookup is case-insensitive.
    words = word_tokenize(msg.lower())

    # Drop stop words and stem what is left.
    stemmed_words = [ps.stem(w) for w in words if w not in _STOP_WORDS]

    return ' '.join(stemmed_words)



In [16]:
# Run the preprocessing function over every message and keep the result
# next to the raw text.
df['clean_msg'] = df['messages'].map(text_process)
df.head()

Unnamed: 0,label,messages,Count,clean_msg
0,0,"Go until jurong point, crazy.. Available only ...",111,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,29,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,free entri numbr wkli comp win fa cup final tk...
3,0,U dun say so early hor... U c already then say...,49,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",61,nah dont think goe usf live around though


## Vectorization

In [18]:
# Features are the cleaned messages; the target is the 0/1 label.
X, y = df['clean_msg'], df['label']
for part in (X, y):
    print(part.shape)


(5572,)
(5572,)


In [19]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)


(4179,)
(1393,)
(4179,)
(1393,)


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training messages only, then build the
# document-term count matrix for both splits with that same vocabulary.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit TF-IDF weights on the training count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
# NOTE(review): the transformed matrix is only displayed here, not assigned to a
# variable, so nothing later in the notebook can use it.
tfidf_transformer.transform(X_train_dtm)


<4179x5974 sparse matrix of type '<class 'numpy.float64'>'
	with 35127 stored elements in Compressed Sparse Row format>

## 🤖 Building and evaluating a model


In [22]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()


In [23]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
# NOTE: X_train_dtm is the CountVectorizer output, so this model is fitted on
# raw term counts, not on TF-IDF weights.
%time nb.fit(X_train_dtm, y_train)


Wall time: 5 ms


MultinomialNB()

In [75]:
msg = vect.inverse_transform(X_test_dtm[1])
" ".join(msg[0])

'anyway even good mani'

In [80]:
print("Actual: ", y_test.iloc[1])


Actual:  0


In [82]:
print("Predicted: ", nb.predict(X_test_dtm[1])[0])

Predicted:  0


In [None]:
# NOTE(review): this cell has no execution count and repeats the assignment made
# in the following cell; one of the two is redundant.
y_pred_class = nb.predict(X_test_dtm)

In [32]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)


In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [28]:
# calculate accuracy of class predictions
accuracy_score(y_test, y_pred_class)


0.9842067480258435

In [30]:
# Per-class precision / recall / F1. Reuses the predictions already stored in
# y_pred_class (as the accuracy and confusion-matrix cells do) instead of
# running the model over the test set a second time.
print(classification_report(y_test, y_pred_class))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1213
           1       0.95      0.93      0.94       180

    accuracy                           0.98      1393
   macro avg       0.97      0.96      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [33]:
confusion_matrix(y_test, y_pred_class)


array([[1204,    9],
       [  13,  167]], dtype=int64)

In [34]:
# False positives: ham (0) messages that the model classified as spam (1)
is_false_positive = (y_pred_class == 1) & (y_test == 0)
X_test[is_false_positive]


4598                                laid airtel line rest
386                                     custom place call
1289    heygreat dealfarm tour numbram numbrpm moneysy...
3245    funni fact nobodi teach volcano numbr erupt ts...
1235    opinion numbr numbr jada numbr kusruthi numbr ...
2146                                    collect ur laptop
5094    hi shanilrakhesh httpaddr exchang uncut diamon...
494                                      free nowcan call
3140                                    custom place call
Name: clean_msg, dtype: object

In [35]:
# False negatives: spam (1) messages that the model classified as ham (0)
is_false_negative = (y_pred_class == 0) & (y_test == 1)
X_test[is_false_negative]


4674    hi babe chloe r u smash saturday night great w...
3528    xma new year eve ticket sale club day numbram ...
4247     accordingli repeat text word ok mobil phone send
3417    life never much fun great came made truli spec...
2773    come take littl time child afraid dark becom t...
5       freemsg hey darl numbr week word back id like ...
2078                         numbr freeringtonerepli real
1457    clair havin borin time alon u wan na cum numbr...
190             uniqu enough find numbrth august httpaddr
2429    guess ithi first time creat web page httpaddr ...
4067    tbspersolvo chase us sinc sept fornumbr defini...
3358               sorri miss call let talk time im numbr
2821    romcapspam everyon around respond well presenc...
Name: clean_msg, dtype: object

### Pipeline 

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Chain counting, TF-IDF weighting and the classifier into a single estimator
# so the cleaned text can be passed straight to fit / predict.
pipeline_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', MultinomialNB()),
]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(X_train, y_train)


Pipeline(steps=[('bow', CountVectorizer()), ('tfid', TfidfTransformer()),
                ('model', MultinomialNB())])

In [37]:
y_pred = pipe.predict(X_test)

In [38]:
accuracy_score(y_test, y_pred)


0.9698492462311558