<center>

# Spam Message Classification

<center>

In [1]:
# importing libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
%matplotlib inline

# string processing
import string

# scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# NLTK
from nltk.corpus import stopwords

In [2]:
# dataset
messages = [line.strip() for line in open('SMSSpamCollection')]
print("Dataset's Length: {}".format(len(messages)))

Dataset's Length: 5574


In [3]:
# printing the first 10 messages
for num, message in enumerate(messages[:10]):
    print(num, message)
    print('\n')

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


1 ham	Ok lar... Joking wif u oni...


2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


3 ham	U dun say so early hor... U c already then say...


4 ham	Nah I don't think he goes to usf, he lives around here though


5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv


6 ham	Even my brother is not like to speak with me. They treat me like aids patent.


7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune


8 spam	WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim cod

In [4]:
# transforming data into data frame
messages = pd.read_csv(
    'Dataset/SMSSpamCollection', 
    sep = '\t', 
    names = ['label', 'message']
)

messages.head()

FileNotFoundError: [Errno 2] File b'Dataset/SMSSpamCollection' does not exist: b'Dataset/SMSSpamCollection'

In [None]:
# dataset description
messages.describe()

In [None]:
# dataset info
messages.info()

In [None]:
# data description given each label
messages.groupby('label').describe()

In [None]:
# length column to display length of each message
messages['length'] = messages['message'].apply(len)
messages.head()

In [None]:
# visualizing messages length 
messages['length'].plot(bins = 50, kind = ' hist')

In [None]:
# difference of length given label
messages.hist(column = 'length', by = 'label', bins = 50, figsize = (10, 4))

In [None]:
# messages' length description
messages['length'].describe()

In [None]:
# displaying the longest message
longest_message = messages[messages['length'] == 910]['message'].iloc[0]
longest_message

In [None]:
# cleaning punctuation from the longest message
longest_message_nopunc = [char for char in longest_message if char not in string.punctuation]
longest_message_nopunc = ''.join(longest_message_nopunc)
longest_message_nopunc

In [None]:
# removing stopwords (commonly used words) from the message
longest_message_clean = [word for word in longest_message_nopunc.split() if word.lower() not in stopwords.words('english')]
longest_message_clean

In [None]:
# English text processing function
def text_process(message):
    
    # remove all punctuation
    message_nopunc = [char for char in message if char not in string.punctuation]
    message_nopunc = ''.join(message_nopunc)
    
    # remove all stopwords and return a list of the cleaned text
    return [word for word in message_nopunc.split() if word.lower() not in stopwords.words('english')]

In [None]:
# applying the text processing function into the first 5 messages
messages['message'].head(5).apply(text_process)

In [None]:
# using count vectorizer 
bow_transformer = CountVectorizer(analyzer = text_process)

# fitting the transformer
bow_transformer.fit(messages['message'])

In [None]:
# displaying the effect
message_sample = messages['message'][3]
print(message_sample)

In [None]:
# transforming the sample
bow_sample = bow_transformer.transform([message_sample])
print(bow_sample)

In [None]:
# getting the 4068th and 9554th digit as they appear twice
print(bow_transformer.get_feature_names()[4068])
print(bow_transformer.get_feature_names()[9554])

In [None]:
messages_bow = bow_transformer.transform(messages['message'])
print('Shape of Sparse Matrix: {}'.format(messages_bow.shape))
print('Amount of Non-Zero Occurences: '.format(messages_bow.nnz))
print('Sparsity: {:.2f}%',format((messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])) * 100))

In [None]:
# using term frequency inverse document frequency (tfidf) transformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf_sample = tfidf_transformer.transform(bow_sample)
print(tfidf_sample)

In [None]:
# the inverse document frequency for "u" and "say"
print('The Inverse Document Frequency for the word "u" is {}'.format(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']]))
print('The Inverse Document Frequency for the word "say" is {}'.format(tfidf_transformer.idf_[bow_transformer.vocabulary_['say']]))

In [None]:
# shape of tfidf transformed messages
messages_tfidf = tfidf_transformer.transform(messages_bow)
print('Shape of tfidf transformed messages: {}'.format(messages_tfidf.shape))

In [None]:
# creating spam detection model using Multinomial Naive Bayes
spam_detection_model = MultinomialNB().fit(messages_tfidf, messages['label'])
print('Predicted: {}'.format(spam_detection_model.predict(tfidf_sample)[0]))
print('Expected: {}'.format(messages['label'][3]))

In [None]:
# applying detection to all predictions
all_predictions = spam_detection_model.predict(messages_tfidf)
print(all_predictions)

In [None]:
# printing classification report
print(classification_report(messages['label'], all_predictions))

In [None]:
# splitting the training and test data
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size = 0.2)
print("The length of training data: {}\nThe length of test data: {}\nThe total length of data: {}".format(len(msg_train), len(msg_test), len(msg_train) + len(msg_test)))

In [None]:
# creating data pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer = text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

# fitting the model
pipeline.fit(msg_train, label_train)

In [None]:
# prediction and classification report
predictions = pipeline.predict(msg_test)
print(classification_report(predictions, label_test))