In [1]:
#Naive bays assumes each word is independent of other

#  https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html

#Lets load data set
import numpy as np
import pandas as pd

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t',
                       header=None, names=['Label', 'SMS'])


In [2]:
sms_spam.head()


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print(sms_spam.shape)


(5572, 2)


In [5]:
#Split data into train /test
data_randomized = sms_spam.sample(frac=1, random_state=1)
print(data_randomized)
training_test_index = round(len(data_randomized) * 0.8)
print(training_test_index)
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)


     Label                                                SMS
1078   ham                       Yep, by the pretty sculpture
4028   ham      Yes, princess. Are you going to make me moan?
958    ham                         Welp apparently he retired
4642   ham                                            Havent.
4674   ham  I forgot 2 ask ü all smth.. There's a card on ...
...    ...                                                ...
905    ham  We're all getting worried over here, derek and...
5192   ham  Oh oh... Den muz change plan liao... Go back h...
3980   ham  CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235   spam  Text & meet someone sexy today. U can find a d...
5157   ham                            K k:) sms chat with me.

[5572 rows x 2 columns]
4458


In [6]:
# Remove punctuation
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
#Convert  to lower case
training_set['SMS'] = training_set['SMS'].str.lower()

training_set.head()


  training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')


Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [7]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
   for word in sms:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))




In [8]:
len(vocabulary)
#number of unique words

7783

In [9]:
vocabulary


['moves',
 'chechi',
 '08001950382',
 'soul',
 'figures',
 'rub',
 'sitll',
 'cultures',
 'specially',
 'animation',
 'astne',
 'thot',
 'box39822',
 'anti',
 '09066364589',
 'revealed',
 'server',
 'afford',
 '1mega',
 'low',
 'owns',
 'smoothly',
 'subscriber',
 'lautech',
 'anywhere',
 'gym',
 'strt',
 '09066362231',
 'location',
 'wednesday',
 'tihs',
 'crazyin',
 'mega',
 'fonin',
 'prasanth',
 'abta',
 'amongst',
 'didnt',
 'neshanth',
 'wit',
 'misbehaved',
 '2wu',
 'dinner',
 'ayo',
 '81618',
 'subscriptn3gbp',
 'almost',
 'all',
 'congratulation',
 'canada',
 'recpt',
 'amigos',
 '2gthr',
 'ridden',
 'lots',
 'wana',
 'mayb',
 'gving',
 'gsoh',
 'frm',
 'warwick',
 'directors',
 '4eva',
 'rebtel',
 'on',
 'route',
 'sam',
 'torture',
 'cs',
 'someonone',
 'noworriesloans',
 '150ppm',
 'cstore',
 'identifier',
 '0789xxxxxxx',
 'pei',
 'mobno',
 'asking',
 'prin',
 'bck',
 'alter',
 'apeshit',
 'triumphed',
 'engaged',
 'nurses',
 'china',
 'rhythm',
 '42478',
 'elaborating',
 '

In [10]:
#Now we want
#Label          Message      
#Spam           this is a cat it is this cat
#To Become
#               this        is          a           cat         it 
#  Spam         2           2           1           2           1 


In [11]:
word_counts_per_sms = {unique_word: [
    0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
   for word in sms:
      word_counts_per_sms[word][index] += 1


In [12]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()


Unnamed: 0,moves,chechi,08001950382,soul,figures,rub,sitll,cultures,specially,animation,...,abel,combination,nice,george,wikipedia,help08714742804,84025,fox,pocy,marking
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()
#Adding the label

Unnamed: 0,Label,SMS,moves,chechi,08001950382,soul,figures,rub,sitll,cultures,...,abel,combination,nice,george,wikipedia,help08714742804,84025,fox,pocy,marking
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Finding spam and ham messages
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']


In [15]:
spam_messages.head()

Unnamed: 0,Label,SMS,moves,chechi,08001950382,soul,figures,rub,sitll,cultures,...,abel,combination,nice,george,wikipedia,help08714742804,84025,fox,pocy,marking
16,spam,"[freemsg, why, haven, t, you, replied, to, my,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,spam,"[congrats, 2, mobile, 3g, videophones, r, your...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,spam,"[free, message, activate, your, 500, free, tex...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,spam,"[call, from, 08702490080, tells, u, 2, call, 0...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61,spam,"[someone, has, conacted, our, dating, service,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
ham_messages.head()

Unnamed: 0,Label,SMS,moves,chechi,08001950382,soul,figures,rub,sitll,cultures,...,abel,combination,nice,george,wikipedia,help08714742804,84025,fox,pocy,marking
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
#Find probabilties
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

print(p_spam)
print(p_ham)

0.13458950201884254
0.8654104979811574


In [18]:
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

n_vocabulary = len(vocabulary)


# Laplace smoothing
alpha = 1


In [19]:
# Initiate parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   # spam_messages already defined
   n_word_given_spam = spam_messages[word].sum()
   p_word_given_spam = (n_word_given_spam + alpha) / \
       (n_spam + alpha*n_vocabulary)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum()  # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
   parameters_ham[word] = p_word_given_ham


In [20]:
import re

def classify(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      print('Equal proabilities, have a human classify this!')


In [21]:
classify('URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18')


P(Spam|message): 3.2607778238371276e-86
P(Ham|message): 5.3240024322206324e-108
Label: Spam


In [22]:
classify("yep by the pretty sculpture")


P(Spam|message): 1.1631822187809428e-19
P(Ham|message): 1.9794443217005706e-17
Label: Ham


In [23]:
def classify_test_set(message):
   '''
   message: a string
   '''

   message = re.sub('\W', ' ', message)
   message = message.lower().split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'
   elif p_spam_given_message > p_ham_given_message:
      return 'spam'
   else:
      return 'needs human classification'


In [24]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()


Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [25]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['Label'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)


Correct: 1100
Incorrect: 14
Accuracy: 0.9874326750448833
