In [1]:
import numpy as np
import pandas as pd

In [2]:
class NBBinaryClassifier:
    """ Naive Bayes binary classifier over sentences given as lists of words.

    Attributes set by `fit`:
    trainPositive / trainNegative - word -> occurrence count per class
    positiveTotal / negativeTotal - total number of word occurrences per class
    num_words - vocabulary size (distinct words seen in either class)
    prior_spam / prior_not_spam - class prior probabilities
    """

    def fit(self, X, Y):
        """ Fit classifier on dataset

        Arguments:
        X - a list of sentences. Each sentence is a list of words
        Y - binary labels for samples in `X` (truthy = spam)

        Raises:
        ValueError - if `X` is empty (the priors would be undefined)
        """
        self.trainPositive, self.trainNegative = {}, {}
        self.positiveTotal, self.negativeTotal = 0, 0

        num_samples = len(X)
        if num_samples == 0:
            raise ValueError("cannot fit NBBinaryClassifier on an empty dataset")

        num_spam = 0
        for sentence, label in zip(X, Y):
            if label:
                num_spam += 1
            self._processSample(sentence, label)

        # Laplace smoothing needs the vocabulary size (distinct words), not
        # the number of training sentences.
        self.num_words = len(self.trainPositive.keys() | self.trainNegative.keys())

        # prior probabilities
        self.prior_spam = num_spam / num_samples
        self.prior_not_spam = 1 - num_spam / num_samples

    def predict(self, sentence):
        """ Make a prediction. Score both classes and return the one with the
        highest posterior.

        Scores are compared in log space: multiplying many per-word
        probabilities underflows to 0.0 for long sentences, which would make
        both classes tie and always yield "not spam".

        Arguments:
        sentence - The sentence to predict on, format is a list of words

        Returns:
        1 = spam or 0 = not spam (ties resolve to 0)
        """
        score_spam = (self._safeLog(self.prior_spam)
                      + self._logConditionalSample(sentence, 1))
        score_not_spam = (self._safeLog(self.prior_not_spam)
                          + self._logConditionalSample(sentence, 0))

        # return most probable
        return int(score_spam > score_not_spam)

    def _processSample(self, sentence, label):
        """ Add every word of `sentence` to the counts of the class given by `label` """
        # Truthiness is used here so the class split agrees with the spam
        # count taken in `fit` and the branch in `_conditionalWord`.
        if label:
            counts = self.trainPositive
            self.positiveTotal += len(sentence)
        else:
            counts = self.trainNegative
            self.negativeTotal += len(sentence)

        for word in sentence:
            counts[word] = counts.get(word, 0) + 1

    def _conditionalSample(self, sentence, spam):
        """ Calculate the conditional probability for a sentence as a plain product.

        Kept for callers that want the raw probability; it underflows to 0.0
        on long sentences, so `predict` uses `_logConditionalSample` instead.
        """
        conditional = 1
        for word in sentence:
            conditional *= self._conditionalWord(word, spam)
        return conditional

    def _logConditionalSample(self, sentence, spam):
        """ Calculate the log of the conditional probability for a sentence """
        import math

        # Smoothed word probabilities are always > 0, so the log is defined.
        return sum(math.log(self._conditionalWord(word, spam)) for word in sentence)

    def _conditionalWord(self, word, spam):
        """ Calculate conditional probability for a word """

        # Laplace smoothing for the words not present in the training set (alpha=1):
        # (count + 1) / (class word total + vocabulary size)
        if spam:
            return (self.trainPositive.get(word, 0) + 1) / (self.positiveTotal + 1 * self.num_words)
        else:
            return (self.trainNegative.get(word, 0) + 1) / (self.negativeTotal + 1 * self.num_words)

    @staticmethod
    def _safeLog(probability):
        """ Natural log that maps a zero probability to -inf instead of raising """
        import math

        # A prior is exactly 0 when a class never appears in the training data.
        return math.log(probability) if probability > 0 else float("-inf")

In [6]:
# Read the Kaggle "spam or ham" CSV. Column 'v2' holds the message text and
# column 'v1' the class name; messages are lower-cased and split on single
# spaces, labels become 1 for 'spam' and 0 otherwise.
data = pd.read_csv('./spam.csv', encoding="latin1")
X = [message.lower().split(' ') for message in data['v2']]
Y = [int(tag == 'spam') for tag in data['v1']]

# Fit the classifier on the whole dataset.
nb = NBBinaryClassifier()
nb.fit(X, Y)

# Score the classifier on the same samples it was trained on, echoing every
# 500th message together with its predicted class.
correct, i = 0, 0
for i, (sentence, label) in enumerate(zip(X, Y)):
    y_pred = nb.predict(sentence)
    correct += np.equal(y_pred, label)

    if i % 500 != 0:
        continue
    banner = '-- SPAM -- ' if y_pred else ' --NOT SPAM -- '
    print(banner, " ".join(sentence))
    print('\n')

# Fraction of samples whose prediction matched the label.
print('accuracy: {0}'.format(correct / len(X)))

 --NOT SPAM --  go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...


 --NOT SPAM --  fighting with the world is easy, u either win or lose bt fightng with some1 who is close to u is dificult if u lose - u lose if u win - u still lose.


 --NOT SPAM --  no..but heard abt tat..


-- SPAM --  host-based idps for linux systems.


 --NOT SPAM --  but i'll b going 2 sch on mon. my sis need 2 take smth.


 --NOT SPAM --  remember to ask alex about his pizza


-- SPAM --  this message is free. welcome to the new & improved sex & dogging club! to unsubscribe from this service reply stop. msgs@150p 18+only


 --NOT SPAM --  says the  &lt;#&gt;  year old with a man and money. i'm down to my last  &lt;#&gt; . still waiting for that check.


 --NOT SPAM --  he's just gonna worry for nothing. and he won't give you money its no use.


 --NOT SPAM --  so wat's da decision?


 --NOT SPAM --  nope. since ayo travelled, he has forgotten his guy