In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('data.csv')
train_data.head()

Unnamed: 0,document,class
0,just plain boring,0
1,entirely predictable and lacks energy,0
2,no surprises and very few laughs,0
3,very powerful,1
4,the most fun film of the summer,1


In [3]:
X_train = train_data['document'].values
y_train = train_data['class'].values
X_train, y_train

(array(['just plain boring', 'entirely predictable and lacks energy',
        'no surprises and very few laughs', 'very powerful',
        'the most fun film of the summer'], dtype=object),
 array([0, 0, 0, 1, 1], dtype=int64))

In [11]:
class NaiveBayes:
    def __init__(self):
        self.voc = set() # vocabulary in X_train


    def tokenize(self, unk=True):
        # eliminate symbols on input text
        sims = "!\"#$%&()*+-.,'/:;<=>?@[\]^_`{|}~\n"
        for si in sims:
            text = text.replace(si, '')

        # lower text
        text = text.lower()

        # separate text by words
        words = text.split(' ')

        # add words to vocabulary
        if unk:
            for wi in words:
                if wi not in self.voc:
                    self.voc.add(wi)
        else:
            aux = [] # only add words in vocabulary
            for wi in words:
                if wi in self.voc:
                    aux.append(wi)
            words = aux

        return words


    def fit(self, X_train, y_train):
        # identify unique classes and calculate prior probabilities p(c) for each class
        self.clases, counts = np.unique(y_train, return_counts=True)
        self.Pcs = [ci/len(X_train) for ci in counts]
        #print(f'Pcs: {self.Pcs}')

        # calculate bag of words per class
        self.bags = []
        self.bags_len = []
        for i, ci in enumerate(self.clases):
            #print(f'class: {ci}')
            
            # documents of each class
            docs = X_train[y_train==ci]
            #print(docs)

            # concatenate docs and identify unique words and count
            concat = []
            for di in docs:
                concat += self.tokenize(di)
            
            self.bags_len.append(len(concat)) # bag len

            ks, vs = np.unique(concat, return_counts=True)
            auxd = {}
            for j in range(len(ks)):
                auxd[ks[j]] = vs[j]
            self.bags.append(auxd) # add bag of words
            #print(f'bag: {auxd}')
        #print(f'bags: {self.bags}\n len: {self.bags_len}')
    

    def count(self, w, ci):
        if w in self.bags[ci]: # if w is in ci bag of words
            return self.bags[ci][w]
        else: # if w not in the ci bag of words, return 0
            return 0


    def predict(self, X_test):
        preds = []

        for xi in X_test:
            auxp = []
            for ci, cl in enumerate(self.clases):
                print(f'class: {cl}')
                # class probability
                priorc = self.Pcs[ci]

                # likelohood probability
                lp = 1
                for wi in self.tokenize(xi, unk=False):
                    lp = lp * (self.count(wi, ci) + 1) / (self.bags_len[ci] + 1)
                probc = priorc*lp # probability of xi belonginf to class c
                auxp.append(probc)
                print(f'prob: {probc}')
        
            
    def tokenize(self, text, unk=True):
        # eliminate symbols on input text
        sims = "!\"#$%&()*+-.,'/:;<=>?@[\]^_`{|}~\n"
        for si in sims:
            text = text.replace(si, '')

        # lower text
        text = text.lower()

        # separate text by words
        words = text.split(' ')

        # add words to vocabulary
        if unk:
            for wi in words:
                if wi not in self.voc:
                    self.voc.add(wi)
        else:
            aux = [] # only add words in vocabulary
            for wi in words:
                if wi in self.voc:
                    aux.append(wi)
            words = aux

        return words




test = NaiveBayes()
test.fit(X_train, y_train)
test.predict(['predictable with no fun'])


class: 0
prob: 0.000711111111111111
class: 1
prob: 0.0008
