In [1]:
from nltk.stem import PorterStemmer
import pandas as pd
import re

In [2]:
filepath = 'amazon-fine-food-reviews/Reviews.csv'
data = pd.read_csv(filepath, sep=',')

regex = r"[-'a-zA-ZÀ-ÖØ-öø-ÿ0-9]+"

stemmer = PorterStemmer()

In [3]:
data['Score'].value_counts().to_dict()

{1: 52268, 2: 29769, 3: 42640, 4: 80655, 5: 363122}

In [4]:
del_columns = [col for col in data.columns if col not in ['Text','Score']]
data = data.drop(del_columns, 1)
print(data.head())

   Score                                               Text
0      5  I have bought several of the Vitality canned d...
1      1  Product arrived labeled as Jumbo Salted Peanut...
2      4  This is a confection that has been around a fe...
3      2  If you are looking for the secret ingredient i...
4      5  Great taffy at a great price.  There was a wid...


In [7]:
test_data = data.head()
test_data

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [41]:
class NaiveBayes:
    def __init__(self, X_train=None, y_train=None, stem=False):
        self.class_docs_count  = dict([]) # número de documentos por classe
        self.prior             = dict([]) # proporção do corpus de treinamento
        self.likelihood        = dict([]) # chance da palavra ser encontrada na classe
        if X_train is not None and y_train is not None:
            self.train(X_train, y_train, stem)
    
    def train(self, X_train, y_train, stem=False):
        
        def to_bow(text):
            words = re.findall(regex, text)
            count = {}
            for word in words:
                word = word.lower()
                if word not in count:
                    count[word] = 0
                count[word] += 1
            return count
    
        def stem_text(text):
            words = []
            for word in re.findall(regex, text):
                words += [stemmer.stem(word)]

            return ' '.join(words)
        
        if isinstance(y_train, pd.Series):
            y_train = y_train.tolist()
        if isinstance(X_train, pd.Series):
            X_train = X_train.tolist()
            
        if stem == True:
            X_train = [stem_text(doc) for doc in X_train]
            
        self.class_vocabulary = {}
        self.vocabulary = set()    
        self.class_docs_count = {val: y_train.count(val) for val in set(y_train)}
        
        print('class_docs_count =', self.class_docs_count)
        
        ### a partir daqui X e y são listas
        
        all_docs    = len(X_train)
        print('all docs =', all_docs)
        
        for text, grade in zip(X_train, y_train):
            bag = to_bow(text)
            for word in bag:
                self.vocabulary.add(word)
                if word not in self.class_vocabulary:
                    self.class_vocabulary[grade] = []
                self.class_vocabulary[grade] += [word]
        
        for c in set(y_train):
            class_docs    = self.class_docs_count[c]
            self.prior[c] = class_docs/all_docs
            count_class_words = sum([self.class_vocabulary.get(word, 0) for word in self.vocabulary])

            for word in self.vocabulary:
                self.likelihood[(word,c)]  = ( (self.class_vocabulary[c].count(word) + 1)/
                                               (count_class_words + len(self.vocabulary))  )
                print('likelihood[(%s, %s)] = %f' % (word, c, self.likelihood[(word,c)]))
                
        print('prior =', self.prior)
    def test(self, x_test):
        prob = dict([])
        for c in self.class_docs_count:
            prob[c] = self.prior[c]
            for word in x_test:
                if word in self.vocabulary:
                    prob[c]  *= self.likelihood[(word,c)]
        return max(prob, key=prob.get)

In [42]:
model = NaiveBayes()
test_data = data.head(100)
model.train(test_data['Text'], test_data['Score'])

class_docs_count = {1: 8, 2: 4, 3: 9, 4: 12, 5: 67}
all docs = 100
likelihood[(raw, 1)] = 0.000639
likelihood[(confectionery, 1)] = 0.000639
likelihood[(everyone, 1)] = 0.000639
likelihood[(insulted, 1)] = 0.000639
likelihood[(fake, 1)] = 0.000639
likelihood[(spots, 1)] = 0.000639
likelihood[(boring, 1)] = 0.000639
likelihood[(better, 1)] = 0.000639
likelihood[(absence, 1)] = 0.000639
likelihood[(tequila, 1)] = 0.000639
likelihood[(given, 1)] = 0.000639
likelihood[(okay, 1)] = 0.000639
likelihood[(takes, 1)] = 0.000639
likelihood[(friends, 1)] = 0.000639
likelihood[(lying, 1)] = 0.000639
likelihood[(gas, 1)] = 0.000639
likelihood[(it's, 1)] = 0.000639
likelihood[(bag, 1)] = 0.000639
likelihood[(guilt, 1)] = 0.000639
likelihood[(jumping, 1)] = 0.000639
likelihood[(tired, 1)] = 0.000639
likelihood[(soupy, 1)] = 0.000639
likelihood[(stuff, 1)] = 0.000639
likelihood[(earliest, 1)] = 0.000639
likelihood[(hated, 1)] = 0.000639
likelihood[(chance, 1)] = 0.000639
likelihood[(floor, 1)] = 0.000

likelihood[(that's, 3)] = 0.000639
likelihood[(digest, 3)] = 0.000639
likelihood[(very, 3)] = 0.000639
likelihood[(will, 3)] = 0.000639
likelihood[(hardy, 3)] = 0.000639
likelihood[(twin, 3)] = 0.000639
likelihood[(poop, 3)] = 0.000639
likelihood[(unfortunately, 3)] = 0.000639
likelihood[(staple, 3)] = 0.000639
likelihood[(issue, 3)] = 0.000639
likelihood[(every, 3)] = 0.000639
likelihood[(anywhere', 3)] = 0.000639
likelihood[(light, 3)] = 0.000639
likelihood[(research, 3)] = 0.000639
likelihood[(stick, 3)] = 0.000639
likelihood[(might, 3)] = 0.000639
likelihood[(city, 3)] = 0.000639
likelihood[(wasted, 3)] = 0.000639
likelihood[(melon, 3)] = 0.000639
likelihood[(what's, 3)] = 0.000639
likelihood[(guilty, 3)] = 0.000639
likelihood[(preparation, 3)] = 0.000639
likelihood[(weston, 3)] = 0.000639
likelihood[(eaten, 3)] = 0.000639
likelihood[(owner, 3)] = 0.000639
likelihood[(apart, 3)] = 0.000639
likelihood[(garage, 3)] = 0.000639
likelihood[(wrong, 3)] = 0.000639
likelihood[(buyer, 3)] =

likelihood[(beans, 5)] = 0.000639
likelihood[(needs, 5)] = 0.000639
likelihood[(glass, 5)] = 0.000639
likelihood[(bummed, 5)] = 0.000639
likelihood[(gummy, 5)] = 0.000639
likelihood[(meal, 5)] = 0.000639
likelihood[(thickeners, 5)] = 0.000639
likelihood[(offer, 5)] = 0.000639
likelihood[(haven't, 5)] = 0.000639
likelihood[(non-coffee, 5)] = 0.000639
likelihood[(earlier, 5)] = 0.000639
likelihood[(storebought, 5)] = 0.000639
likelihood[(course, 5)] = 0.000639
likelihood[(canning, 5)] = 0.000639
likelihood[(20, 5)] = 0.000639
likelihood[(tiny, 5)] = 0.000639
likelihood[(happily, 5)] = 0.000639
likelihood[(save, 5)] = 0.000639
likelihood[(robitussin, 5)] = 0.000639
likelihood[(become, 5)] = 0.000639
likelihood[(simply, 5)] = 0.000639
likelihood[(b001eo5qw8, 5)] = 0.000639
likelihood[(breaking, 5)] = 0.000639
likelihood[(time, 5)] = 0.000639
likelihood[(measured, 5)] = 0.000639
likelihood[(dropped, 5)] = 0.000639
likelihood[(1998, 5)] = 0.000639
likelihood[(salty, 5)] = 0.000639
likelihood

In [19]:
test = 'this is a test string'
def stem_text(text):
    words = []
    for word in re.findall(regex, text):
        words += [stemmer.stem(word)]

    return ' '.join(words)

stem_text(test)

'thi is a test string'