In [1]:
from nltk.stem import PorterStemmer
import pandas as pd
import re

In [2]:
filepath = './Reviews.csv'
data = pd.read_csv(filepath, sep=',')

In [3]:
data['Score'].value_counts().to_dict()

{1: 52268, 2: 29769, 3: 42640, 4: 80655, 5: 363122}

In [4]:
del_columns = [col for col in data.columns if col not in ['Text','Score']]
data = data.drop(del_columns, 1)
print(data.head())

   Score                                               Text
0      5  I have bought several of the Vitality canned d...
1      1  Product arrived labeled as Jumbo Salted Peanut...
2      4  This is a confection that has been around a fe...
3      2  If you are looking for the secret ingredient i...
4      5  Great taffy at a great price.  There was a wid...


In [5]:
class NaiveBayes:
    def __init__(self, X_train=None, y_train=None, stem=False):
        self.class_docs_count  = dict([]) # número de documentos por classe
        self.prior             = dict([]) # proporção do corpus de treinamento
        self.likelihood        = dict([]) # chance da palavra ser encontrada na classe
        if X_train is not None and y_train is not None:
            self.train(X_train, y_train, stem)
    
    def train(self, X_train, y_train, stem=False):
        stemmer = PorterStemmer()         # auxiliares
        regex = r"[-'a-zA-ZÀ-ÖØ-öø-ÿ0-9]+"
        
        def to_bow(text):                 # transforma uma string em um dicionário no modelo bag-of-words
            words = re.findall(regex, text)
            count = {}
            for word in words:
                word = word.lower()
                if word not in count:
                    count[word] = 0
                count[word] += 1
            return count
    
        def stem_text(text):              # retorna uma string com os radicais das palavras de entrada
            words = []
            for word in re.findall(regex, text):
                words += [stemmer.stem(word)]

            return ' '.join(words)
        
        if isinstance(y_train, pd.Series):
            y_train = y_train.tolist()
        if isinstance(X_train, pd.Series):
            X_train = X_train.tolist()
            
        ''' a partir daqui X e y são listas '''
            
        if stem == True:
            X_train = [stem_text(doc) for doc in X_train]
            
        self.class_vocabulary = {cl: [] for cl in set(y_train)}
        self.vocabulary = set()    
        self.class_docs_count = {val: y_train.count(val) for val in set(y_train)}
        
        # print('class_docs_count =', self.class_docs_count)
        
        all_docs    = len(X_train)
        # print('all docs =', all_docs)
        
        for text, c in zip(X_train, y_train):
            bag = to_bow(text)
            for word in bag:
                self.vocabulary.add(word)
                self.class_vocabulary[c] += [word] * bag[word] # insere n vezes
                
        # print(self.class_vocabulary)
        for c in set(y_train):
            class_docs    = self.class_docs_count[c]
            self.prior[c] = class_docs/all_docs
            count_class_words = len(self.class_vocabulary[c])
            # print('count_class_words[%d] ='%(c), count_class_words)
            for word in self.vocabulary:
                self.likelihood[(word,c)]  = ( (self.class_vocabulary[c].count(word) + 1)/ # suavização de laplace
                                               (count_class_words + len(self.vocabulary))  )
                # print('lh[%s, %s] = %d + 1/%d + %d'%(word, c, self.class_vocabulary[c].count(word),count_class_words, len(self.vocabulary)))
        # print('prior =', self.prior) 
    def test(self, x_test):
        # print('text: ', x_test)
        prob = dict([])
        for c in self.class_docs_count:
            prob[c] = self.prior[c]
            for word in re.findall(regex,x_test):
                # print('%s ->'%(word), self.likelihood.get((word,c), 1/(len(self.class_vocabulary[c])+len(self.vocabulary))))
                prob[c]  *= self.likelihood.get((word,c), 1/(len(self.class_vocabulary[c])+len(self.vocabulary)))
        # print('prob', prob)
        return max(prob, key=prob.get)

In [89]:
x = ['just plain boring',
     'entirely predictable and lacks energy',
     'no surprises and very few laughs',
     'very powerful',
     'the most fun film of the summer']
y = ['neg',
     'neg',
     'neg',
     'pos',
     'pos']
model = NaiveBayes()
model.train(x, y)

In [90]:
model.test('predictable with no fun')

'neg'

In [105]:
train_size = 100
train_data = data.iloc[:train_size]
x = train_data['Text']
y = train_data['Score']
model.train(x, y)
sum([1 for a, b in zip(y, [model.test(val) for val in x]) if a != b])

21

In [92]:
test_size = 10
test_data = data.iloc[train_size:train_size+test_size]
comp = list(zip(test_data['Score'], [model.test(val) for val in test_data['Text']]))
diff = sum([1 for a, b in comp if a != b])/test_size
diff

0.7

In [35]:
test_data = data.head(1000)
x = test_data['Text']
y = test_data['Score']
model.train(x, y, stem=True)

In [36]:
test_range = 1000
comp = list(zip(data['Score'][1000:2000], [model.test(data['Text'][val]) for val in range(1000, 2000)]))
diff = sum([1 for a, b in comp if a != b])/test_range
diff

0.604

In [43]:
comp = list(zip(data['Score'][1000:2000], [5 for i in range(1000, 2000)]))
diff = sum([1 for a, b in comp if a != b])/test_range
diff

0.339