In [2]:
from IPython.display import display
import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.classify import NaiveBayesClassifier

nltk.download('stopwords')

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
REGEX = re.compile('[%s]' % re.escape(string.punctuation))
STEMMER = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vitorlima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Classification (Naive Bayes)

In [4]:
movie_reviews = pd.read_csv('../movie_reviews.csv')
display(movie_reviews.head())

Unnamed: 0,movie,reviewer,metascore,review_score,review_text,review_date,release_date,acclaim_rate,user_score,director,...,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,unknown,release_decade,sentiment
0,American Swing,Stephen Holden,59.0,40,It leaves you feeling queasy.,,2009-03-27,Universal acclaim,83.0,Jon Hart,...,0,0,0,0,0,0,0,0,2000s,mixed
1,The Other Woman,A.O. Scott,37.0,50,In spite of its air of seriousness and sophist...,2011-02-03,2011-02-04,Mixed or average reviews,49.0,Don Roos,...,0,0,0,0,0,0,0,0,2010s,mixed
2,Casino Jack and the United States of Money,Stephen Holden,68.0,50,A liability of Casino Jack is the relative abs...,,2010-05-07,Universal acclaim,83.0,Alex Gibney,...,0,0,0,0,0,0,0,0,2000s,mixed
3,Old School,A.O. Scott,54.0,50,Like a half-empty glass of Coke that's been si...,,2003-02-21,Universal acclaim,87.0,Todd Phillips,...,0,0,0,0,0,0,0,0,2000s,mixed
4,The Circle,Nicolas Rapold,67.0,60,The sense of an invisible world being revealed...,2014-11-20,2014-11-14,Generally favorable reviews,75.0,Stefan Haupt,...,0,0,0,0,0,0,0,0,2010s,mixed


In [280]:
def word_parser(x):
    if x not in STOPWORDS:
        return ({STEMMER.stem(REGEX.sub('',x)):True})


def create_word_features(text):
    words={}
    for word in text.split():
        if word not in STOPWORDS:
            words.update(word_parser(word))
    return words
    
def category_filter(category):
    reviews =[]
    for x in movie_reviews.review_text[(movie_reviews.sentiment == category)]:   
        reviews.append((create_word_features(x),category))
    return reviews

def predict(model,text):
    words = {}
    for x in text.split():
        parsed_word = word_parser(x)
        if parsed_word:
            words.update(parsed_word)
    return model.classify(words)

## Coletando features

In [299]:
from random import shuffle

neg = category_filter('mixed') + category_filter('pos') + category_filter('neg')
shuffle(neg)
        
train_set = neg[:8400]
test_set = neg[8400:]

print('Positive reviews:',len(pos)) 
print('Mixed reviews:',len(mixed))
print('Negative reviews:',len(neg))

Positive reviews: 5621
Mixed reviews: 5147
Negative reviews: 12410


## Treinando Classificador

In [314]:
print('Train set: {} reviews'.format(len(train_set)))
print('Teste set: {} reviews'.format(len(test_set)))

model = NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.util.accuracy(model, test_set)
print("\nAccuracy: %.2f" %(accuracy*100) + "%\n_____________________________________ \n")
print("The 10 most informative words are: \n")
model.show_most_informative_features(20)

Train set: 8400 reviews
Teste set: 4010 reviews

Accuracy: 57.08%
_____________________________________ 

The 10 most informative words are: 

Most Informative Features
                 incoher = True              neg : pos    =     42.7 : 1.0
                 tedious = True              neg : pos    =     40.4 : 1.0
                 witless = True              neg : mixed  =     24.7 : 1.0
                   inept = True              neg : mixed  =     22.5 : 1.0
                laughabl = True              neg : pos    =     21.9 : 1.0
                 clichéd = True              neg : pos    =     19.6 : 1.0
                  devast = True              pos : mixed  =     18.3 : 1.0
                   lurch = True              neg : pos    =     17.3 : 1.0
                   hokum = True              neg : pos    =     17.3 : 1.0
                    thud = True              neg : pos    =     17.3 : 1.0
                    drab = True              neg : pos    =     15.0 : 1.0
      

## Fazendo predições

In [301]:
predict(model, "I love this movie!")

'pos'

In [302]:
predict(model, "I hate this movie!")

'neg'

In [303]:
predict(model, "Even though I think it's average, my wife loves it!")

'pos'