In [12]:
from IPython.display import display
import numpy as np
import pandas as pd
import nltk
import re
import string
import random
from random import shuffle
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI

nltk.download('stopwords')
random.seed(10)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
REGEX = re.compile('[%s]' % re.escape(string.punctuation))
STEMMER = SnowballStemmer('english')
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vitorlima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Classification (Naive Bayes)

In [8]:
movie_reviews = pd.read_csv('../movie_data.csv')
display(movie_reviews.head())

Unnamed: 0,movie,reviewer,metascore,review_score,review_text,review_date,sentiment,release_date,user_acclaim,user_score,...,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western,unknown,release_decade
0,Notes on Blindness,Stephen Holden,75.0,90,The tone of the narration is so wrenchingly ho...,2016-11-15,pos,2016-11-16,No score yet,,...,0,0,0,0,0,0,0,0,0,2010s
1,Devils on the Doorstep,Stephen Holden,70.0,80,"In its dry and forceful way, it delivers the s...",,pos,2002-12-18,Universal acclaim,83.0,...,0,0,0,0,0,0,1,0,0,2000s
2,The Upside of Anger,Dana Stevens,63.0,60,A seriously flawed movie wrapped around two ne...,,mixed,2005-03-11,Generally favorable reviews,80.0,...,1,0,0,0,0,0,0,0,0,2000s
3,Monster,Stephen Holden,74.0,70,The movie's biggest disappointment is the vagu...,,pos,2003-12-24,Generally favorable reviews,67.0,...,1,0,0,0,0,0,0,0,0,2000s
4,Rock in the Red Zone,Ken Jaworowski,54.0,50,Rock in the Red Zone has its best moments when...,2015-11-12,mixed,2015-11-12,No score yet,,...,0,0,0,0,0,0,1,0,0,2010s


# Etapa 0 - Funções de Tratamento

In [13]:
def word_parser(x):
    if x not in STOPWORDS:
        return ({STEMMER.stem(REGEX.sub('',x).lower()):True})


def create_word_features(text):
    words={}
    for word in word_tokenize(text):
        if (word not in STOPWORDS):
            words.update(word_parser(word))
    return words
    
def category_filter(category):
    reviews = []
    for x in movie_reviews.review_text[(movie_reviews.sentiment == category)]:   
        reviews.append((create_word_features(x),category))
    for x in reviews:
        if '' in x[0]:
            del x[0]['']
    return reviews

def predict(model,text):
    words = {}
    for x in word_tokenize(text):
        parsed_word = word_parser(LEMMATIZER.lemmatize(x))
        if parsed_word:
            words.update(parsed_word)
    return model.classify(words)

def mode(lista):
    counts = [lista.count(x) for x in lista]
    return lista[counts.index(max(counts))]
    

# Etapa 1 - Features

In [14]:
pos = category_filter('pos')
neg = category_filter('neg')
mixed = category_filter('mixed')

all_classes = pos+neg+mixed
shuffle(all_classes)

print('Positive reviews:',len(pos)) 
print('Mixed reviews:',len(mixed))
print('Negative reviews:',len(neg))

Positive reviews: 5621
Mixed reviews: 5147
Negative reviews: 1642


In [15]:
train_set = all_classes[:8680]
test_set = all_classes[8680:]

print('Train set: {} reviews'.format(len(train_set)))
print('Test set: {} reviews'.format(len(test_set)))


Train set: 8680 reviews
Test set: 3730 reviews


# Etapa 2 - Classificadores

### Naive Bayes Classifier

In [16]:
NB_classifier = NaiveBayesClassifier.train(train_set)

### Multinomial Naive Bayes Classifier

In [17]:
MNB_classifier = SklearnClassifier(MultinomialNB()).train(train_set)

### Gaussian Naive Bayes Classifier (quebrado)

In [18]:
#GNB_classifier = SklearnClassifier(GaussianNB()).train(train_set)

### Logistic Regression Classifier

In [19]:
LR_classifier = SklearnClassifier(LogisticRegression()).train(train_set)

### SGD Classifier

In [20]:
SGD_classifier = SklearnClassifier(SGDClassifier()).train(train_set)

### SVC Classifier (quebrado)

In [21]:
#SVC_classifier = SklearnClassifier(SVC()).train(train_set)

### Linear SVC Classifier

In [22]:
LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(train_set)

### NuSVC Classifier

In [23]:
NuSVC_classifier = SklearnClassifier(NuSVC(nu=0.4)).train(train_set)

### Classificador por votos

In [24]:
class CustomClassifier(ClassifierI):
    
    def __init__(self,*classifiers):
        self._classifiers = classifiers
    
    def classify(self, feature_set):
        return mode([classifier.classify(feature_set) for classifier in self._classifiers]) 
    
    def prob(self, feature_set):
        votes = [classifier.classify(feature_set) for classifier in self._classifiers]
        return votes.count(mode(votes)) / len(votes)

C_classifier = CustomClassifier(NB_classifier, 
                                     MNB_classifier,
                                     LR_classifier,
                                     SGD_classifier, 
                                     LinearSVC_classifier,
                                     NuSVC_classifier)

# Etapa 3 - Acurácia

In [25]:
print("\nNB Accuracy: %.2f" %(nltk.classify.util.accuracy(NB_classifier, test_set)*100) + "%")

print("\nMNB Accuracy: %.2f" %(nltk.classify.util.accuracy(MNB_classifier, test_set)*100) + "%")

#print("\nGNB Accuracy: %.2f" %(nltk.classify.util.accuracy(GNB_classifier, test_set)*100) + "%")

print("\nLR Accuracy: %.2f" %(nltk.classify.util.accuracy(LR_classifier, test_set)*100) + "%")

print("\nSGD Accuracy: %.2f" %(nltk.classify.util.accuracy(SGD_classifier, test_set)*100) + "%")

#print("\nSVC Accuracy: %.2f" %(nltk.classify.util.accuracy(SVC_classifier, test_set)*100) + "%")

print("\nLinear SVC Accuracy: %.2f" %(nltk.classify.util.accuracy(LinearSVC_classifier, test_set)*100) + "%")

print("\nNuSVC Accuracy: %.2f" %(nltk.classify.util.accuracy(NuSVC_classifier, test_set)*100) + "%")

print("\nC_Classifier Accuracy: %.2f" %(nltk.classify.util.accuracy(C_classifier,test_set)*100) + "%")


NB Accuracy: 57.56%

MNB Accuracy: 60.13%

LR Accuracy: 59.12%

SGD Accuracy: 56.35%

Linear SVC Accuracy: 54.96%

NuSVC Accuracy: 56.73%

C_Classifier Accuracy: 59.01%


# Etapa 4 - Predições

In [31]:
print('------------------------------------------------------------------\
\nPredições para: What a terrible movie, I was bored the whole time!\n\
------------------------------------------------------------------\n')

print("NB Classifier: {}\n".format(predict(NB_classifier, "What a terrible movie, I was bored the whole time!")))
print("MNB Classifier: {}\n".format(predict(MNB_classifier, "What a terrible movie, I was bored the whole time!")))
print("LRClassifier: {}\n".format(predict(LR_classifier, "What a terrible movie, I was bored the whole time!")))
print("SGD Classifier: {}\n".format(predict(SGD_classifier, "What a terrible movie, I was bored the whole time!")))
#print("SVC Classifier: {}\n".format(predict(SVC_classifier, "What a terrible movie, I was bored the whole time!")))
print("Linear SVC Classifier: {}\n".format(predict(LinearSVC_classifier, "What a terrible movie, I was bored the whole time!")))
print("NuSVC Classifier: {}\n".format(predict(NuSVC_classifier, "What a terrible movie, I was bored the whole time!")))
print("Custom Classifier: {}".format(predict(C_classifier, "What a terrible movie, I was bored the whole time!")))

------------------------------------------------------------------
Predições para: What a terrible movie, I was bored the whole time!
------------------------------------------------------------------

NB Classifier: neg

MNB Classifier: mixed

LRClassifier: neg

SGD Classifier: neg

Linear SVC Classifier: neg

NuSVC Classifier: neg

Custom Classifier: neg


In [27]:
print('----------------------------------\
\nPredições para: I love this movie!\n\
----------------------------------\n')

print("NB Classifier: {}\n".format(predict(NB_classifier, "I love this movie!")))
print("MNB Classifier: {}\n".format(predict(MNB_classifier, "I love this movie!")))
print("LRClassifier: {}\n".format(predict(LR_classifier, "I love this movie!")))
print("SGD Classifier: {}\n".format(predict(SGD_classifier, "I love this movie!")))
#print("SVC Classifier: {}\n".format(predict(SVC_classifier, "I love this movie!")))
print("Linear SVC Classifier: {}\n".format(predict(LinearSVC_classifier, "I love this movie!")))
print("NuSVC Classifier: {}\n".format(predict(NuSVC_classifier, "I love this movie!")))
print("Custom Classifier: {}".format(predict(C_classifier, "I love this movie!")))

----------------------------------
Predições para: I love this movie!
----------------------------------

NB Classifier: pos

MNB Classifier: pos

LRClassifier: pos

SGD Classifier: pos

Linear SVC Classifier: pos

NuSVC Classifier: pos

Custom Classifier: pos


In [28]:
print("-------------------------------------------------------------------\
\nPredições para: Even though I think it's average, my wife loves it!\n\
-------------------------------------------------------------------\n")

print("NB Classifier: {}\n".format(predict(NB_classifier, "Even though I think it's average, my wife loves it!")))
print("MNB Classifier: {}\n".format(predict(MNB_classifier, "Even though I think it's average, my wife loves it!")))
print("LRClassifier: {}\n".format(predict(LR_classifier, "Even though I think it's average, my wife loves it!")))
print("SGD Classifier: {}\n".format(predict(SGD_classifier, "Even though I think it's average, my wife loves it!")))
#print("SVC Classifier: {}\n".format(predict(SVC_classifier, "Even though I think it's average, my wife loves it!")))
print("Linear SVC Classifier: {}\n".format(predict(LinearSVC_classifier, "Even though I think it's average, my wife loves it!")))
print("NuSVC Classifier: {}\n".format(predict(NuSVC_classifier, "Even though I think it's average, my wife loves it!")))
print("Custom Classifier: {}".format(predict(C_classifier, "Even though I think it's average, my wife loves it!")))

-------------------------------------------------------------------
Predições para: Even though I think it's average, my wife loves it!
-------------------------------------------------------------------

NB Classifier: mixed

MNB Classifier: mixed

LRClassifier: pos

SGD Classifier: pos

Linear SVC Classifier: neg

NuSVC Classifier: pos

Custom Classifier: pos


# Etapa 5 - Outras Caracteristicas

In [29]:
NB_classifier.show_most_informative_features(20)

Most Informative Features
                 tedious = True              neg : pos    =     34.4 : 1.0
                 incoher = True              neg : pos    =     20.6 : 1.0
                   lurch = True              neg : pos    =     20.2 : 1.0
                    flat = True              neg : pos    =     19.2 : 1.0
                  devast = True              pos : mixed  =     18.9 : 1.0
                daughter = True              neg : pos    =     17.8 : 1.0
                    poor = True              neg : pos    =     16.8 : 1.0
                portrait = True              pos : neg    =     15.6 : 1.0
                    sour = True              neg : pos    =     15.4 : 1.0
                laughabl = True              neg : pos    =     15.4 : 1.0
                    thud = True              neg : pos    =     15.4 : 1.0
                  recycl = True              neg : pos    =     15.4 : 1.0
                  dreari = True              neg : pos    =     14.9 : 1.0