# Importando Pacotes

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
vectorizer = CountVectorizer()
stop = stopwords.words('english')
stemming = PorterStemmer()
transformer = TfidfTransformer(smooth_idf=False)

[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lendo Arquivos .csv

In [3]:
reviews = pd.read_csv('IMDB_Dataset.csv/IMDB_Dataset.csv')

## Fazendo breve preview

In [4]:
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


<h1 style="text-align: center; font-size: 50px;">Parte 1 - Treinando Classificadores</h1>

# Parte 1-1

## a) Removendo tags de html com BeautifulSoup e deixando dados para análise com Vader (https://github.com/cjhutto/vaderSentiment)

In [5]:
reviews["review"] = reviews["review"].apply(lambda review: BeautifulSoup(review, 'html.parser').get_text())
reviews["review_vader"] = reviews["review"]

### Mostrando exemplo

In [6]:
reviews.loc[1, "review"]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

## b) Selecionando apenas letras

In [7]:
reviews["review"] = reviews["review"].apply(lambda review: "".join(re.findall(r"[a-zA-Z | ]", review)))

### Mostrando exemplo

In [8]:
reviews.loc[1, "review"]

'A wonderful little production The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece A masterful production about one of the great masters of comedy and his life The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets particularly of their flat with Halliwells murals decorating every surface are terribly well done'

## c) Todas as letras minúsculas

In [9]:
reviews["review"] = reviews["review"].apply(lambda review: review.lower())

### Mostrando exemplo

In [10]:
reviews.loc[1, "review"]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

## d) Tirar Stopwords

### Tokenization

In [11]:
reviews["review"] = reviews["review"].apply(lambda review: nltk.word_tokenize(review))

In [12]:
reviews.loc[1, "review"][0:10]

['a',
 'wonderful',
 'little',
 'production',
 'the',
 'filming',
 'technique',
 'is',
 'very',
 'unassuming']

### Tirando Stop Words

In [13]:
reviews ["review"] = reviews["review"].apply(lambda review: [item for item in review if item not in stop])

### Mostrando exemplo

In [14]:
reviews.loc[1, "review"][0:10]

['wonderful',
 'little',
 'production',
 'filming',
 'technique',
 'unassuming',
 'oldtimebbc',
 'fashion',
 'gives',
 'comforting']

## e) Stemming

In [15]:
reviews["review"] = reviews["review"].apply(lambda review: [stemming.stem(item) for item in review])

### Mostrando Exemplo de Texto Processado

In [16]:
print(" ".join(reviews.loc[1, "review"]))

wonder littl product film techniqu unassum oldtimebbc fashion give comfort sometim discomfort sens realism entir piec actor extrem well chosen michael sheen got polari voic pat truli see seamless edit guid refer william diari entri well worth watch terrificli written perform piec master product one great master comedi life realism realli come home littl thing fantasi guard rather use tradit dream techniqu remain solid disappear play knowledg sens particularli scene concern orton halliwel set particularli flat halliwel mural decor everi surfac terribl well done


# Parte 1-2

## Transformando em string novamente (Detokenization (https://en.wiktionary.org/wiki/detokenize kkk...))

In [17]:
reviews["review"] = reviews["review"].apply(lambda review: " ".join(review))

### Resultado

In [18]:
reviews.loc[2, "review"]

'thought wonder way spend time hot summer weekend sit air condit theater watch lightheart comedi plot simplist dialogu witti charact likabl even well bread suspect serial killer may disappoint realiz match point risk addict thought proof woodi allen still fulli control style mani us grown lovethi id laugh one woodi comedi year dare say decad ive never impress scarlet johanson manag tone sexi imag jump right averag spirit young womanthi may crown jewel career wittier devil wear prada interest superman great comedi go see friend'

## Usando CountVectorizer

In [19]:
bag_of_words = vectorizer.fit_transform(reviews["review"])

### Vendo a frequência de algumas palavras

In [20]:
vectorizer.get_feature_names()[1000:1015]

['across',
 'acrossdirect',
 'acrossgo',
 'acrosshowev',
 'acrossoveral',
 'acrossth',
 'acrosstheboard',
 'acrossthepond',
 'acrossther',
 'acrossy',
 'acrown',
 'acroyd',
 'acryl',
 'act',
 'actaft']

In [21]:
len(nltk.word_tokenize(reviews.loc[40, "review"]))

57

In [22]:
bag_of_words[40].toarray().sum(1)

array([57])

#### Parece ter funcionando

### Normalizando

In [23]:
tfidf = transformer.fit_transform(bag_of_words)

# Parte 1-3

## Criando labels

In [24]:
print(tfidf.shape)
print(reviews["sentiment"].shape)

(50000, 175062)
(50000,)


In [25]:
X = tfidf
y = reviews["sentiment"]

### Transformando sentimentos em em binário

In [26]:
y = (y == 'positive')
y = y.apply(lambda sentiment: sentiment + 0)
y[0:10]

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    1
Name: sentiment, dtype: int64

## Separando em teste e treino e treinando os modelos

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf_mult = MultinomialNB()
clf_pac = PassiveAggressiveClassifier()
clf_mult.partial_fit(X_train, y_train, classes = [0,1])
clf_pac.partial_fit(X_train, y_train, classes = [0,1])

PassiveAggressiveClassifier()

## Gerando e avaliando previsões

In [28]:
y_pred_mult = clf_mult.predict(X_test)
y_pred_pac = clf_pac.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### Naïve Bayes

In [30]:
ACC = accuracy_score(y_test, y_pred_mult)
F1 = f1_score(y_test, y_pred_mult, average= 'binary')
print("ACC do NB: ", ACC)
print("F1 do NB: ", F1)
print("Confusion Matrix do NB: \n", confusion_matrix(y_test, y_pred_mult))

ACC do NB:  0.8556969696969697
F1 do NB:  0.8543641812954921
Confusion Matrix do NB: 
 [[7135 1073]
 [1308 6984]]


### Passive Agressive Classifier (https://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf)

In [31]:
ACC = accuracy_score(y_test, y_pred_pac)
F1 = f1_score(y_test, y_pred_pac, average= 'binary')
print("ACC do PAC: ", ACC)
print("F1 do PAC: ", F1)
print("Confusion Matrix do PAC: \n", confusion_matrix(y_test, y_pred_pac))

ACC do PAC:  0.8860606060606061
F1 do PAC:  0.8892032060348891
Confusion Matrix do PAC: 
 [[7076 1132]
 [ 748 7544]]


### Analisando se o data set está balanceado

In [32]:
pd.DataFrame(y).mean()

sentiment    0.5
dtype: float64

<h3 style="text-align: center;">Conclusões</h3>
<div style="margin-top:10px;">
    <p>O modelo que obteve melhores resultados foi o PAC (os quais foram rasoavelmente satisfatórios e equilibrados). Isso já era esperado pois como mostrado no gráfico abaixo, o PAC costuma performar melhor que o NB para um número grande de training examples.</p>
    <p>Vale ressaltar que a média dos dados é 0.5, ou seja, o data set está perfeitamente equilibrado, o que é um ponto positivo em geral, pois costuma gerar modelos menos enviesados.</p>
</div>

<figure style="text-align: center;">
    <img src="https://scikit-learn.org/0.15/_images/plot_out_of_core_classification_001.png" alt="graph" height=600 width=600>
    <figcaption><a style="font-size:20px" href:"/graph">Fonte</a></figcaption>
</figure>

<h1 style="text-align: center; font-size: 50px;">Parte 2 - Usando Vader</h1>

## Importando Vader e seus métodos

In [35]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

## Calculando scores

In [48]:
scores_compound = reviews["review_vader"].apply(lambda review: analyser.polarity_scores(review)["compound"])

In [53]:
y_pred_vader = scores_compound.apply(lambda score: int(score > 0))

## Avaliando Resultados

In [57]:
ACC = accuracy_score(y, y_pred_vader)
F1 = f1_score(y, y_pred_vader, average= 'binary')
print("ACC do Vader: ", ACC)
print("F1 do Vader: ", F1)
print("Confusion Matrix do Vader: \n", confusion_matrix(y, y_pred_vader))

ACC do Vader:  0.69528
F1 do Vader:  0.7382580312661055
Confusion Matrix do Vader: 
 [[13277 11723]
 [ 3513 21487]]


## Comparando Resultados

<div>
    <p>É interessante perceber que em certa medida o Vader funciona mesmo sendo um método de unsupervised learning para sentimentos. Os métodos supervisionados demosntram uma performance melhor, mas isso já é esperado, pois eles conseguem tirar insights melhores da relação das palavras com o contexto dos dados.</p>
</div>

<h1 style="text-align: center; font-size: 50px;">Parte 3 - Word2Vec</h1>

In [59]:
from gensim.models.phrases import Phrases, Phraser

## Tokenization

In [61]:
reviews["review"] = reviews["review"].apply(lambda review: nltk.word_tokenize(review))

In [62]:
reviews.loc[1, "review"][0:10]

['wonder',
 'littl',
 'product',
 'film',
 'techniqu',
 'unassum',
 'oldtimebbc',
 'fashion',
 'give',
 'comfort']

## Bigrams

In [63]:
phrases = Phrases(reviews["review"], min_count=30, progress_per=10000)

In [69]:
bigram = Phraser(phrases)

In [70]:
reviews_bigram = bigram[reviews["review"]]

### Visualizando

In [74]:
pd.DataFrame(reviews_bigram)[-4:-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391
49996,bad,plot,bad,dialogu,bad,act,idiot,direct,annoy,porn,...,,,,,,,,,,
49997,cathol,taught,parochi,elementari_school,nun,taught,jesuit,priest,high_school,colleg,...,,,,,,,,,,
49998,im,go,disagre,previou_comment,side,maltin,one,second,rate,excess,...,,,,,,,,,,


<div>
    <p>Aparentemente funcionou, pois existem entries na tabela anterior como <em>previou_comment</em>, <em>elementari_school</em> e <em>high_school</em>; tais entries representam exatamente o que um bigram deveria fazer

## Treinando o modelo

In [75]:
import multiprocessing

from gensim.models import Word2Vec

In [78]:
cores = multiprocessing.cpu_count()

### Definindo os parametros

In [79]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

### Construindo vocabulário

In [80]:
w2v_model.build_vocab(reviews_bigram, progress_per=10000)

### Treinando o modelo

In [81]:
w2v_model.train(reviews_bigram, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(92476976, 170088450)

## Resultados

In [83]:
w2v_model.init_sims(replace=True)

In [95]:
w2v_model.wv.most_similar(positive=["action"])

[('non_stop', 0.49957260489463806),
 ('plot', 0.4989018440246582),
 ('suspens', 0.49370312690734863),
 ('action_sequenc', 0.4884621500968933),
 ('fight_sequenc', 0.4802779257297516),
 ('action_flick', 0.4778958559036255),
 ('car_chase', 0.47372567653656006),
 ('charact_develop', 0.45061081647872925),
 ('fastpac', 0.44094353914260864),
 ('movi', 0.4331169128417969)]

Inesperado por mim: fight_sequenc, car_chase

In [94]:
w2v_model.wv.most_similar(positive=["comed"])

[('comic', 0.46979713439941406),
 ('comedi', 0.44748902320861816),
 ('humor', 0.40481036901474),
 ('physic_comedi', 0.35433775186538696),
 ('selfdeprec', 0.33975303173065186),
 ('screwbal', 0.3356996178627014),
 ('paresh_rawal', 0.3108751177787781),
 ('funni', 0.3070267140865326),
 ('wri', 0.30597561597824097),
 ('amus', 0.2996782660484314)]

Inesperado por mim: selfdeprec

In [98]:
w2v_model.wv.most_similar(positive=["romanc"])

[('drama', 0.39001351594924927),
 ('romant', 0.3715023994445801),
 ('stori', 0.3632856011390686),
 ('love', 0.362504780292511),
 ('love_triangl', 0.34583306312561035),
 ('love_affair', 0.34093087911605835),
 ('comedi_drama', 0.33483660221099854),
 ('relationship', 0.3266106843948364),
 ('fairytal', 0.32072120904922485),
 ('action', 0.3165622651576996)]

Inesperado por mim: relationship

In [102]:
w2v_model.wv.most_similar(positive=["cri"])

[('laugh_hard', 0.3940882384777069),
 ('laugh', 0.3785444498062134),
 ('tear_eye', 0.3728519082069397),
 ('tear', 0.3708687424659729),
 ('sob', 0.36907821893692017),
 ('brought_tear', 0.3541452884674072),
 ('laugh_cri', 0.34369102120399475),
 ('wail', 0.33986204862594604),
 ('shed_tear', 0.3362300992012024),
 ('winc', 0.33347487449645996)]

Relações muito boas em geral

In [103]:
w2v_model.wv.most_similar(positive=["laugh"])

[('funni', 0.5948459506034851),
 ('joke', 0.5530734658241272),
 ('chuckl', 0.5466166734695435),
 ('laugh_loud', 0.5356996655464172),
 ('laugh_hard', 0.5257227420806885),
 ('roll_floor', 0.5145230293273926),
 ('watch', 0.49729645252227783),
 ('laughter', 0.4825011193752289),
 ('laugh_riot', 0.4814654588699341),
 ('giggl', 0.47414928674697876)]

In [105]:
w2v_model.wv.most_similar(positive=["star_war"])

[('lotr', 0.4261627495288849),
 ('return_jedi', 0.40652352571487427),
 ('georg_luca', 0.395448237657547),
 ('spacebal', 0.3793693780899048),
 ('indiana_jone', 0.3690086603164673),
 ('babylon', 0.3619828224182129),
 ('jurass_park', 0.3529084324836731),
 ('twilight_zone', 0.3504769206047058),
 ('princess_bride', 0.3477371335029602),
 ('final_fantasi', 0.344648540019989)]

esperava mais das relações encontradas para star_war, mas georg_luca foi uma boa e da onde será que veio babylon?? kkkk

In [107]:
w2v_model.wv.most_similar(positive=["corn"])

[('jelli', 0.38463160395622253),
 ('tin', 0.3796437382698059),
 ('cornfield', 0.35088789463043213),
 ('scarecrow', 0.34904932975769043),
 ('syrup', 0.3462121784687042),
 ('teeni', 0.342457115650177),
 ('microwav', 0.3372858166694641),
 ('maze', 0.3185432553291321),
 ('poop', 0.3151231110095978),
 ('goo', 0.309421181678772)]

Inesperado por mim: microwav

In [108]:
w2v_model.wv.doesnt_match(['indiana_jone', 'coke', 'corn'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'indiana_jone'

o modelo entende que indiana_jone não é comida... muito bom

In [120]:
w2v_model.wv.doesnt_match(["star", "star_trek", "star_war", "indiana_jone"])

'star'

Mais um exemplo de que o modelo diferenciou bem. Mesmo sendo "star" uma palavra que está no nome de dois dos três filmes listados, o modelo percebe que ela não está na mesma categoria.

In [140]:
w2v_model.wv.most_similar(positive=["strang"])

[('weird', 0.46301451325416565),
 ('bizarr', 0.43375900387763977),
 ('odd', 0.3781774640083313),
 ('dream_sequenc', 0.36487138271331787),
 ('freakish', 0.33587783575057983),
 ('dreamlik', 0.3347778916358948),
 ('otherworldli', 0.33199626207351685),
 ('hallucin', 0.32091331481933594),
 ('unexplain', 0.3205890655517578),
 ('mysteri', 0.3132975995540619)]

Otimos sinonimos

In [123]:
w2v_model.wv.most_similar(positive=["happi", "cri"], negative=["sad"], topn=3)

[('die', 0.31040921807289124),
 ('care_bear', 0.27467185258865356),
 ('hug', 0.2674460709095001)]

Bizarro...