# Importando Pacotes

In [73]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
vectorizer = CountVectorizer()
stop = stopwords.words('english')
stemming = PorterStemmer()
transformer = TfidfTransformer(smooth_idf=False)

[nltk_data] Downloading package punkt to /home/gustavo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lendo Arquivos .csv

In [3]:
reviews = pd.read_csv('IMDB_Dataset.csv/IMDB_Dataset.csv')

## Fazendo breve preview

In [4]:
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Parte 1-1

## a) Removendo tags de html com BeautifulSoup

In [5]:
reviews["review"] = reviews["review"].apply(lambda review: BeautifulSoup(review, 'html.parser').get_text())

### Mostrando exemplo

In [6]:
reviews.loc[1, "review"]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

## b) Selecionando apenas letras

In [7]:
reviews["review"] = reviews["review"].apply(lambda review: "".join(re.findall(r"[a-zA-Z | ]", review)))

### Mostrando exemplo

In [8]:
reviews.loc[1, "review"]

'A wonderful little production The filming technique is very unassuming very oldtimeBBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece A masterful production about one of the great masters of comedy and his life The realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears It plays on our knowledge and our senses particularly with the scenes concerning Orton and Halliwell and the sets particularly of their flat with Halliwells murals decorating every surface are terribly well done'

## c) Todas as letras minúsculas

In [9]:
reviews["review"] = reviews["review"].apply(lambda review: review.lower())

### Mostrando exemplo

In [10]:
reviews.loc[1, "review"]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

## d) Tirar Stopwords

### Tokenization

In [11]:
reviews["review"] = reviews["review"].apply(lambda review: nltk.word_tokenize(review))

In [12]:
reviews.loc[1, "review"][0:10]

['a',
 'wonderful',
 'little',
 'production',
 'the',
 'filming',
 'technique',
 'is',
 'very',
 'unassuming']

### Tirando Stop Words

In [13]:
reviews ["review"] = reviews["review"].apply(lambda review: [item for item in review if item not in stop])

### Mostrando exemplo

In [14]:
reviews.loc[1, "review"][0:10]

['wonderful',
 'little',
 'production',
 'filming',
 'technique',
 'unassuming',
 'oldtimebbc',
 'fashion',
 'gives',
 'comforting']

## e) Stemming

In [15]:
reviews["review"] = reviews["review"].apply(lambda review: [stemming.stem(item) for item in review])

### Mostrando Exemplo de Texto Processado

In [16]:
print(" ".join(reviews.loc[1, "review"]))

wonder littl product film techniqu unassum oldtimebbc fashion give comfort sometim discomfort sens realism entir piec actor extrem well chosen michael sheen got polari voic pat truli see seamless edit guid refer william diari entri well worth watch terrificli written perform piec master product one great master comedi life realism realli come home littl thing fantasi guard rather use tradit dream techniqu remain solid disappear play knowledg sens particularli scene concern orton halliwel set particularli flat halliwel mural decor everi surfac terribl well done


# Parte 1-2

## Transformando em string novamente (Detokenization (https://en.wiktionary.org/wiki/detokenize kkk...))

In [17]:
reviews["review"] = reviews["review"].apply(lambda review: " ".join(review))

### Resultado

In [18]:
reviews.loc[2, "review"]

'thought wonder way spend time hot summer weekend sit air condit theater watch lightheart comedi plot simplist dialogu witti charact likabl even well bread suspect serial killer may disappoint realiz match point risk addict thought proof woodi allen still fulli control style mani us grown lovethi id laugh one woodi comedi year dare say decad ive never impress scarlet johanson manag tone sexi imag jump right averag spirit young womanthi may crown jewel career wittier devil wear prada interest superman great comedi go see friend'

## Usando CountVectorizer

In [19]:
bag_of_words = vectorizer.fit_transform(reviews["review"])

### Vendo a frequência de algumas palavras

In [20]:
vectorizer.get_feature_names()[1000:1015]

['across',
 'acrossdirect',
 'acrossgo',
 'acrosshowev',
 'acrossoveral',
 'acrossth',
 'acrosstheboard',
 'acrossthepond',
 'acrossther',
 'acrossy',
 'acrown',
 'acroyd',
 'acryl',
 'act',
 'actaft']

In [21]:
len(nltk.word_tokenize(reviews.loc[40, "review"]))

57

In [22]:
bag_of_words[40].toarray().sum(1)

array([57])

#### Parece ter funcionando

### Normalizando

In [23]:
tfidf = transformer.fit_transform(bag_of_words)

# Parte 1-3

## Criando labels

In [53]:
print(tfidf.shape)
print(reviews["sentiment"].shape)

(50000, 175062)
(50000,)


In [54]:
X = tfidf
y = reviews["sentiment"]

### Transformando sentimentos em em binário

In [55]:
y = (y == 'positive')
y = y.apply(lambda sentiment: sentiment + 0)
y[0:10]

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
8    0
9    1
Name: sentiment, dtype: int64

## Separando em teste e treino e treinando os modelos

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf_mult = MultinomialNB()
clf_pac = PassiveAggressiveClassifier()
clf_mult.partial_fit(X_train, y_train, classes = [0,1])
clf_pac.partial_fit(X_train, y_train, classes = [0,1])

PassiveAggressiveClassifier()

## Gerando e avaliando previsões

In [77]:
y_pred_mult = clf_mult.predict(X_test)
y_pred_pac = clf_pac.predict(X_test)

In [78]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

### Naïve Bayes

In [81]:
ACC = accuracy_score(y_test, y_pred_mult)
F1 = f1_score(y_test, y_pred_mult, average= 'binary')
print("ACC do NB: ", ACC)
print("F1 do NB: ", F1)
print("Confusion Matrix do NB: \n", confusion_matrix(y_test, y_pred_mult))

ACC do NB:  0.8556969696969697
F1 do NB:  0.8543641812954921
Confusion Matrix do NB: 
 [[7135 1073]
 [1308 6984]]


### Passive Agressive Classifier (https://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf)

In [82]:
ACC = accuracy_score(y_test, y_pred_pac)
F1 = f1_score(y_test, y_pred_pac, average= 'binary')
print("ACC do PAC: ", ACC)
print("F1 do PAC: ", F1)
print("Confusion Matrix do PAC: \n", confusion_matrix(y_test, y_pred_pac))

ACC do PAC:  0.884
F1 do PAC:  0.8863690334837331
Confusion Matrix do PAC: 
 [[7121 1087]
 [ 827 7465]]


In [101]:
pd.DataFrame(y).mean()

sentiment    0.5
dtype: float64

<h3 style="text-align: center;">Conclusões</h3>
<div style="margin-top:10px;">
    <p>O modelo que obteve melhores resultados foi o PAC (os quais foram rasoavelmente satisfatórios e equilibrados). Isso já era esperado pois como mostrado no gráfico abaixo, o PAC costuma performar melhor que o NB para um número grande de training examples.</p>
    <p>Vale ressaltar que a média dos dados é 0.5, ou seja, o data set está perfeitamente equilibrado, o que é um ponto positivo em geral, pois costuma gerar modelos menos enviesados.</p>
</div>

<figure style="text-align: center;">
    <img src="https://scikit-learn.org/0.15/_images/plot_out_of_core_classification_001.png" alt="graph" height=600 width=600>
    <figcaption><a style="font-size:20px" href:"/graph">Fonte</a></figcaption>
</figure>