#### Análise de Sentimentos

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score
import pickle

#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
data = pd.read_csv('IMDB-Dataset.csv')
data.shape
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
#data.review[0]
#data.sentiment[1]

In [6]:
data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Removendo as tags html

#### regex rule: '<.*?>'

In [7]:
# A função recebe o texto
# compilando o regex
# Limpando e aplicando espaço nos trechos

def clean(txt):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned, '', txt)

#exemplo de limpeza:
data.review = data.review.apply(clean)

#mostrando
#data.review[0]

In [8]:
#detectando caracteres
def is_special(txt):
    rem = ''
    for i in txt:
        #se for alfanumérico continua
        if i.isalnum():
            rem = rem + i
        #senão, add o espaço
        else:
            rem = rem + ' '
    return rem

#aplicando a função
data.review = data.review.apply(is_special)

#exemplificando
#data.review[0]

In [9]:
# Tudo em minúsculo
def lower(txt):
    return txt.lower()

data.review = data.review.apply(lower)
data.review[0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you ll be hooked  they are right  as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence  which set in right from the word go  trust me  this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs  sex or violence  its is hardcore  in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  em city is home to many  aryans  muslims  gangstas  latinos  christians  italians  irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wo

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

def rem_stopwords(txt):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(txt)
    #se a palavra não esta entre aquelas sem valor, ela será retornada
    return [w for w in words if w not in stop_words]

data.review = data.review.apply(rem_stopwords)
#data.review[0]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cyrog\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cyrog\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def stem_txt(txt):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in txt])

data.review = data.review.apply(stem_txt)
data.review[0]

'one review mention watch 1 oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

#### Criando o modelo
  1. Creating BAG OF WORDS


In [12]:
x = np.array(data.iloc[:0].values)
y = np.array(data.sentiment.values)
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(data.review).toarray()
print("X.shape = ", X.shape)
print("Y.shape = ", y.shape)

X.shape =  (50000, 1000)
Y.shape =  (50000,)


In [13]:
x_train,x_test,y_train, y_test= train_test_split(X, y, test_size=0.3,random_state=9)
print("Train shapes: y={} | y={} ".format(x_train.shape, y_train.shape))  
print("Test shapes: y={} | y={} ".format(x_test.shape, y_test.shape))  

Train shapes: y=(35000, 1000) | y=(35000,) 
Test shapes: y=(15000, 1000) | y=(15000,) 


In [14]:
gnb, mnb, bnb = GaussianNB(), MultinomialNB(alpha=1.0, fit_prior=True), BernoulliNB(alpha=1.0, fit_prior=True)
gnb.fit(x_train, y_train)
mnb.fit(x_train, y_train)
bnb.fit(x_train, y_train)

BernoulliNB()

In [15]:
ypg = gnb.predict(x_test)
mpg = mnb.predict(x_test)
bpg = bnb.predict(x_test)

In [16]:
print("Gaussian: ", accuracy_score(y_test,ypg))
print("Multinomial: ", accuracy_score(y_test,mpg))
print("Bernoulli: ", accuracy_score(y_test,bpg))

Gaussian:  0.7837333333333333
Multinomial:  0.8292
Bernoulli:  0.8358666666666666


In [17]:
pickle.dump(bnb, open('modell.pkl','wb'))

# Usando a máquina para avaliar o sentimento do cliente.



In [18]:
# Dragon Evolution KKKKKK

rev = """For the love of god, if you respect the original series at all, do NOT pay to see this crap!
Let me just start off by saying that there are no redeeming qualities to this movie at all. The multi-million dollar budget has amounted to a teen drama mixed with power rangers. The final product is insulting, to say the least.

The creators of this horrible, horrible adaption should take a good long think about what they have done and why they should have just followed the source material instead of taken it for granted. Do they really think that the entire Dragonball fan-base is this stupid? Akira Toriyama probably has a noose around his neck right about now.

To finish this off: The acting is atrocious, the butt-rock scattered through the soundtrack will make you want to punch the guy running the theater in the face, and the story is basically the manga's, but cut up, put into a blender, stuffed into sausage skins, and stitched back together with an American flag tied around it.

If you want your Dragonball fix, just buy dvds of the show instead, skip the movie, and if you don't skip it, then at least don't pay for this thing. I heard that a sequel is already in the works, and we don't need a trilogy, that would be three of these movies too many."""

In [19]:
f1 = clean(rev)
f2 = is_special(f1)
f3 = lower(f2)
f4 = rem_stopwords(f3)
f5 = stem_txt(f4)

In [20]:
bow, words = [], word_tokenize(f5)

for word in words:
  bow.append(words.count(word))

word_dict = cv.vocabulary_
pickle.dump(word_dict, open('bow.pkl', 'wb'))

inp = []
for i in word_dict:
  inp.append(f5.count(i[0]))

y_prep = bnb.predict(np.array(inp).reshape(1,1000))

# Resultado 

In [21]:
y_prep

array(['negative'], dtype='<U8')