In [1]:
import os
# Sanity check: show which entries exist under the ../input directory.
print(os.listdir("../input"))

import pandas as pd
import numpy as np
import bs4
import re
from unicodedata import normalize
from nltk.corpus import stopwords
import operator
from keras.preprocessing import sequence
from tqdm import tqdm_notebook, tqdm

['imdb-dataset', 'sentiment-labelled-sentences-data-set']


Using TensorFlow backend.


In [2]:
# Load the labelled, tab-separated IMDB training reviews for inspection.
imdb_df = pd.read_csv('../input/imdb-dataset/labeledTrainData.tsv', sep='\t')

In [3]:
# Preview the first rows (columns: id, sentiment, review).
imdb_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


<h1>Processamento dos Comentários</h1>

In [4]:
def _review_to_words(review, stops):
    """Turn one raw review into a list of lowercase, stopword-free words.

    Args:
        review: Raw review text, possibly containing HTML markup.
        stops: Container of stopwords to drop (a set gives O(1) lookups).

    Returns:
        The remaining words, in their original order.
    """
    # Strip HTML tags
    review_text = bs4.BeautifulSoup(review, 'html.parser').get_text()

    # Replace special characters, punctuation and digits with spaces
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)

    # Lowercase and split on whitespace
    review_words = review_text.lower().split()

    return [word for word in review_words if word not in stops]


def preprocessing_data(filepath, topwords, maxlen):
    """Load a tab-separated review file and encode it for an embedding layer.

    Each review is cleaned (HTML stripped, non-letters removed, lowercased,
    English stopwords dropped), every word is replaced by an integer id and
    the resulting sequences are padded/truncated to a fixed length.

    Id assignment: words are ranked by corpus frequency. The most frequent
    word gets id ``topwords - 1``, the next one ``topwords - 2`` and so on;
    every word outside the top ``topwords`` gets id 0. Note that the
    ``topwords``-th most frequent word therefore also receives id 0, so id 0
    is shared by that word, all rarer words and the padding value.

    Args:
        filepath: Path to a TSV file with ``review`` and ``sentiment`` columns.
        topwords: Number of most frequent words that receive a ranked id.
        maxlen: Length every encoded review is padded/truncated to.

    Returns:
        A tuple ``(processed_data, sentiment, word_to_id)``:
        ``processed_data`` is the 2-D array returned by
        ``sequence.pad_sequences`` (one row per review, ``maxlen`` columns),
        ``sentiment`` is the ``sentiment`` column as a NumPy array, and
        ``word_to_id`` maps every word seen in the corpus to its id.
    """
    imdb_data = pd.read_csv(filepath, delimiter='\t')

    # Build the stopword set once: the list is loop-invariant and set
    # membership is O(1) instead of a linear scan per word.
    stops = set(stopwords.words('english'))

    # Wrapping the iterable lets tqdm update and close the bar by itself.
    reviews = [
        _review_to_words(review, stops)
        for review in tqdm_notebook(imdb_data['review'], total=len(imdb_data))
    ]

    # Build the word-frequency dictionary
    freq_dict = {}
    for review in reviews:
        for word in review:
            freq_dict[word] = freq_dict.get(word, 0) + 1

    # Rank words by frequency, most frequent first (stable for ties)
    sorted_tup = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)

    word_to_id = {}
    # Top-K words: ids count down from topwords - 1 to 0
    for rank, (word, _) in enumerate(sorted_tup[:topwords]):
        word_to_id[word] = topwords - 1 - rank
    # Everything else collapses to id 0
    for word, _ in sorted_tup[topwords:]:
        word_to_id[word] = 0

    # Map every word of every review to its id
    processed_data = [[word_to_id[word] for word in review] for review in reviews]

    # Pad/truncate the reviews. The ragged list of lists is passed directly:
    # converting it with np.asarray first fails on recent NumPy versions.
    processed_data = sequence.pad_sequences(processed_data, maxlen=maxlen)

    # Series.get_values() was removed in pandas 1.0; .values works everywhere.
    sentiment = imdb_data['sentiment'].values

    return processed_data, sentiment, word_to_id

In [5]:
# Encode the training set: ranked ids for the 5000 most frequent words,
# reviews padded/truncated to 100 tokens.
data, target, word_to_id = preprocessing_data(
    filepath='../input/imdb-dataset/labeledTrainData.tsv',
    topwords=5000,
    maxlen=100,
)

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

In [6]:
# Peek at the first encoded review and its sentiment label.
data[0], target[0]

(array([3500, 4612,    0, 4835, 1038,    0, 4885, 4373, 4501, 4921, 4996,
           0, 3570, 4620, 2837, 4886, 3081, 2497, 4426, 4983, 4940, 4900,
         125,    0, 4740, 3732,    0, 4985, 4426, 4507, 4256, 4363, 4369,
        4997, 4606, 4836, 4554, 4886, 4385, 1734, 3840, 4316, 4952, 3825,
        4776, 4999, 4984, 4996,    0, 4997, 4493, 4938, 4975, 4984, 4360,
        4867, 4769, 4905,    0, 4400, 1561,    0,    0, 3136, 4999, 4872,
        4658, 3558, 4753, 4997, 4135, 4984, 4958, 3513, 4003, 2667, 4988,
        4451, 4614, 4283,    0, 4988, 4959, 4984, 4842, 4638,  608, 1612,
        4959, 4913, 4775, 4562, 4793, 4746, 4883, 4997,    0,    0, 4684,
        3644], dtype=int32), 1)

<h1>Criando o Modelo</h1>

In [7]:
from keras.models import Model
from keras.layers import *

In [8]:
# Derive the model dimensions from the preprocessed data instead of repeating
# the literals passed to preprocessing_data, so the two cannot drift apart
# (an id >= input_dim would make the Embedding lookup fail).
seq_len = data.shape[1]                      # padded review length
vocab_size = max(word_to_id.values()) + 1    # ids span 0 .. vocab_size - 1

input_node = Input(shape=(seq_len,))

# Word ids -> 32-dimensional learned embeddings
embedding = Embedding(input_dim=vocab_size,
                      input_length=seq_len,
                      output_dim=32)(input_node)
embedding_dropout = Dropout(0.5)(embedding)
lstm_1 = LSTM(100)(embedding_dropout)
dropout = Dropout(0.5)(lstm_1)
# Single sigmoid unit: output is the predicted probability of sentiment == 1
fc1 = Dense(1, activation='sigmoid')(dropout)

model = Model(input_node, fc1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Binary cross-entropy pairs with the model's single sigmoid output.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Fix the seed so the split, and every metric derived from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    test_size=0.33,
                                                    random_state=42)

In [12]:
from keras.callbacks import *

In [13]:
# Stop training once val_loss has not improved by at least 1e-6 for three
# consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=1e-6,
)

In [14]:
# Callbacks passed to model.fit.
cb_list = [early_stopping]

In [15]:
# Train for at most 20 epochs; the held-out split serves as validation data
# so the callbacks can monitor val_loss.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=cb_list,
)

Train on 16750 samples, validate on 8250 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x7f03b80a0a20>

<h2>Testar nova entrada</h2>

In [16]:
new_review = 'This movie was awesome'

# Strip HTML tags
review_text = bs4.BeautifulSoup(new_review, 'html.parser').get_text()
# Replace special characters, punctuation and digits with spaces
review_text = re.sub('[^a-zA-Z]', ' ', review_text)
# Lowercase
review_text = review_text.lower()
# Tokenize on whitespace
review_words = review_text.split()
# Drop stopwords (a set gives O(1) membership tests)
stops = set(stopwords.words('english'))

meaningful_words = [word for word in review_words if word not in stops]

# A new review may contain words never seen during training; those are
# missing from word_to_id, so fall back to id 0 (the id already used for
# out-of-vocabulary words) instead of raising KeyError.
processed_new_reviews = [word_to_id.get(word, 0) for word in meaningful_words]

# Wrap in a list to form a batch of one and pad to the training sequence
# length; this also works when every word was filtered out.
processed_data = sequence.pad_sequences([processed_new_reviews],
                                        maxlen=data.shape[1])

In [18]:
# Inspect the padded id sequence built for the new review.
processed_data

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 4999,
        3970]], dtype=int32)

In [17]:
# Model output for the single padded review (first and only row of the batch).
y_pred = model.predict(processed_data)[0]

# Round the sigmoid output to the nearest class: 1 -> positive, else negative.
sent = 'positivo' if np.round(y_pred) == 1 else 'negativo'

print('A predição do sentimento para a entrada \"{}\" é {}'.format(new_review, sent))

A predição do sentimento para a entrada "This movie was awesome" é positivo


In [19]:
# Raw sigmoid output behind the label printed above.
y_pred

array([0.5665269], dtype=float32)