## Notebook setup, imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
from math import *
import os
import requests
import re
import string
from sklearn.preprocessing import LabelEncoder
from wordcloud import STOPWORDS


# Let pandas show up to 1000 rows and 1000 columns when displaying a frame.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Folder holding the input files; list it to confirm what is available.
DATA_FOLDER = '../input'
print(os.listdir(DATA_FOLDER))


def read_csv(name, index_col=None):
    """Load a pipe-delimited CSV file from ``DATA_FOLDER`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name, joined onto ``DATA_FOLDER``.
    index_col : optional
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    csv_path = os.path.join(DATA_FOLDER, name)
    return pd.read_csv(
        csv_path,
        sep='|',
        index_col=index_col,
        low_memory=False,
    )

['movies_merged.csv']


# Drop rows with NaN values and encode Fresh/Rotten as 1/0


In [3]:
def process_df(name):
    """Load a review file and convert its labels to numeric targets.

    Rows containing any missing value are dropped, then in the ``Review``
    column ``'Fresh'`` is replaced by ``1`` and ``'Rotten'`` by ``0``.
    Any other value in that column is left untouched.

    Parameters
    ----------
    name : str
        File name passed to ``read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, keeping the original row index of the rows
        that survived ``dropna``.
    """
    label_codes = {'Fresh': 1, 'Rotten': 0}
    # Build the result by assignment rather than
    # ``df['Review'].replace(..., inplace=True)``: an in-place call on a
    # column selection acts on an intermediate Series and does not update
    # the frame when pandas Copy-on-Write is active.
    return (
        read_csv(name)
        .dropna()
        .assign(Review=lambda frame: frame['Review'].replace(label_codes))
    )

# Load the cleaned reviews and preview the first rows.
df = process_df('movies_merged.csv')
df.head()

Unnamed: 0,Text,Review
0,Too few films take on the art of arguing as a ...,1
1,The film leaves a tremendous impact.,1
3,From 1957 and first-time director Sidney Lumet...,1
4,"Mechanically written, but within its own middl...",1
5,A strangely realistic thriller.,1


# Training

In [4]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [5]:
MAX_WORDS = 1000  # used as Tokenizer(num_words=...) and as the Embedding input dimension
MAX_LEN = 150  # maxlen given to pad_sequences; also the size of the model's input vector


def get_X_from_df(df):
    """Return the ``Text`` column of ``df``, used as the model input."""
    return df.Text

def get_Y_from_df(df):
    """Return the ``Review`` column of ``df``, used as the training target."""
    return df.Review

def get_sequence_matrix_from_X(X, tok, fit=True):
    """Encode texts as a padded integer matrix using tokenizer ``tok``.

    Parameters
    ----------
    X : collection of str
        Texts to encode (this notebook passes the ``Text`` column).
    tok : tokenizer
        Object providing ``fit_on_texts`` and ``texts_to_sequences``.
    fit : bool, default True
        When true, ``tok`` is fitted on ``X`` before encoding, which is the
        original behaviour. Pass ``False`` to encode held-out texts with an
        already fitted tokenizer, so its vocabulary is not updated from
        data the model must not see.

    Returns
    -------
    The result of ``sequence.pad_sequences`` called with ``maxlen=MAX_LEN``.
    """
    if fit:
        tok.fit_on_texts(X)
    encoded = tok.texts_to_sequences(X)
    return sequence.pad_sequences(encoded, maxlen=MAX_LEN)


# Hold-out configuration for the train/test split.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load the cleaned reviews and separate the texts from their labels.
sar_acc = process_df('movies_merged.csv')
X = get_X_from_df(sar_acc)
Y = get_Y_from_df(sar_acc)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

def RNN():
    """Build the (uncompiled) review classifier.

    Layers, in order: Embedding(MAX_WORDS, 50) -> LSTM(64) -> Dense(256)
    -> ReLU -> Dropout(0.2) -> Dense(1, 'out_layer') -> sigmoid.

    Returns
    -------
    Model
        Maps an input vector of size ``MAX_LEN`` to one sigmoid output.
    """
    # Model input: a vector of size MAX_LEN.
    inputs = Input(name='inputs', shape=[MAX_LEN])

    # Layers are constructed here in the order they are applied below.
    stack = (
        # Turns positive integers into dense vectors of size 50.
        Embedding(MAX_WORDS, 50, input_length=MAX_LEN),
        LSTM(64),
        Dense(256),
        Activation('relu'),
        Dropout(0.2),
        Dense(1, name='out_layer'),
        Activation('sigmoid'),
    )

    tensor = inputs
    for layer in stack:
        tensor = layer(tensor)

    return Model(inputs=inputs, outputs=tensor)

# Build the network, print its layer table, then compile it with a
# binary cross-entropy loss and an accuracy metric.
model = RNN()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

# Encode the training texts as padded sequences; the helper also fits `tok`.
tok = Tokenizer(num_words=MAX_WORDS)
X_train_sequences = get_sequence_matrix_from_X(X_train, tok)

# validation_split=0.1 holds back part of the training data for validation;
# EarlyStopping monitors the resulting val_loss.
model.fit(
    X_train_sequences,
    Y_train,
    batch_size=100,
    epochs=5,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

<keras.callbacks.History at 0x7ff86904b2e8>

In [6]:
# Encode the held-out texts with the tokenizer `tok` created for training,
# padding them to the same length as the training sequences.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=MAX_LEN)

# With the compile settings used above, evaluate() yields [loss, accuracy].
accr = model.evaluate(test_sequences_matrix, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

Test set
  Loss: 0.215
  Accuracy: 0.933
