# Ejemplo de RNN-Simple y LSTM - Large Movie Review Dataset - Imdb

http://ai.stanford.edu/~amaas/data/sentiment/ 

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os

In [0]:
path_imdb = "/content/drive/My Drive/data/aclImdb/"

In [0]:
train_dir = os.path.join(path_imdb, 'train')

In [0]:
labels = []
texts = []

In [0]:
for label_type in ['neg', 'pos']:
  dir_label_type = os.path.join(train_dir, label_type)
  for name in os.listdir(dir_label_type):
    if fname[-4:] == '.txt':
      f = open(os.path.join(dir_label_type, fname))
      texts.append(f.read())
      f.close()
      if label_type == 'neg':
        labels.append(0)
      else:
        labels.append(1)


#Tokenizing Data

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [0]:
maxlen = 100  # usamos solo las primeras 100 palabras de cada documento-reseña.
training_samples = 200  # Por simplicidad entrenamos solo con pocos casos.
validation_samples = 10000  # validation set
max_words = 10000  # tamaño del vocabulario a considerar.

In [0]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [0]:
word_index = tokenizer.word_index
print(len(word_index))

Found 0 unique tokens.


In [0]:
data = pad_sequences(sequences, maxlen=maxlen)

In [0]:
labels = np.asarray(labels)

In [0]:
print(data.shape)
print(labels.shape)

In [0]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [0]:
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_test = data[training_samples: training_samples + validation_samples]
y_test = labels[training_samples: training_samples + validation_samples]

In [0]:
vocab_size = 1000 # 10000 # total de palabras deiferentes a considerar.
n_epochs = 10
n_batch = 128
embedded_vector_sz = 32 # tamaño de los word embeddings.

In [0]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedded_vector_sz = 32))
model.add(SimpleRNN(32))
#model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

In [0]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
print(model.summary())

In [0]:
H = model.fit(x_train, y_train,
                    epochs=n_epochs,
                    batch_size=n_batch,
                    validation_split=0.2)

Recuerda, en el caso de usar validation split, estás considerando que el conjunto total de datos está dividido en Training, Validation y Testing.

In [0]:
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0,n_epochs),H.history["accuracy"],label="train_acc")
plt.plot(np.arange(0,n_epochs),H.history["val_accuracy"],label="val_acc")
plt.title("Training and Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

plt.figure()
plt.plot(np.arange(0,n_epochs),H.history["loss"],label="train_loss") 
plt.plot(np.arange(0,n_epochs),H.history["val_loss"],label="val_loss")
plt.title("Training and Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

In [0]:
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [0]:
print('Accuracy: %f' % (accuracy*100))

In [0]:
print('Loss: %f' % (loss))