# IMDB (Internet Movie Database)

## [Keras Dataset](https://keras.io/datasets/)

IMDB Movie reviews sentiment classification

- 25.000 reviews de películas de IMDB
- Etiquetadas por sentimiento (Positivo/Negativo)
- Los reviews ya están preprocesados
- Cada review está codificado como una secuencia de índices de palabras (integers)
- Los índices están ordenados por frecuencia. Es decir, la palabra que tiene el índice 3 es la 3.ª palabra más frecuente.
- 0 no es el índice de una palabra específica: está reservado para el padding. Las palabras desconocidas (fuera del vocabulario) se codifican con `oov_char`, que por defecto en `imdb.load_data` es 2.

In [2]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.datasets import imdb
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from fnn_helper import PlotLosses
%matplotlib inline

np.random.seed(42)

In [3]:
# Embedding
max_features = 20000   # passed to imdb.load_data as num_words (vocabulary cap)
maxlen = 400           # length every review is padded/truncated to
max_words = maxlen     # Embedding input_length; derived so it cannot drift from maxlen
embedding_size = 32

# Convolution
kernel_size = 3
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
# NOTE: batch_size is highly sensitive.
batch_size = 32
epochs = 3

print('Loading data...')

(x, y), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Hold out the last 40% of the original training split for validation.
validation = 0.40
N_validation_split = int(x.shape[0]*(1-validation))
print(N_validation_split)

# Training set
x_train = x[:N_validation_split]
y_train = y[:N_validation_split]

# Validation set
x_val = x[N_validation_split:]
y_val = y[N_validation_split:]

print(len(x_train), 'train sequences')
print(len(x_val), 'val sequences')
print(len(x_test), 'test sequences')

# Show one raw (still unpadded) review and its label.
print('Review:')
print(x_train[0])
print()
print('Sentimiento:')
print(y_train[0])


Loading data...
15000
15000 train sequences
10000 val sequences
25000 test sequences
Pad sequences (samples x time)
Review:
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1

In [4]:
# Pad/truncate every split to `maxlen` tokens so each one becomes a
# rectangular (samples, maxlen) array.
print('Pad sequences (samples x time)')
x_train, x_val, x_test = (
    sequence.pad_sequences(_split, maxlen=maxlen)
    for _split in (x_train, x_val, x_test)
)
for _label, _padded in (
    ('x_train shape:', x_train),
    ('x_val shape:', x_val),
    ('x_test shape:', x_test),
):
    print(_label, _padded.shape)


Pad sequences (samples x time)
x_train shape: (15000, 400)
x_val shape: (10000, 400)
x_test shape: (25000, 400)


In [5]:
# One-hot encode the labels of each split into `num_classes` columns.
num_classes = 2
y_train_categorical, y_val_categorical, y_test_categorical = (
    keras.utils.to_categorical(_labels, num_classes)
    for _labels in (y_train, y_val, y_test)
)
for _encoded in (y_train_categorical, y_val_categorical, y_test_categorical):
    print(_encoded.shape)

(15000, 2)
(10000, 2)
(25000, 2)


In [6]:
print('Build model 1.')

# Model 1: Embedding -> Dropout -> LSTM -> Dropout -> ReLU -> 2-unit softmax.
model_1 = Sequential()
model_1.add(Embedding(max_features, embedding_size, input_length=max_words))
model_1.add(Dropout(0.2))
model_1.add(LSTM(lstm_output_size))
model_1.add(Dropout(0.2))
model_1.add(Activation('relu'))
# Two softmax units + categorical cross-entropy, matching the one-hot labels.
model_1.add(Dense(2, activation='softmax', kernel_initializer='normal', name='salida'))
model_1.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model_1.summary()

Build model 1.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 32)           640000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 70)                28840     
_________________________________________________________________
dropout_2 (Dropout)          (None, 70)                0         
_________________________________________________________________
activation_1 (Activation)    (None, 70)                0         
_________________________________________________________________
salida (Dense)               (None, 2)                 142       
Total params: 668,982
Trainable params: 668,982
Non-trainable params: 0
_______________________________________________________

In [8]:
# Callbacks for model 1 (ModelCheckpoint is imported in the top import cell).
# Plotting callback from the local fnn_helper module; it is handed the validation split.
plot_losses1 = PlotLosses(plot_interval=1, evaluate_interval=20, x_val=x_val, y_val_categorical=y_val_categorical)
# Checkpoint callback: with save_best_only=True the file is only overwritten
# when the monitored quantity improves (Keras monitors val_loss by default).
checkpointer1 = ModelCheckpoint(filepath='model_1.imdb.hdf5', verbose=1, save_best_only=True)

In [None]:
# Train model 1 on the training split, validating on the held-out split,
# then report accuracy on the test set.
epochs = 3  # epochs=20
model_1.fit(
    x_train,
    y_train_categorical,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val_categorical),
    callbacks=[plot_losses1, checkpointer1],
)
scores_1 = model_1.evaluate(x_test, y_test_categorical, verbose=2)
_accuracy_pct_1 = scores_1[1] * 100
print("Accuracy: %.2f%%" % (_accuracy_pct_1,))

Train on 15000 samples, validate on 10000 samples
Begin training
Epoch 1/3


In [None]:
# Weight analysis of the output layer.
# model_1.get_weights() is one flat list across all layers, so entries 0 and 1
# are the Embedding matrix and the LSTM input kernel, not a (kernel, bias)
# pair. Asking the 'salida' Dense layer directly returns exactly [kernel, bias].
weights, biases = model_1.get_layer('salida').get_weights()
print(weights.shape)
print(biases)

In [None]:
# Restore the weights saved by the checkpoint callback and score them on the test set.
model_1.load_weights('model_1.imdb.hdf5')
score_1 = model_1.evaluate(x_test, y_test_categorical, verbose=0)
for _label, _value in zip(("loss: ", "accuracy: "), score_1):
    print(_label, _value)

In [None]:
print('Build model 2.')

# Model 2: Embedding -> Dropout -> ReLU -> Flatten -> Dense(1) -> 2-unit softmax.
model_2 = Sequential()
model_2.add(Embedding(max_features, embedding_size, input_length=max_words))
model_2.add(Dropout(0.2))
model_2.add(Activation('relu'))
# The embedding output is (batch, max_words, embedding_size). Without
# flattening, the Dense layers act per time step and the model emits
# (batch, max_words, 2), which cannot be fitted against (batch, 2) labels.
model_2.add(Flatten())
model_2.add(Dense(1, activation='sigmoid'))
model_2.add(Dense(2, activation='softmax', kernel_initializer='normal', name='salida'))
rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=1e-3)
model_2.compile(loss = 'categorical_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
# summary() prints the table itself and returns None; no print() wrapper needed.
model_2.summary()

In [None]:
# Callbacks for model 2 (ModelCheckpoint is imported in the top import cell).
# Plotting callback from the local fnn_helper module; it is handed the validation split.
plot_losses2 = PlotLosses(plot_interval=1, evaluate_interval=20, x_val=x_val, y_val_categorical=y_val_categorical)
# Checkpoint callback: with save_best_only=True the file is only overwritten
# when the monitored quantity improves (Keras monitors val_loss by default).
checkpointer2 = ModelCheckpoint(filepath='model_2.imdb.hdf5', verbose=1, save_best_only=True)

In [None]:
# Train model 2 on the training split, validating on the held-out split,
# then report accuracy on the test set.
# epochs=20
epochs = 3
model_2.fit(
    x_train,
    y_train_categorical,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val_categorical),
    callbacks=[plot_losses2, checkpointer2],
)
scores_2 = model_2.evaluate(x_test, y_test_categorical, verbose=2)
_accuracy_pct_2 = scores_2[1] * 100
print("Accuracy: %.2f%%" % (_accuracy_pct_2,))

In [None]:
# Weight analysis of the output layer.
# model_2.get_weights() is one flat list across all layers, so entries 0 and 1
# are the Embedding matrix and the first Dense kernel, not a (kernel, bias)
# pair. Asking the 'salida' Dense layer directly returns exactly [kernel, bias].
weights, biases = model_2.get_layer('salida').get_weights()
print(weights.shape)
print(biases)

In [None]:
# Restore the weights saved by the checkpoint callback and score them on the test set.
model_2.load_weights('model_2.imdb.hdf5')
score_2 = model_2.evaluate(x_test, y_test_categorical, verbose=0)
for _label, _value in zip(("loss: ", "accuracy: "), score_2):
    print(_label, _value)

In [None]:
# Callbacks for model 3 (ModelCheckpoint is imported in the top import cell).
# Plotting callback: pass the validation inputs together with their own
# one-hot labels, as the callbacks for models 1 and 2 do. The previous
# pairing (x_train with y_test) mixed two different splits of different
# length and used non-one-hot labels.
plot_losses3 = PlotLosses(plot_interval=1, evaluate_interval=20, x_val=x_val, y_val_categorical=y_val_categorical)
# Checkpoint callback: with save_best_only=True the file is only overwritten
# when the monitored quantity improves (Keras monitors val_loss by default).
checkpointer3 = ModelCheckpoint(filepath='model_3.imdb.hdf5', verbose=1, save_best_only=True)

In [None]:
def get_model_3(input_shape, output_size, lr=0.1):
    """Build and compile a feed-forward classifier with one hidden layer.

    Args:
        input_shape: Number of input features per sample (forwarded to the
            first layer as ``input_dim``).
        output_size: Number of output units. ``1`` builds a single sigmoid
            unit trained with binary cross-entropy; any other value builds a
            softmax layer trained with categorical cross-entropy (one-hot
            targets with ``output_size`` columns).
        lr: Learning rate of the SGD optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    single_output = output_size == 1
    model = Sequential()
    # The input size belongs on the first layer so the model is built
    # immediately and summary() works.
    # NOTE(review): the all-zeros hidden kernel is kept from the original;
    # consider a random initializer.
    model.add(Dense(800, input_dim=input_shape, activation='sigmoid', kernel_initializer='zeros', name='middle'))
    # Exactly one output layer: the original added it twice (one of the two
    # lines was syntactically broken).
    model.add(Dense(
        output_size,
        activation='sigmoid' if single_output else 'softmax',
        kernel_initializer='normal',
        name='Salida',
    ))
    model.compile(
        loss='binary_crossentropy' if single_output else 'categorical_crossentropy',
        optimizer=optimizers.SGD(lr=lr),
        metrics=['accuracy'],
    )

    return model

# Use num_classes outputs so the model matches the one-hot labels it is trained on.
model_3 = get_model_3(x_train.shape[1], num_classes)
model_3.summary()

In [None]:
# Train model 3.
# Validate on the held-out validation split, not on the test set: the
# checkpoint callback keeps the "best" epoch according to the validation
# data, so validating on x_test would select the model using test data.
model_3.fit(
    x_train,
    y_train_categorical,
    epochs=50,
    batch_size=batch_size,
    validation_data=(x_val, y_val_categorical),
    callbacks=[plot_losses3, checkpointer3],
)
# The test set is touched only once, for the final report.
scores_3 = model_3.evaluate(x_test, y_test_categorical, verbose=2)
print("Accuracy: %.2f%%" % (scores_3[1] * 100))

Ejemplo:
<img src="images/training_IMDB.png" alt="Drawing" style="width:100%;"/>