In [None]:
from keras.datasets import imdb

In [None]:
# load_data() hands back two (data, labels) pairs: the training split first, then the test split.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000
)

In [None]:
# In the above data set, we are keeping only the 10,000 most frequently used words from the reviews.
# A notebook cell only echoes its final expression, so the first sample and its
# label are printed explicitly; the last line reports the largest word index present.
print(train_data[0])
print(train_labels[0])
max(max(seq) for seq in train_data)

In [None]:
def decode_review(encoded, word_idx=None):
    """Translate a sequence of encoded word indices back into text.

    Parameters
    ----------
    encoded : iterable of int
        Word indices of one review, as stored in the loaded data splits.
    word_idx : dict, optional
        Mapping of word -> index. When omitted it is fetched with
        ``imdb.get_word_index()``; pass an already-loaded mapping to avoid
        fetching it again on every call.

    Returns
    -------
    str
        The decoded words joined by single spaces. Any index without a
        matching word is rendered as ``'?'``.
    """
    if word_idx is None:
        word_idx = imdb.get_word_index()

    # Invert the mapping so that words can be looked up by index.
    reverse_word_idx = {index: word for word, index in word_idx.items()}

    # We offset the index by 3 as 0, 1, and 2 are reserved for padding, start of
    # sequence, and unknown.
    return ' '.join(reverse_word_idx.get(i - 3, '?') for i in encoded)

In [None]:
# Show the first training review both as raw word indices and as decoded text.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(train_data[0])
print(decode_review(train_data[0]))

In [None]:
# Multi-hot encode each review's list of word indices into one row of a 2-D tensor
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    """Build a 0/1 indicator matrix from sequences of integer indices.

    Row ``k`` of the result holds 1.0 in every column listed in
    ``sequences[k]`` and 0.0 everywhere else.

    Parameters
    ----------
    sequences : sequence of sequences of int
        One list of column indices per output row.
    dimension : int, optional
        Number of columns in the result (default 10000).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)``.
    """
    shape = (len(sequences), dimension)
    encoded = np.zeros(shape)
    for row, indices in enumerate(sequences):
        # Index-array assignment sets every listed column of this row at once;
        # repeated indices simply write the same 1.0 again.
        encoded[row, indices] = 1.
    return encoded

In [None]:
# Encode the training split first, then the test split, with the same helper.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

In [None]:
# Peek at the encoded vector of the first training sample
x_train[0]

In [None]:
# Turn the labels into float32 arrays for training and evaluation.
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')

In [None]:
# Define the model
from keras import models
from keras import layers

In [None]:
# Two 16-unit ReLU hidden layers followed by a single sigmoid output unit.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training using string shortcuts for optimizer, loss and metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Alternatively, pass optimizer, loss, and metric objects instead of strings; this lets
# us configure the optimizer's parameters or supply a custom loss function.
from keras import optimizers
from keras import losses
from keras import metrics

# Compile with explicit optimizer / loss / metric objects.
# The learning rate is passed positionally: the keyword is `lr` in older Keras
# releases and `learning_rate` in newer ones, but it is the first parameter in both.
model.compile(
    optimizer=optimizers.RMSprop(0.001),
    loss=losses.binary_crossentropy,
    metrics=[metrics.binary_accuracy]
)

In [None]:
# Hold out the first 10000 training samples for validation; train on the rest.
x_val, partial_x_train = x_train[:10000], x_train[10000:]

In [None]:
# Split the labels at the same position (10000) as the features.
y_val, partial_y_train = y_train[:10000], y_train[10000:]

In [None]:
# Train for 20 epochs in batches of 512, scoring the held-out validation set
# after every epoch; the returned object records the per-epoch values.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
# List which quantities were recorded during training.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(history.history.keys())

In [None]:
# Pull the recorded per-epoch curves out of the training history.
history_dict = history.history
loss, val_loss, acc = (
    history_dict[key] for key in ('loss', 'val_loss', 'binary_accuracy')
)

# Epoch numbers for the x-axis, counted from 1.
epochs = range(1, len(acc) + 1)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Training loss as dots ('bo') and validation loss as a solid line ('b') per epoch.
fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Training accuracy as dots ('bo') and validation accuracy as a solid line ('b') per epoch.
acc_values = history_dict['binary_accuracy']
val_acc_values = history_dict['val_binary_accuracy']

fig, ax = plt.subplots()
# Plot the values fetched in this cell rather than relying on `acc` from an earlier cell.
ax.plot(epochs, acc_values, 'bo', label='Training acc')
ax.plot(epochs, val_acc_values, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
# The y-axis shows accuracy here, not loss.
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# The 20-epoch run above overfits, so build a fresh model and train it for only 4 epochs.
# Note: this trains on partial_x_train, i.e. without the 10,000 held-out validation samples.

model = models.Sequential()
model.add(layers.Dense(16,activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])

model.fit(partial_x_train, partial_y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)

# Echo the evaluation output; an assignment alone displays nothing in a notebook cell.
results

In [None]:
# Model output (from the final sigmoid unit) for every sample in the test set
model.predict(x_test)

In [None]:
# Encoded input vector of the test sample at index 1
x_test[1]

In [None]:
# From the above, the model is confident the test sample at index 1 is a positive review - let's decode it.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(decode_review(test_data[1]))

In [None]:
# Let's find the index of the first review predicted as clearly negative (score below 0.15)
predictions = model.predict(x_test)
for i,p in enumerate(predictions):
    if p < 0.15:
        print('prediction {} found at index {}'.format(p,i))
        break
else:
    # The loop ended without a break: nothing scored below the threshold, so `i`
    # is merely the last index and does not point at a negative review.
    print('no prediction below 0.15 found')

In [None]:
# Decode the review at index `i`, which was set by the search loop in the previous cell.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(decode_review(test_data[i]))