In [1]:
from keras.datasets import imdb
from keras.utils.data_utils import get_file
import _pickle as pickle
import numpy as np
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
idx = imdb.get_word_index()
idx_arr = sorted(idx, key=idx.get)
idx2word = {v: k for k, v in idx.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [3]:
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
f = open(path, 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl


In [4]:
len(x_train)

25000

In [5]:
def display_review_at(index):
    print(' '.join([idx2word[o] for o in x_train[index]]))

In [6]:
vocab_size = 5000

trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [7]:
lens = np.array(list(map(len, trn)))
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

In [8]:
seq_len = 500

trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [9]:
' '.join([(idx2word[o] if o!=0 else '') if o!=vocab_size-1 else '####' for o in trn[0]])

"                                                                                                                                                                                                                                                                                                                                                                          #### high is a cartoon comedy it ran at the same time as some other #### about school life such as #### my 35 years in the #### #### lead me to believe that #### #### satire is much closer to reality than is #### the #### to survive #### the #### students who can see right through their pathetic #### #### the #### of the whole situation all remind me of the #### i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately #### at high a classic line inspector i'm here to #### one of your #### student welcome to #### high i expect that many adults of my age think that #

In [10]:
trn.shape

(25000, 500)

## Fully connected

In [11]:
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [12]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [13]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

kwargs passed to function are ignored with Tensorflow backend


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7feabfda2fd0>

## Convolutional

In [14]:
from keras.layers.convolutional import Conv1D, MaxPooling1D

In [15]:
conv1 = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [16]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [18]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

kwargs passed to function are ignored with Tensorflow backend


Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fea78706fd0>

### Examine results so far

In [17]:
trn_predictions = conv1.predict(trn)
test_predictions = conv1.predict(test)

kwargs passed to function are ignored with Tensorflow backend


In [18]:
scores = sorted([ [val[0], idx] for idx,val in enumerate(test_predictions) ])

A very positive review

In [19]:
display_review_at(scores[-1][1])

the only thing that an inconvenient truth proves is that al gore is still an idiot these unchallenged experts are unchallenged because a response to their inane hypotheses is generally beneath real science this is mostly false science folks the greatest source of greenhouse gases co2 is people we exhale it and unless you're willing to start sacrificing your brethren to save the world there's not a darn thing to be done we've heard how the world was going to end as the result of man for more than 50 years fools publish a time line for their doomsday and when the time passes nothing has happened an inconvenient truth is just another vehicle with which a disingenuous faction of american society can peddle their poop br br and as to al leaving the tobacco business because of his sister's death from cancer that is a load too al couldn't run his farm any better than he could run the country he was losing money on the operation because he didn't care to farm when he could make more on speakin

And a very negative one

In [20]:
display_review_at(scores[0][1])

brutal emotionless michael myers stabs his sister to death at age six on halloween night in 1963 on october 30 1978 he escapes from a mental institution and institutes a new reign of terror in his hometown of haddonfield illinois he is pursued the whole time by a psychiatrist donald pleasence who knows just how evil this young man is br br it opens with a bang and sets up a genuinely suspenseful and atmospheric chiller that is actually superior to the many slasher pictures it helped to inspire it's subtle compared to the nasty bloodbaths many of those subsequent movies were subtle and scary it retains the ability to make me jump even after repeated viewings how many movies are there really that can continue to be frightening even after one has seen them before not very many br br pleasence is great in what was probably the definitive role of his career jamie lee curtis in her motion picture debut became a bona fide scream queen after acting in halloween as well as a few subsequent slas

A mixed review

In [21]:
display_review_at([i for i in scores if i[0] > 0.5][0][1])

okay this film is about bedknobs and broomsticks it's one of the most charming delightful movies you'll ever see as a kid it's the unforgettable movie about two adults and two spunky kids on an adventure for fun it may be a little deniable to watch but try it i neither my mother didn't think it was bad i was very enthused with the movie and the animation they were all quite good br br it is a delightfully wondrous comedy for the whole family to enjoy even the kids ages 7 years and up will enjoy this wonderful musical comedy with you and your family especially the animation the animation movements and layouts are really nice and deserve a thumbs up it's a terrifically good musical for the whole family so what are you waiting for go to the video store and rent bedknobs and broomsticks now


## Pretrained vectors

In [22]:
import bcolz
import re
from numpy.random import normal

In [23]:
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

def load_array(fname):
    return bcolz.open(fname)[:]

In [24]:
vecs, words, wordidx = load_vectors('/data/glove/results/6B.50d')

In [25]:
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [26]:
emb = create_emb()

In [27]:
conv1 = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [28]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [53]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

kwargs passed to function are ignored with Tensorflow backend


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fea3156ceb8>

In [1]:
model.layers[0].trainable=True
model.optimizer.lr=1e-4

NameError: name 'model' is not defined

In [56]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=1, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fea31507208>

In [58]:
model.save_weights('/data/trained_models/imdb_sentiment/glove50.h5')

## LSTM

In [34]:
from keras.regularizers import l2
from keras.layers import LSTM

model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, mask_zero=True, weights=[emb],
              embeddings_regularizer=l2(1e-6)),
    Dropout(0.2),
    LSTM(100),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [35]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=1, batch_size=64)

kwargs passed to function are ignored with Tensorflow backend


Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fa07cd7d198>