In [105]:
%matplotlib inline
import os
import pickle
import re

import numpy as np
from keras import backend as K
from keras.datasets import imdb
from keras.layers import Flatten, \
    Dense, \
    Embedding, \
    BatchNormalization, \
    Dropout, \
    Input, \
    Convolution1D, \
    MaxPooling1D
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

from utils import load_array


## Prepare Dataset

### Load Word Index

In [2]:
# word -> integer id mapping shipped with the IMDB dataset.
word_index = imdb.get_word_index()
# Reverse mapping (id -> word). `.items()` works on both Python 2 and 3,
# whereas `.iteritems()` exists only on Python 2.
idx_word = {idx: word for word, idx in word_index.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.pkl


### Load IMDB Dataset

In [3]:
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# relies on the download being the trusted file identified by the md5 above.
# The `with` block closes the file handle once the data has been read.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl


#### Sample of first 100 words

In [39]:
# Decode the first 100 word ids of the first training review back into text.
' '.join(idx_word[word_id] for word_id in x_train[0][:100])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high"

### Data munging for training

In [4]:
# Number of distinct word ids kept; used to clip review ids and as the
# Embedding layers' input_dim.
vocab_size = 5000
# Embedding output dimension for the models trained from scratch.
n_factors = 50
# Every review is padded/truncated to this many tokens (pad_sequences maxlen
# and the Embedding layers' input_length).
review_length = 500

In [5]:
def massage_data(review_list, max_index=None):
    """Cap the word ids of every review at a maximum id.

    Args:
        review_list: iterable of reviews, each a sequence of integer word ids.
        max_index: largest id to keep; any id above it is replaced by it.
            When omitted, the module-level ``vocab_size - 1`` is used, which
            is the original behaviour.

    Returns:
        A list containing one numpy array per review, in the input order.
    """
    if max_index is None:
        max_index = vocab_size - 1
    return [np.minimum(np.array(review), max_index) for review in review_list]

In [6]:
# Clip the ids of the training and test reviews to the vocabulary size.
trn, val = massage_data(x_train), massage_data(x_test)

In [7]:
# Bring every review to exactly `review_length` tokens.
trn, val = (pad_sequences(reviews, maxlen=review_length)
            for reviews in (trn, val))

## Create Basic Model

In [140]:
# Baseline: learned embedding, a Dense layer applied to the embedded
# sequence, then flatten -> dropout -> a single sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=n_factors,
                    input_length=review_length))
model.add(Dense(32, activation='relu'))
model.add(Flatten())
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [113]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_10 (Embedding)         (None, 500, 50)       250000      embedding_input_8[0][0]          
____________________________________________________________________________________________________
dense_16 (Dense)                 (None, 500, 32)       1632        embedding_10[0][0]               
____________________________________________________________________________________________________
flatten_9 (Flatten)              (None, 16000)         0           dense_16[0][0]                   
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 16000)         0           flatten_9[0][0]                  
___________________________________________________________________________________________

In [141]:
# Binary sentiment target -> binary cross-entropy, tracked with accuracy.
model.compile(
    optimizer=Adam(lr=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [142]:
# One epoch on the training reviews, validating on the test reviews.
model.fit(
    trn, labels_train,
    batch_size=64,
    nb_epoch=1,
    validation_data=(val, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fc4c2880fd0>

In [143]:
# Update the optimizer's learning-rate variable in place. Rebinding
# `model.optimizer.lr` to a Python float is not picked up by a training
# function that an earlier fit() call has already built, so the new rate
# would silently not be applied.
K.set_value(model.optimizer.lr, 0.1)
model.fit(trn, labels_train, batch_size=64, nb_epoch=1, validation_data=(val, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fc4c2880d90>

## Single Conv Layer

In [14]:
# Learned embedding followed by one 1D convolution (32 filters, width 3),
# then dropout -> dense -> batch norm -> flatten -> dropout -> sigmoid.
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=n_factors,
                    input_length=review_length))
model.add(Convolution1D(nb_filter=32, filter_length=3))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [136]:
# Same optimizer, loss and metric as the basic model.
model.compile(
    optimizer=Adam(lr=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [137]:
# One epoch with a batch size of 128, validating on the test reviews.
model.fit(
    trn, labels_train,
    batch_size=128,
    nb_epoch=1,
    validation_data=(val, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f544b2a8d90>

In [140]:
# Update the optimizer's learning-rate variable in place. Rebinding
# `model.optimizer.lr` to a Python float is not picked up by a training
# function that an earlier fit() call has already built, so the new rate
# would silently not be applied.
K.set_value(model.optimizer.lr, 0.1)
model.fit(trn, labels_train, batch_size=128, nb_epoch=1, validation_data=(val, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f544ac6b550>

## GloVe Embeddings

In [19]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: dataset name such as '6B.50d'; it is used both as the cached
            file name and to build the download URL.

    Returns:
        Whatever ``get_file`` returns for the downloaded, untarred archive.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Plain-Python equivalent of `mkdir -p`: unlike the IPython magic it also
    # works when this function is moved to a module or run outside a notebook.
    if not os.path.isdir(glove_path):
        os.makedirs(glove_path)
    # Unknown dataset names have no checksum, so md5_hash is None for them.
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [17]:
def load_vectors(loc):
    """Load the three files stored under the common path prefix `loc`.

    Args:
        loc: path prefix; '.dat', '_words.pkl' and '_idx.pkl' are appended.

    Returns:
        A 3-tuple: the result of ``load_array(loc + '.dat')``, the object
        unpickled from ``loc + '_words.pkl'`` and the object unpickled from
        ``loc + '_idx.pkl'``.
    """
    vectors = load_array(loc + '.dat')
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files from a trusted source. The `with` blocks close each file handle.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_idx = pickle.load(idx_file)
    return (vectors, words, word_idx)

In [46]:
# Fetch the '6B.50d' GloVe files and load the vectors plus both lookups.
glove_vecs, glove_words, glove_word_idx = load_vectors(
    get_glove_dataset('6B.50d'))

Untaring file...


In [52]:
def create_emb():
    """Build the initial embedding matrix for the IMDB vocabulary from GloVe.

    Reads the module-level ``vocab_size``, ``idx_word``, ``glove_vecs`` and
    ``glove_word_idx``.

    Returns:
        A ``(vocab_size, n_fac)`` array where ``n_fac`` is the GloVe vector
        width. Row 0 is all zeros, rows for words found in GloVe hold the
        GloVe vector, and every other row (including the last one) is drawn
        from a normal distribution with scale 0.6.
    """
    # Take the width from the loaded GloVe matrix; the previous `vecs` name is
    # not defined anywhere in this notebook and fails on a fresh kernel.
    n_fac = glove_vecs.shape[1]
    # zeros rather than empty: the loop starts at 1, so row 0 would otherwise
    # keep whatever happened to be in memory (pad_sequences pads with id 0 by
    # default, so that row is looked up for every padded position).
    emb = np.zeros((vocab_size, n_fac))

    for idx in range(1, vocab_size):
        word = idx_word[idx]
        # Only plain alphanumeric/hyphen words are looked up in GloVe; words
        # GloVe does not contain fall back to random init instead of raising
        # KeyError.
        if word and re.search(r'^[a-zA-Z0-9\-]*$', word) and word in glove_word_idx:
            emb[idx] = glove_vecs[glove_word_idx[word]]
        else:
            emb[idx] = np.random.normal(scale=0.6, size=(n_fac,))
    # The last id is the one massage_data clips every out-of-range id to, so
    # it is always randomly initialised.
    emb[-1] = np.random.normal(scale=0.6, size=(n_fac,))
    return emb

In [53]:
# Embedding weights passed to the Embedding layers below via weights=[emb].
emb = create_emb()

In [144]:
# First GloVe model: frozen pre-trained embedding -> conv -> max-pool -> dense.
# The embedding width comes from `emb` itself (the `vecs` name used before is
# not defined in this notebook), so it always matches the supplied weights.
# NOTE(review): `model` is reassigned by the next cell before being compiled
# or trained, so this definition is currently unused.
model = Sequential([Embedding(input_dim=vocab_size, output_dim=emb.shape[1], input_length=review_length, 
                              weights=[emb], dropout=0.2, trainable=False),
                    Convolution1D(nb_filter=64, filter_length=5, activation='relu'),
                    MaxPooling1D(),
                    Flatten(),
                    Dense(100, activation='relu'),
                    Dropout(0.7),
                    Dense(1, activation='sigmoid')])

In [150]:
# GloVe model with extra dropout around a 'same'-padded convolution.
# The embedding width comes from `emb` itself (the `vecs` name used before is
# not defined in this notebook), so it always matches the supplied weights.
model = Sequential([Embedding(vocab_size, emb.shape[1], input_length=review_length, dropout=0.2, 
                              weights=[emb], trainable=False),
                    Dropout(0.25),
                    Convolution1D(64, 5, border_mode='same', activation='relu'),
                    Dropout(0.25),
                    MaxPooling1D(),
                    Flatten(),
                    Dense(100, activation='relu'),
                    Dropout(0.7),
                    Dense(1, activation='sigmoid')])

In [151]:
# Use binary cross-entropy like the other two models in this notebook: the
# output is a single sigmoid unit for a binary label, and sharing the loss
# keeps the reported losses comparable across sections (this was 'mse').
model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [152]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_20 (Embedding)         (None, 500, 50)       250000      embedding_input_20[0][0]         
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 500, 50)       0           embedding_20[0][0]               
____________________________________________________________________________________________________
convolution1d_17 (Convolution1D) (None, 500, 64)       16064       dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 500, 64)       0           convolution1d_17[0][0]           
___________________________________________________________________________________________

In [153]:
# One epoch with the default batch size, validating on the test reviews.
model.fit(
    trn, labels_train,
    nb_epoch=1,
    validation_data=(val, labels_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fc4c206ba50>

In [156]:
# Update the optimizer's learning-rate variable in place. Rebinding
# `model.optimizer.lr` to a Python float is not picked up by a training
# function that an earlier fit() call has already built, so the new rate
# would silently not be applied.
K.set_value(model.optimizer.lr, 0.1)
model.fit(trn, labels_train, nb_epoch=2, validation_data=(val, labels_test))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc4c1bc3fd0>