In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# load imdb data (includes word index and movie review)

In [2]:
from keras.datasets import imdb
idx = imdb.get_word_index()

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
   8192/1641221 [..............................] - ETA: 0s

In [8]:
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [12]:
idx2word = {v: k for k, v in idx.items()}
idx2word

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [13]:
import pickle

f = open('imdb_full.pkl', 'rb')
(x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [15]:
len(x_train), len(x_test)

(25000, 25000)

In [16]:
', '.join(map(str, x_train[0]))

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [17]:
idx2word[23022]

'bromwell'

In [18]:
' '.join([idx2word[o] for o in x_train[0]])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [19]:
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

### remove rare words

In [21]:
import numpy as np

vocab_size = 5000
trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [32]:
lens = np.array(list(map(len, trn)))
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

### truncated or pre-padding with zero

In [34]:
from keras.preprocessing import sequence

seq_len = 500

trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [35]:
trn.shape

(25000, 500)

# create simple model

In [49]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import Adam

In [47]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [50]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [51]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8630164048>

# CNN

In [57]:
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D

In [59]:
conv1 = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [60]:
conv1.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [61]:
conv1.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f85e6300668>

In [63]:
conv1.save_weights('conv1.h5')

In [64]:
conv1.load_weights('conv1.h5')

### using pre-trained embeddings

In [120]:
import bcolz

def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()

def load_array(fname):
    return bcolz.open(fname)[:]

def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb'), encoding='latin1'),
        pickle.load(open(loc+'_idx.pkl','rb'), encoding='latin1'))

vecs, words, wordidx = load_vectors('6B.50d')
vecs.shape

(400000, 50)

In [121]:
print(wordidx['the'])
print(vecs[wordidx['the']])

0
[  4.18000013e-01   2.49679998e-01  -4.12420005e-01   1.21699996e-01
   3.45270008e-01  -4.44569997e-02  -4.96879995e-01  -1.78619996e-01
  -6.60229998e-04  -6.56599998e-01   2.78430015e-01  -1.47670001e-01
  -5.56770027e-01   1.46579996e-01  -9.50950012e-03   1.16579998e-02
   1.02040000e-01  -1.27920002e-01  -8.44299972e-01  -1.21809997e-01
  -1.68009996e-02  -3.32789987e-01  -1.55200005e-01  -2.31309995e-01
  -1.91809997e-01  -1.88230002e+00  -7.67459989e-01   9.90509987e-02
  -4.21249986e-01  -1.95260003e-01   4.00710011e+00  -1.85939997e-01
  -5.22870004e-01  -3.16810012e-01   5.92130003e-04   7.44489999e-03
   1.77780002e-01  -1.58969998e-01   1.20409997e-02  -5.42230010e-02
  -2.98709989e-01  -1.57490000e-01  -3.47579986e-01  -4.56370004e-02
  -4.42510009e-01   1.87849998e-01   2.78489990e-03  -1.84110001e-01
  -1.15139998e-01  -7.85809994e-01]


In [122]:
import re

def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    notfound = 0
    
    for i in range(1,len(emb)):
        word = idx2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))
            notfound += 1

    print(notfound)
    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb
emb = create_emb()

85


In [123]:
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb], trainable=False),
    Dropout(0.2),
    Conv1D(64, 5, padding='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.6),
    Dense(1, activation='sigmoid')])

In [124]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [125]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=8, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f85b7f0ee80>

In [126]:
model.layers[0].trainable=True
model.optimizer.lr=1e-4
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=3, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f85b7e2acc0>

In [127]:
model.save_weights('glove50.h5')

# multi-size CNN

In [136]:
from keras.layers import Merge, Input
from keras.models import Model
from keras.layers.merge import concatenate

In [137]:
graph_in = Input ((vocab_size, 50))
convs = [ ] 
for fsz in range (3, 6): 
    x = Conv1D(64, fsz, padding='same', activation="relu")(graph_in)
    x = MaxPooling1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
out = concatenate(convs) 
graph = Model(graph_in, out)

In [138]:
emb = create_emb()

85


In [139]:
model = Sequential ([
    Embedding(vocab_size, 50, input_length=seq_len, weights=[emb]),
    Dropout (0.2),
    graph,
    Dropout (0.5),
    Dense (100, activation="relu"),
    Dropout (0.7),
    Dense (1, activation='sigmoid')
    ])

In [140]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [141]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f85ab11f668>

In [143]:
model.layers[0].trainable=False
model.optimizer.lr=1e-5
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f85af7c02b0>

In [146]:
graph.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 5000, 50)      0                                            
____________________________________________________________________________________________________
conv1d_25 (Conv1D)               (None, 5000, 64)      9664        input_5[0][0]                    
____________________________________________________________________________________________________
conv1d_26 (Conv1D)               (None, 5000, 64)      12864       input_5[0][0]                    
____________________________________________________________________________________________________
conv1d_27 (Conv1D)               (None, 5000, 64)      16064       input_5[0][0]                    
___________________________________________________________________________________________

In [147]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 500, 50)           250000    
_________________________________________________________________
dropout_40 (Dropout)         (None, 500, 50)           0         
_________________________________________________________________
model_2 (Model)              multiple                  38592     
_________________________________________________________________
dropout_41 (Dropout)         (None, 48000)             0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               4800100   
_________________________________________________________________
dropout_42 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 101       
Total para

# LSTM

In [150]:
from keras.layers import LSTM
from keras.regularizers import l2

In [154]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, mask_zero=True, embeddings_regularizer=l2(1e-6)),
    LSTM(100, implementation=2),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 500, 32)           160000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________


In [155]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=5, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8597f313c8>