# Import

In [1]:
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Softmax, Dropout
from keras.layers import SimpleRNN, LSTM, Embedding, Bidirectional, GlobalAveragePooling1D
from keras.utils import to_categorical

import numpy as np
import os

Using TensorFlow backend.


# Constants

In [2]:
MAX_SEQUENCE_LENGTH = 200
VOCAB_SIZE = 10000
EMBEDDING_DIM = 25

# Data Loading

In [6]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=VOCAB_SIZE,
                                                         skip_top=0,
                                                         maxlen=MAX_SEQUENCE_LENGTH,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)
vocab = reuters.get_word_index(path="reuters_word_index.json")
word2idx = {k: vocab[k] for k in vocab if vocab[k] < VOCAB_SIZE}
word2idx


{'woods': 8803,
 'hermann': 8804,
 'heublein': 8352,
 'four': 185,
 'grains': 1642,
 'wednesday': 1220,
 'duffour': 7593,
 'elections': 3914,
 '270': 2563,
 '271': 3551,
 '272': 5113,
 '273': 3552,
 '274': 3400,
 'rudman': 7975,
 '276': 3401,
 '277': 3478,
 '278': 3632,
 '279': 4309,
 'dormancy': 9381,
 'errors': 7247,
 'deferred': 3086,
 'cooking': 8805,
 'nawg': 7972,
 'affiliates': 2891,
 'china': 595,
 'affiliated': 3189,
 'climbed': 3028,
 'controversy': 6693,
 'millimetres': 9382,
 'golden': 4007,
 'projection': 5689,
 "hudson's": 7903,
 'lme': 2394,
 'therefore': 1984,
 'distortions': 6959,
 'meteorologist': 8806,
 'loss': 43,
 'exco': 9383,
 'nakasone': 1267,
 "india's": 3633,
 'wang': 3029,
 'want': 850,
 'absolute': 7973,
 'travel': 4677,
 'cutback': 6422,
 'modest': 1858,
 'welcomed': 2461,
 'fit': 4205,
 'bringing': 1916,
 'fix': 4819,
 '624': 6164,
 'wales': 6165,
 'fin': 8807,
 'effects': 1788,
 'undeveloped': 8808,
 'allan': 6960,
 '393': 3891,
 '392': 4008,
 '391': 4206

In [4]:
X_train = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, value=0)
X_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH, value=0)

# Pre-trained embeddings

In [5]:
embeddings_index = {}
f = open(os.path.join('glove/', 'glove.twitter.27B.25d.txt'))
# These are GloVe embeddings trained on 6 billion words! These are 25 element vectors (2Gb)
# (but they have 100, 300 and 500 elements long)
# Can use FastText embeddings instead
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/fahim/Downloads/glove/glove.twitter.27B.25d.txt'

In [7]:
found = 0
embedding_matrix = np.zeros((len(word2idx) + 1, EMBEDDING_DIM)) # Initializing random will be better than zeros
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        found += 1

print("Loaded %d/%d pre-trained vectors"%(found, len(word2idx)))
embedding_matrix.shape
embedding_matrix

Loaded 0/9999 pre-trained vectors


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model Definition

In [8]:
model = Sequential()
model.add(Embedding(output_dim=EMBEDDING_DIM, 
                    input_dim=VOCAB_SIZE, 
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=[embedding_matrix], # Additionally we give the Wi
                    trainable=False)) # Don't train the embeddings - just use GloVe embeddings
# We can start with pre-trained embeddings and then fine-tune them using our data by setting trainable to True
model.add(SimpleRNN(100, activation='relu'))
model.add(Dense(46))
model.add(Softmax())

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [9]:
model.fit(X_train, to_categorical(y_train), epochs=5, validation_split=0.05)

Train on 6722 samples, validate on 354 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1834228c88>

In [12]:
loss, acc = model.evaluate(X_test, to_categorical(y_test))
print("Test accuracy: %0.2f%%"%(acc*100))

Test accuracy: 52.37%


Accuracy increased from 40% to 52% by just using GloVe embeddings