In [3]:
from keras.layers import Embedding, CuDNNLSTM, Bidirectional, Dense, CuDNNGRU
from keras.initializers import Constant
from keras import Sequential
import keras.backend as K
import numpy as np
from keras.backend import epsilon

EMBEDDING_DIM = 300
INPUT_LENGTH = 1000


class Model:
    def __init__(self, tokenizer, embedding_matrix,
                 rnn_units=50,
                 bidirectional=True,
                 rnn_type='lstm',
                 show_summary=True):
        rnn_types = {
            'lstm': CuDNNLSTM,
            'gru': CuDNNGRU
        }
        rnn_type = rnn_types[rnn_type]

        # load pre-trained word embeddings into an Embedding layer
        # note that we set trainable = False so as to keep the embeddings fixed
        num_words = len(tokenizer.word_index) + 1
        embedding_layer = Embedding(num_words,
                                    EMBEDDING_DIM,
                                    embeddings_initializer=Constant(embedding_matrix),
                                    input_length=INPUT_LENGTH,
                                    trainable=False)
        model = Sequential()
        model.add(embedding_layer)
        if bidirectional:
            model.add(Bidirectional(rnn_type(rnn_units)))
        else:
            model.add(rnn_type(rnn_units))
        model.add(Dense(num_words, activation='softmax'))
        if show_summary:
            model.summary()

        self.model = model
        self.tokenizer = tokenizer

    def train(self, X, y, epochs=5, batch_size=32, callbacks=[]):
        model = self.model
        # compile network
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[_perplexity])
        # fit network
        model.fit(X, y, 
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=1, 
                  shuffle=True,
                  validation_split=0.2,
                  callbacks=callbacks)

    def predict(self, first_word, n_words, B=3):
        tokenizer = self.tokenizer
        model = self.model
        in_text, result = first_word, first_word
        encoded = get_encoded(in_text, tokenizer)
        beam_sequences_scores = [[[encoded], 0]]

        while len(result) < n_words:
            all_candidates = []
            beam_sequences_scores = beam_step(beam_sequences_scores)
            assert len(beam_sequences_scores) == B
            for seq_score in beam_sequences_scores:
                seq_scores = beam_step(seq_score)
                all_candidates.append(seq_scores)
            beam_sequences_scores = sorted(all_candidates, reverse=True, key=lambda seq, score: score)[:B]
            assert len(beam_sequences_scores) == B
            result, _= beam_sequences_scores[0]
        
        words = [get_word(token) for token in result]
        return ' '.join(words)
        
def beam_step(beam_sequences_scores):            
    all_candidates = []
    for seq, score in beam_sequences_scores: # for each sequence
        # predict top B words
        words_probs = model.predict_proba(seq, verbose=0)
        words_probs_sorted = -np.sort(-words_probs) # sorting in descending order
        top_b_words_probs = words_probs_sorted[:B] # top B words with max probability
        # for each prob in top B words, create a candidate
        for prob in top_b_words_probs: 
            word_token = list(top_b_words_probs).index(prob)
            candidate = [seq + [word_token], score + np.log(prob + epsilon())] # todo: word_token 
            all_candidates.append(candidate)
    # take candidates with max score
    beam_sequences_scores = sorted(all_candidates, reverse=True, key=lambda tup: tup[1])[:B]
    return beam_sequences_scores

def get_encoded(text, tokenizer):
    encoded = self.tokenizer.texts_to_sequences([text])[0]
    encoded = np.array(encoded)
    return encoded

def get_word(index):
    for word, idx in self.tokenizer.word_index.items():
         if idx == index:
            return word

def _perplexity(y_true, y_pred):
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    perplexity = K.pow(2.0, cross_entropy)
    return perplexity


Using TensorFlow backend.


In [6]:
import warnings
warnings.filterwarnings("ignore")
import src.dataset as ds
import numpy as np
from src.embeddings import extract_embedding_weights
from keras.layers import Embedding, CuDNNLSTM, Bidirectional, Dense, CuDNNGRU
from keras.initializers import Constant
from keras import Sequential
from tensorflow.python.keras.callbacks import TensorBoard
# from src.model import Model

from time import time
import tensorflow as tf


In [7]:
X, y, tokenizer = ds.load_tokenized_data()
embedding_matrix = extract_embedding_weights()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 357.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 364.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 364.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 360.39it/s]


In [9]:
model = Model(tokenizer, embedding_matrix)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         2251800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               140800    
_________________________________________________________________
dense_1 (Dense)              (None, 7506)              758106    
Total params: 3,150,706
Trainable params: 898,906
Non-trainable params: 2,251,800
_________________________________________________________________


In [10]:
model.train(X,y, epochs=1, batch_size=20)

ValueError: Error when checking input: expected embedding_1_input to have shape (1000,) but got array with shape (1,)