# Baseline Models

In [1]:
import math
import random

import numpy as np

import gensim

from tensorflow.keras import utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import History



In [2]:
# Load the saved Word2Vec artefact; mmap='r' asks gensim to memory-map stored arrays read-only.
# NOTE(review): later cells read `w2v_model.wv`, so "w2v.model" presumably holds a full
# Word2Vec model rather than bare KeyedVectors — confirm how the file was saved.
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [3]:
# Unpack the shape of the Word2Vec vector matrix into (vocabulary size, embedding width).
# Both values size the Embedding and output layers of every model below.
# (`emdedding_size` is spelled this way throughout the notebook.)
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
# Bare last expression so the notebook displays the two sizes.
vocab_size, emdedding_size

(33831, 128)

In [4]:
# Load the model inputs saved as a NumPy array.
# presumably each row is a sequence of vocabulary ids — TODO confirm against the preprocessing step
x = np.load('data/x.npy')
# Load the stored targets and keep only their first column, so `y` has one value per row of `x`.
y = np.load('data/y.npy')[:,0]

In [5]:
# Sanity check: `x` and `y` must have the same number of rows.
x.shape, y.shape

((17628, 133), (17628,))

In [6]:
# instantiate history to save losses
# NOTE(review): this History instance is never passed to any `fit(...)` call visible in this
# notebook; the losses used in the Evaluation section come from the objects returned by `fit`.
history = History()

## Baseline #1: GRU

In [7]:
# Baseline #1: token ids -> embeddings learned from scratch -> two stacked GRU
# layers -> softmax over the whole vocabulary (next-word prediction).
gru = Sequential()

gru.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
# `input_shape` is deliberately not set here: this is not the first layer, so its
# input shape comes from the Embedding output, and (vocab_size, emdedding_size)
# would not be a valid (timesteps, features) shape anyway.
gru.add(GRU(256, return_sequences=True))  # emit every timestep for the next GRU
gru.add(Dropout(0.3))
gru.add(GRU(128))  # no return_sequences: only the final output is passed on
gru.add(Dense(vocab_size, activation='softmax'))

In [8]:
# Sparse categorical cross-entropy matches the 1-D target array `y`
# (presumably integer word ids rather than one-hot vectors — TODO confirm).
gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [9]:
# Checkpointing for the GRU baseline: keep only the weights with the lowest val_loss.
filepath = "weights/gru.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',            # lower validation loss is better
    save_best_only=True,   # overwrite the file only on improvement
    verbose=1,             # log every save
)
callbacks = [checkpoint]

In [10]:
# Train for up to 20 epochs with 20% of the samples held out for validation
# (14102 train / 3526 validation per the log below); the checkpoint callback saves the best weights.
# `gru_loss` is the object returned by `fit`; its `.history['val_loss']` is read in the Evaluation section.
gru_loss = gru.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.19822, saving model to weights/gru.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.19822 to 6.78132, saving model to weights/gru.hdf5
Epoch 3/20
Epoch 00003: val_loss improved from 6.78132 to 6.60920, saving model to weights/gru.hdf5
Epoch 4/20
Epoch 00004: val_loss improved from 6.60920 to 6.45769, saving model to weights/gru.hdf5
Epoch 5/20
Epoch 00005: val_loss improved from 6.45769 to 6.31160, saving model to weights/gru.hdf5
Epoch 6/20
Epoch 00006: val_loss improved from 6.31160 to 6.18405, saving model to weights/gru.hdf5
Epoch 7/20
Epoch 00007: val_loss improved from 6.18405 to 6.00096, saving model to weights/gru.hdf5
Epoch 8/20
Epoch 00008: val_loss improved from 6.00096 to 5.73432, saving model to weights/gru.hdf5
Epoch 9/20
Epoch 00009: val_loss improved from 5.73432 to 5.40945, saving model to weights/gru.hdf5
Epoch 10/20
Epoch 00010: val_loss improved from 5.40945

## Baseline #2: GRU + Word2Vec

In [11]:
# Baseline #2: same GRU stack as baseline #1, but the Embedding layer starts from
# the pretrained Word2Vec vectors instead of random weights.
gru_w2v = Sequential()

# The embedding is initialised from Word2Vec and is not frozen (no `trainable=False`).
# assumes the ids in `x` index rows of `w2v_model.wv.vectors` — TODO confirm
gru_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
# `input_shape` is deliberately not set here: this is not the first layer, so its
# input shape comes from the Embedding output, and (vocab_size, emdedding_size)
# would not be a valid (timesteps, features) shape anyway.
gru_w2v.add(GRU(256, return_sequences=True))  # emit every timestep for the next GRU
gru_w2v.add(Dropout(0.3))
gru_w2v.add(GRU(128))  # no return_sequences: only the final output is passed on
gru_w2v.add(Dense(vocab_size, activation='softmax'))

In [12]:
# Same loss and optimizer as the plain GRU baseline, so validation losses are comparable.
gru_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [13]:
# Checkpointing for GRU + Word2Vec: keep only the weights with the lowest val_loss.
# Note: `filepath`, `checkpoint` and `callbacks` are rebound here for this model.
filepath = "weights/gru_w2v.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',            # lower validation loss is better
    save_best_only=True,   # overwrite the file only on improvement
    verbose=1,             # log every save
)
callbacks = [checkpoint]

In [14]:
# Same training setup as the plain GRU (20 epochs, batch size 64, 20% validation split).
# `gru_w2v_loss` is the object returned by `fit`; its `.history['val_loss']` is read in the Evaluation section.
gru_w2v_loss = gru_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.42383, saving model to weights/gru_w2v.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.42383 to 7.01568, saving model to weights/gru_w2v.hdf5
Epoch 3/20
Epoch 00003: val_loss improved from 7.01568 to 6.13331, saving model to weights/gru_w2v.hdf5
Epoch 4/20
Epoch 00004: val_loss improved from 6.13331 to 5.37418, saving model to weights/gru_w2v.hdf5
Epoch 5/20
Epoch 00005: val_loss improved from 5.37418 to 4.71446, saving model to weights/gru_w2v.hdf5
Epoch 6/20
Epoch 00006: val_loss improved from 4.71446 to 4.14173, saving model to weights/gru_w2v.hdf5
Epoch 7/20
Epoch 00007: val_loss improved from 4.14173 to 3.68866, saving model to weights/gru_w2v.hdf5
Epoch 8/20
Epoch 00008: val_loss improved from 3.68866 to 3.34726, saving model to weights/gru_w2v.hdf5
Epoch 9/20
Epoch 00009: val_loss improved from 3.34726 to 3.08607, saving model to weights/gru_w2v.hdf5
Epoch 10/20
Epoch 0

## Baseline #3: LSTM 

In [15]:
# Baseline #3: token ids -> embeddings learned from scratch -> three stacked LSTM
# layers (each followed by dropout) -> softmax over the whole vocabulary.
lstm = Sequential()

lstm.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
# `input_shape` is deliberately not set here: this is not the first layer, so its
# input shape comes from the Embedding output, and (vocab_size, emdedding_size)
# would not be a valid (timesteps, features) shape anyway.
lstm.add(LSTM(256, return_sequences=True))  # emit every timestep for the next LSTM
lstm.add(Dropout(0.2))
lstm.add(LSTM(256, return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(256))  # no return_sequences: only the final output is passed on
lstm.add(Dropout(0.2))
lstm.add(Dense(vocab_size, activation='softmax'))

In [16]:
# Same loss and optimizer as the GRU baselines, so validation losses are comparable.
lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [17]:
# Checkpointing for the LSTM baseline: keep only the weights with the lowest val_loss.
# Note: `filepath`, `checkpoint` and `callbacks` are rebound here for this model.
filepath = "weights/lstm.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',            # lower validation loss is better
    save_best_only=True,   # overwrite the file only on improvement
    verbose=1,             # log every save
)
callbacks = [checkpoint]

In [None]:
# Same training setup as the GRU baselines (20 epochs, batch size 64, 20% validation split).
# `lstm_loss` is the object returned by `fit`; its `.history['val_loss']` is read in the Evaluation section.
lstm_loss = lstm.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.56309, saving model to weights/lstm.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.56309 to 7.40684, saving model to weights/lstm.hdf5
Epoch 3/20

## Baseline #4: LSTM + Word2Vec 

In [None]:
# Baseline #4: three stacked LSTM layers on top of an Embedding initialised from
# the pretrained Word2Vec vectors.
# NOTE(review): unlike the plain LSTM baseline, this stack has no Dropout layers —
# confirm that the difference is intentional before comparing the two.
lstm_w2v = Sequential()

# The embedding is initialised from Word2Vec and is not frozen (no `trainable=False`).
# assumes the ids in `x` index rows of `w2v_model.wv.vectors` — TODO confirm
lstm_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
# `input_shape` is deliberately not set here: this is not the first layer, so its
# input shape comes from the Embedding output, and (vocab_size, emdedding_size)
# would not be a valid (timesteps, features) shape anyway.
lstm_w2v.add(LSTM(256, return_sequences=True))  # emit every timestep for the next LSTM
lstm_w2v.add(LSTM(256, return_sequences=True))
lstm_w2v.add(LSTM(256))  # no return_sequences: only the final output is passed on
lstm_w2v.add(Dense(vocab_size, activation='softmax'))

In [None]:
# Same loss and optimizer as the other baselines, so validation losses are comparable.
lstm_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# Checkpointing for LSTM + Word2Vec: keep only the weights with the lowest val_loss.
# Note: `filepath`, `checkpoint` and `callbacks` are rebound here for this model.
filepath = "weights/lstm_w2v.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',            # lower validation loss is better
    save_best_only=True,   # overwrite the file only on improvement
    verbose=1,             # log every save
)
callbacks = [checkpoint]

In [None]:
# Same training setup as the other baselines (20 epochs, batch size 64, 20% validation split).
# `lstm_w2v_loss` is the object returned by `fit`; its `.history['val_loss']` is read in the Evaluation section.
lstm_w2v_loss = lstm_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

## Generation

In [None]:
def sample(preds, top_k):
    """Pick the next token id uniformly at random among the ``top_k`` best.

    The scores only decide which ids are candidates; every candidate is then
    equally likely to be drawn.

    Args:
        preds: 1-D NumPy array of scores, one per vocabulary id.
        top_k: how many of the highest-scoring ids are eligible. Values larger
            than ``len(preds)`` are clamped to ``len(preds)``.

    Returns:
        The chosen id (an index into ``preds``).

    Raises:
        ValueError: if ``top_k`` is smaller than 1 or ``preds`` is empty.
    """
    # Clamp so the random index below can never run past the candidate list.
    k = min(top_k, len(preds))
    if k < 1:
        raise ValueError("top_k must be >= 1 and preds must be non-empty")

    # argsort is ascending, so the last k entries are the k highest scores.
    top_ids = preds.argsort()[-k:][::-1]

    return top_ids[random.randrange(k)]

In [None]:
def word_to_id(word):
    """Return the vocabulary index of ``word`` in the loaded Word2Vec model."""
    vocab_index = w2v_model.wv.key_to_index
    return vocab_index[word]

def id_to_word(id):
    """Return the word stored at vocabulary index ``id`` in the loaded Word2Vec model."""
    vocab_words = w2v_model.wv.index_to_key
    return vocab_words[id]

In [None]:
def generate(model=gru, prompt='In this paper', n=20, top_k=10):
    """Extend ``prompt`` by ``n`` words sampled from ``model``.

    The prompt is lower-cased, split on whitespace and mapped to vocabulary
    ids. At every step the whole id sequence so far is fed to the model as a
    single sequence and the next id is drawn with ``sample`` from the
    predicted scores.

    Args:
        model: object whose ``predict`` returns one row of next-word scores
            per input sequence. The default is bound to ``gru`` when this
            cell is executed, so re-run the cell after rebuilding ``gru``.
        prompt: seed text. Every word is looked up with ``word_to_id``, so a
            word missing from the vocabulary makes that lookup fail.
        n: number of words to generate.
        top_k: number of highest-scoring candidates to sample from.

    Returns:
        The lower-cased prompt followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: if words are requested (``n > 0``) but the prompt is empty.
    """
    word_ids = [word_to_id(word) for word in prompt.lower().split()]
    if n > 0 and not word_ids:
        raise ValueError("prompt must contain at least one word")

    for _ in range(n):
        # Shape (1, seq_len): one batch entry holding the whole sequence.
        # A flat 1-D array would be read as seq_len separate samples, so the
        # model would score the last word without any of its context.
        # NOTE(review): training sequences have a fixed length (x.shape[1]);
        # confirm whether prompts should be padded/truncated the same way.
        batch = np.array([word_ids])
        prediction = model.predict(x=batch, verbose=0)
        next_id = sample(prediction[0], top_k)
        word_ids.append(next_id)

    return ' '.join(id_to_word(w) for w in word_ids)

In [None]:
# Generate with all defaults: the `gru` model, prompt 'In this paper', 20 new words, top_k=10.
generate()

In [None]:
# 50 new words from the plain LSTM baseline, using the default prompt.
generate(model=lstm, n=50)

In [None]:
# LSTM + Word2Vec with a longer custom prompt; every prompt word must be in the vocabulary.
generate(model=lstm_w2v, prompt='In this paper we present a novel approach', n=20)

## Evaluation

In [None]:
# Maps each training history (the object returned by `fit`) to its display name.
# All four training cells must have been run before this cell.
models = {
    gru_loss: 'GRU',
    gru_w2v_loss: 'GRU + Word2Vec',
    lstm_loss: 'LSTM',
    lstm_w2v_loss: 'LSTM + Word2Vec',
}

In [None]:
def min_val_loss(model, max_epochs=50):
    """Return the minimum validation loss within the first ``max_epochs`` epochs.

    ``model`` must expose a ``history`` mapping with a ``'val_loss'`` sequence,
    i.e. it is the object returned by ``fit`` rather than the network itself.
    """
    val_losses = model.history['val_loss']
    considered = val_losses[:max_epochs]
    return min(considered)

In [None]:
# Report each model's best validation loss and the matching perplexity,
# computed as exp(loss) from the cross-entropy validation loss.
for fit_history, label in models.items():
    best_val_loss = min_val_loss(fit_history)
    print("Minimum validation loss for {}: {:.5f}".format(label, best_val_loss))
    print("Perplexity for model {}: {:.2f}\n".format(label, math.exp(best_val_loss)))