# Baseline Models

In [100]:
import math
import random

import numpy as np

import gensim

from tensorflow.keras import utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import History

In [2]:
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [3]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size, emdedding_size

(33831, 128)

In [4]:
x = np.load('data/x.npy')
y = np.load('data/y.npy')[:,0]

In [5]:
x.shape, y.shape

((17628, 133), (17628,))

In [6]:
# instantiate history to save losses
history = History()

## Baseline #1: GRU

In [35]:
gru = Sequential()

gru.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
gru.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru.add(Dropout(0.3))
gru.add(GRU(128))
gru.add(Dense(vocab_size, activation='softmax'))

In [36]:
gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [37]:
filepath = "weights/gru.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [38]:
gru_loss = gru.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.35866, saving model to weights/gru.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.35866 to 6.63899, saving model to weights/gru.hdf5
Epoch 3/20
Epoch 00003: val_loss improved from 6.63899 to 6.54964, saving model to weights/gru.hdf5
Epoch 4/20
Epoch 00004: val_loss improved from 6.54964 to 6.49495, saving model to weights/gru.hdf5
Epoch 5/20
Epoch 00005: val_loss improved from 6.49495 to 6.38037, saving model to weights/gru.hdf5
Epoch 6/20
Epoch 00006: val_loss improved from 6.38037 to 6.20430, saving model to weights/gru.hdf5
Epoch 7/20
Epoch 00007: val_loss improved from 6.20430 to 6.02909, saving model to weights/gru.hdf5
Epoch 8/20
Epoch 00008: val_loss improved from 6.02909 to 5.77966, saving model to weights/gru.hdf5
Epoch 9/20
Epoch 00009: val_loss improved from 5.77966 to 5.68209, saving model to weights/gru.hdf5
Epoch 10/20
Epoch 00010: val_loss improved from 5.68209

## Baseline #2: GRU + Word2Vec

In [11]:
gru_w2v = Sequential()

gru_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
gru_w2v.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru_w2v.add(Dropout(0.3))
gru_w2v.add(GRU(128))
gru_w2v.add(Dense(vocab_size, activation='softmax'))

In [12]:
gru_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [13]:
filepath = "weights/gru_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [14]:
gru_w2v_loss = gru_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.06587, saving model to weights/gru_w2v.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.06587 to 6.64127, saving model to weights/gru_w2v.hdf5
Epoch 3/20
Epoch 00003: val_loss improved from 6.64127 to 5.66575, saving model to weights/gru_w2v.hdf5
Epoch 4/20
Epoch 00004: val_loss improved from 5.66575 to 4.91596, saving model to weights/gru_w2v.hdf5
Epoch 5/20
Epoch 00005: val_loss improved from 4.91596 to 4.31876, saving model to weights/gru_w2v.hdf5
Epoch 6/20
Epoch 00006: val_loss improved from 4.31876 to 3.83396, saving model to weights/gru_w2v.hdf5
Epoch 7/20
Epoch 00007: val_loss improved from 3.83396 to 3.46584, saving model to weights/gru_w2v.hdf5
Epoch 8/20
Epoch 00008: val_loss improved from 3.46584 to 3.22089, saving model to weights/gru_w2v.hdf5
Epoch 9/20
Epoch 00009: val_loss improved from 3.22089 to 3.05204, saving model to weights/gru_w2v.hdf5
Epoch 10/20
Epoch 0

## Baseline #3: LSTM 

In [15]:
lstm = Sequential()

lstm.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
lstm.add(LSTM(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(256, return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(256))
lstm.add(Dropout(0.2))
lstm.add(Dense(vocab_size, activation='softmax'))

In [16]:
lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [17]:
filepath = "weights/lstm.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [18]:
lstm_loss = lstm.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.57240, saving model to weights/lstm.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 7.57240 to 7.41155, saving model to weights/lstm.hdf5
Epoch 3/20
Epoch 00003: val_loss did not improve from 7.41155
Epoch 4/20
Epoch 00004: val_loss did not improve from 7.41155
Epoch 5/20
Epoch 00005: val_loss did not improve from 7.41155
Epoch 6/20
Epoch 00006: val_loss did not improve from 7.41155
Epoch 7/20
Epoch 00007: val_loss did not improve from 7.41155
Epoch 8/20
Epoch 00008: val_loss improved from 7.41155 to 7.27573, saving model to weights/lstm.hdf5
Epoch 9/20
Epoch 00009: val_loss improved from 7.27573 to 6.97503, saving model to weights/lstm.hdf5
Epoch 10/20
Epoch 00010: val_loss improved from 6.97503 to 6.71988, saving model to weights/lstm.hdf5
Epoch 11/20
Epoch 00011: val_loss improved from 6.71988 to 6.58038, saving model to weights/lstm.hdf5
Epoch 12/20
Epoch 00012: val_loss did

## Baseline #4: LSTM + Word2Vec 

In [45]:
lstm_w2v = Sequential()

lstm_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
lstm_w2v.add(LSTM(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
lstm_w2v.add(LSTM(256, return_sequences=True))
lstm_w2v.add(LSTM(256))
lstm_w2v.add(Dense(vocab_size, activation='softmax'))

In [46]:
lstm_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [47]:
filepath = "weights/lstm_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [48]:
lstm_w2v_loss = lstm_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 14102 samples, validate on 3526 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.38322, saving model to weights/lstm_w2v.hdf5
Epoch 2/20
Epoch 00002: val_loss did not improve from 7.38322
Epoch 3/20
Epoch 00003: val_loss improved from 7.38322 to 7.37524, saving model to weights/lstm_w2v.hdf5
Epoch 4/20
Epoch 00004: val_loss improved from 7.37524 to 6.81413, saving model to weights/lstm_w2v.hdf5
Epoch 5/20
Epoch 00005: val_loss improved from 6.81413 to 6.60395, saving model to weights/lstm_w2v.hdf5
Epoch 6/20
Epoch 00006: val_loss did not improve from 6.60395
Epoch 7/20
Epoch 00007: val_loss did not improve from 6.60395
Epoch 8/20
Epoch 00008: val_loss improved from 6.60395 to 6.50708, saving model to weights/lstm_w2v.hdf5
Epoch 9/20
Epoch 00009: val_loss did not improve from 6.50708
Epoch 10/20
Epoch 00010: val_loss improved from 6.50708 to 6.49947, saving model to weights/lstm_w2v.hdf5
Epoch 11/20
Epoch 00011: val_loss did not improve from 6.49947
Epoch 12/20
E

## Generation

In [130]:
# using top k sampling
def sample(preds, top_k):
    
    top_ids = preds.argsort()[-top_k:][::-1]
    next_id = top_ids[random.sample(range(top_k),1)[0]]
    
    return next_id

In [24]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

In [138]:
def generate(model=gru, prompt='In this paper', n=20, top_k=10):
    
    word_ids = [word_to_id(word) for word in prompt.lower().split()]
    
    for i in range(n):
        prediction = model.predict(x=np.array(word_ids))
        id = sample(prediction[-1], top_k)
        word_ids.append(id)
        
    words = [id_to_word(w) for w in word_ids]
    
    return ' '.join(words)

In [140]:
generate()

'in this paper umuteam harness oupoco succession coce raisonnement vis refuse videmment sentes srivastava airbus verypluming edit neuralclassifier originating bistparser subsective overestimated natifs'

In [61]:
generate(model=lstm, n=50)

'in this paper de that to the the we a and a as a that of we we the the that of a internal a of a to we of the a the describe learning order ner has used l order events e general correlations s it human a of the a we'

In [79]:
generate(model=lstm_w2v, prompt='In this paper we present a novel approach', n=20)

'in this paper we present a novel approach efforts f annotated four approaches present mechanism apply bert all accuracy identification real generate century long weather unlike and detection'

## Evaluation

In [55]:
models = {gru_loss: 'GRU', gru_w2v_loss: 'GRU + Word2Vec', lstm_loss: 'LSTM', lstm_w2v_loss: 'LSTM + Word2Vec'}

In [56]:
# get minimimum validation loss within a set num of epochs
def min_val_loss(model, max_epochs=50):
    return min(model.history['val_loss'][:max_epochs])

In [57]:
for m in models.keys():
    print("Minimum validation loss for {}: {:.5f}".format(models[m], min_val_loss(m)))
    print("Perplexity for model {}: {:.2f}\n".format(models[m], math.exp(min_val_loss(m))))

Minimum validation loss for GRU: 4.25001
Perplexity for model GRU: 70.11

Minimum validation loss for GRU + Word2Vec: 2.66512
Perplexity for model GRU + Word2Vec: 14.37

Minimum validation loss for LSTM: 6.53958
Perplexity for model LSTM: 692.00

Minimum validation loss for LSTM + Word2Vec: 6.45779
Perplexity for model LSTM + Word2Vec: 637.65

