# Baseline Models

In [1]:
import numpy as np

import gensim

import math

from tensorflow.keras import utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import History 



In [2]:
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [3]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size, emdedding_size

(17862, 100)

In [4]:
x = np.load('data/x.npy')
y = np.load('data/y.npy')[:,0]

In [5]:
x.shape, y.shape

((5841, 100), (5841,))

In [6]:
# instantiate history to save losses
history = History()

## Baseline #1: GRU

In [25]:
gru = Sequential()

gru.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
gru.add(GRU(128, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru.add(Dropout(0.3))
gru.add(GRU(128))
gru.add(Dropout(0.3))
# gru.add(GRU(64))
# gru.add(Dropout(0.3))
gru.add(Dense(vocab_size, activation='softmax'))

In [26]:
gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [9]:
filepath = "weights/gru.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [27]:
gru_loss = gru.fit(x, y, validation_split=0.2, batch_size=64, epochs=10, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/10
Epoch 00001: val_loss did not improve from 7.43431
Epoch 2/10
Epoch 00002: val_loss did not improve from 7.43431
Epoch 3/10
Epoch 00003: val_loss did not improve from 7.43431
Epoch 4/10
Epoch 00004: val_loss did not improve from 7.43431
Epoch 5/10
Epoch 00005: val_loss did not improve from 7.43431
Epoch 6/10
Epoch 00006: val_loss did not improve from 7.43431
Epoch 7/10
Epoch 00007: val_loss did not improve from 7.43431
Epoch 8/10
Epoch 00008: val_loss did not improve from 7.43431
Epoch 9/10
Epoch 00009: val_loss did not improve from 7.43431
Epoch 10/10
Epoch 00010: val_loss did not improve from 7.43431


In [14]:
gru_loss = gru.fit(x, y, validation_split=0.2, batch_size=64, epochs=5, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/5
Epoch 00001: val_loss did not improve from 7.53704
Epoch 2/5
Epoch 00002: val_loss did not improve from 7.53704
Epoch 3/5
Epoch 00003: val_loss did not improve from 7.53704
Epoch 4/5
Epoch 00004: val_loss did not improve from 7.53704
Epoch 5/5
Epoch 00005: val_loss did not improve from 7.53704


In [24]:
gru_loss = gru.fit(x, y, validation_split=0.2, batch_size=64, epochs=10, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/10
Epoch 00001: val_loss improved from 7.53704 to 7.43431, saving model to weights/gru.hdf5
Epoch 2/10
Epoch 00002: val_loss did not improve from 7.43431
Epoch 3/10
Epoch 00003: val_loss did not improve from 7.43431
Epoch 4/10
Epoch 00004: val_loss did not improve from 7.43431
Epoch 5/10
Epoch 00005: val_loss did not improve from 7.43431
Epoch 6/10
Epoch 00006: val_loss did not improve from 7.43431
Epoch 7/10
Epoch 00007: val_loss did not improve from 7.43431
Epoch 8/10
Epoch 00008: val_loss did not improve from 7.43431
Epoch 9/10
Epoch 00009: val_loss did not improve from 7.43431
Epoch 10/10
Epoch 00010: val_loss did not improve from 7.43431


## Baseline #2: GRU + Word2Vec

In [28]:
gru_w2v = Sequential()

gru_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
gru_w2v.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(GRU(256, return_sequences=True))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(GRU(128))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(Dense(vocab_size, activation='softmax'))

In [29]:
gru_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [30]:
filepath = "weights/gru_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [31]:
gru_w2v_loss = gru_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=20, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.63390, saving model to weights/gru_w2v.hdf5
Epoch 2/20
Epoch 00002: val_loss did not improve from 7.63390
Epoch 3/20
Epoch 00003: val_loss did not improve from 7.63390
Epoch 4/20
Epoch 00004: val_loss did not improve from 7.63390
Epoch 5/20
Epoch 00005: val_loss did not improve from 7.63390
Epoch 6/20
Epoch 00006: val_loss improved from 7.63390 to 7.51785, saving model to weights/gru_w2v.hdf5
Epoch 7/20
Epoch 00007: val_loss improved from 7.51785 to 7.48169, saving model to weights/gru_w2v.hdf5
Epoch 8/20
Epoch 00008: val_loss did not improve from 7.48169
Epoch 9/20
Epoch 00009: val_loss improved from 7.48169 to 7.43547, saving model to weights/gru_w2v.hdf5
Epoch 10/20
Epoch 00010: val_loss improved from 7.43547 to 7.38837, saving model to weights/gru_w2v.hdf5
Epoch 11/20
Epoch 00011: val_loss improved from 7.38837 to 7.30014, saving model to weights/gru_w2v.hdf5
Epoch 12/20
Epoch 00

## Baseline #3: LSTM 

In [32]:
lstm = Sequential()

lstm.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
lstm.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(GRU(256, return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(GRU(128))
lstm.add(Dropout(0.2))
lstm.add(Dense(vocab_size, activation='softmax'))

In [33]:
lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [34]:
filepath = "weights/lstm.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [35]:
lstm_loss = lstm.fit(x, y, validation_split=0.2, batch_size=64, epochs=10, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 7.54207, saving model to weights/lstm.hdf5
Epoch 2/20
Epoch 00002: val_loss did not improve from 7.54207
Epoch 3/20
Epoch 00003: val_loss did not improve from 7.54207
Epoch 4/20
Epoch 00004: val_loss did not improve from 7.54207
Epoch 5/20
Epoch 00005: val_loss did not improve from 7.54207
Epoch 6/20
Epoch 00006: val_loss did not improve from 7.54207
Epoch 7/20
Epoch 00007: val_loss did not improve from 7.54207
Epoch 8/20
Epoch 00008: val_loss did not improve from 7.54207
Epoch 9/20
Epoch 00009: val_loss did not improve from 7.54207
Epoch 10/20
Epoch 00010: val_loss did not improve from 7.54207
Epoch 11/20
Epoch 00011: val_loss did not improve from 7.54207
Epoch 12/20
Epoch 00012: val_loss did not improve from 7.54207
Epoch 13/20
Epoch 00013: val_loss did not improve from 7.54207
Epoch 14/20
Epoch 00014: val_loss improved from 7.54207 to 7.39421, saving model to weights/lstm.hdf5
Epoch

## Baseline #4: LSTM + Word2Vec 

In [36]:
lstm_w2v = Sequential()

lstm_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_model.wv.vectors]))
lstm_w2v.add(LSTM(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(LSTM(256, return_sequences=True))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(LSTM(128))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(Dense(vocab_size, activation='softmax'))

In [37]:
lstm_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [38]:
filepath = "weights/lstm_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [47]:
lstm_w2v_loss = lstm_w2v.fit(x, y, validation_split=0.2, batch_size=64, epochs=100, callbacks=callbacks)

Train on 4672 samples, validate on 1169 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 7.51032, saving model to weights/lstm_w2v.hdf5
Epoch 2/100
Epoch 00002: val_loss did not improve from 7.51032
Epoch 3/100
Epoch 00003: val_loss did not improve from 7.51032
Epoch 4/100
Epoch 00004: val_loss did not improve from 7.51032
Epoch 5/100
Epoch 00005: val_loss did not improve from 7.51032
Epoch 6/100
Epoch 00006: val_loss did not improve from 7.51032
Epoch 7/100
Epoch 00007: val_loss did not improve from 7.51032
Epoch 8/100
Epoch 00008: val_loss did not improve from 7.51032
Epoch 9/100
Epoch 00009: val_loss did not improve from 7.51032
Epoch 10/100
Epoch 00010: val_loss did not improve from 7.51032
Epoch 11/100
Epoch 00011: val_loss improved from 7.51032 to 7.28065, saving model to weights/lstm_w2v.hdf5
Epoch 12/100
Epoch 00012: val_loss improved from 7.28065 to 7.21565, saving model to weights/lstm_w2v.hdf5
Epoch 13/100
Epoch 00013: val_loss improved from 7.21565 to 6.98366,

## Generation

In [40]:
def sample(preds, temperature):
    if temperature <= 0:
        return np.argmax(preds)

    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
  
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [41]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

In [42]:
def generate(model=gru, prompt='In this paper', words=20, temperature=0.2):
    word_ids = [word_to_id(word) for word in prompt.lower().split()]
    
    for i in range(words):
        prediction = model.predict(x=np.array(word_ids))
        id = sample(prediction[-1], temperature)
        word_ids.append(id)
    
    return ' '.join(id_to_word(id) for id in word_ids)

In [46]:
generate(model=gru_w2v, prompt='In this paper we present a novel approach')

'in this paper we present a novel approach faroese bots websites guards ideologies satisfies unibuc mance cialis automatizes royal utilit toute som bcp yifan entitled distorted approximation subtree'

## Evaluation

In [60]:
models = {gru_loss: 'GRU', gru_w2v_loss: 'GRU + Word2Vec', lstm_loss: 'LSTM', lstm_w2v_loss: 'LSTM + Word2Vec'}

In [66]:
# get minimimum validation loss for a model 
def min_val_loss(model, max_epochs=100):
    return min(model.history['val_loss'][:max_epochs])

In [70]:
for m in models.keys():
    print("Minimum validation loss for {}: {:.5f}".format(models[m], min_val_loss(m, 50)))
    print("Perplexity for model {}: {:.2f}\n".format(models[m], math.exp(min_val_loss(m))))

Minimum validation loss for GRU: 7.56549
Perplexity for model GRU: 1930.41

Minimum validation loss for GRU + Word2Vec: 5.64481
Perplexity for model GRU + Word2Vec: 282.82

Minimum validation loss for LSTM: 6.84608
Perplexity for model LSTM: 940.19

Minimum validation loss for LSTM + Word2Vec: 5.65056
Perplexity for model LSTM + Word2Vec: 234.66

