# Baseline Models

In [45]:
import preprocessing

import numpy as np

import gensim

from tensorflow.keras import utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout, Embedding

## Word2Vec

In [39]:
max_sentence_len = 100

sent_split = zip(*[iter(preprocessing.tokens)]*max_sentence_len)
sentences = [list(s) for s in sent_split]

In [40]:
# adjust word embedding size according to paper findings

w2v_abs_model = gensim.models.Word2Vec(sentences, vector_size=100, min_count=1, window=5, epochs=20)
w2v_abs_model.save("word2vec_arxiv_abstracts.model")

KeyboardInterrupt: 

In [26]:
vocab_size, emdedding_size = w2v_abs_model.wv.vectors.shape
print(vocab_size,emdedding_size)

37613 100


In [27]:
example_vector = w2v_abs_model.wv['computer']
print(example_vector)

[ 4.372654    2.7433672   0.62932134 -0.8145154  -0.9035992  -1.4313896
  1.5141246  -1.9577683   0.15298018 -1.295687   -1.7438972   1.1404539
  2.0444472   2.3379836   2.034976   -0.8598816  -1.2982026   0.26897135
 -1.2041309  -1.4754906   0.52895623  0.11968645 -4.9213243   0.17976657
 -4.659229    0.06960139 -3.7481487   0.45358458  2.0879383  -0.5005908
  0.5270861   0.7232373  -0.9296084  -0.66182303  0.9198066   0.98656416
  1.5054046   0.7764223  -2.9103794  -3.245516   -2.9878287  -2.6478703
  1.288185    2.2701735   0.01422079  2.507414   -0.6688591  -1.3786908
  0.4868558   0.22375818  2.0643394   0.8521452   3.7033792  -1.5795045
 -1.2823077  -2.4139853   0.770057    2.065975    1.6900107  -3.6990576
 -1.5111724  -4.3736672   1.6173257   2.0998533  -0.6258489  -2.1910467
  2.344055   -2.5877545  -0.59883416  0.1914055   0.5979591  -1.1765189
  1.1242287   2.663258   -2.7722313   3.4164665  -0.03385113  1.063215
 -1.9169434  -0.7599689  -0.9681411   0.5473986  -2.4375563   

In [29]:
example_similar = w2v_abs_model.wv.most_similar('language', topn=10) 
print(example_similar)

[('languages', 0.555637001991272), ('lan', 0.4815512001514435), ('badges', 0.4753008484840393), ('paraphrase', 0.4374748170375824), ('samplerank', 0.4367715120315552), ('deduction', 0.43126803636550903), ('disasters', 0.42568325996398926), ('dialect', 0.42256200313568115), ('nlu', 0.42144766449928284), ('phonology', 0.41899043321609497)]


In [38]:
def word_to_id(word):
    return w2v_abs_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_abs_model.wv.index_to_key[id]

## Training data

In [31]:
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)

for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word_to_id(word)
    train_y[i] = word_to_id(sentence[-1])

print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x shape: (236240, 100)
train_y shape: (236240,)


## Baseline #1: GRU

In [80]:
gru = Sequential()

gru.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
gru.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru.add(Dropout(0.2))
gru.add(GRU(256, return_sequences=True))
gru.add(Dropout(0.2))
gru.add(GRU(128))
gru.add(Dropout(0.2))
gru.add(Dense(vocab_size, activation='softmax'))

In [81]:
gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [82]:
filepath = "weights/gru.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [83]:
gru.fit(train_x, train_y, batch_size=128, epochs=1, callbacks=callbacks)

Train on 236240 samples
 10624/236240 [>.............................] - ETA: 54:57 - loss: 8.2486
Epoch 00001: loss improved from inf to 8.14925, saving model to weights/gru.hdf5
 10624/236240 [>.............................] - ETA: 55:01 - loss: 8.2486

KeyboardInterrupt: 

## Baseline #2: GRU + Word2Vec

In [62]:
gru_w2v = Sequential()

gru_w2v.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[w2v_abs_model.wv.vectors]))
gru_w2v.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(GRU(256, return_sequences=True))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(GRU(128))
gru_w2v.add(Dropout(0.2))
gru_w2v.add(Dense(vocab_size, activation='softmax'))

In [67]:
gru_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [68]:
filepath = "weights/gru_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [69]:
gru_w2v.fit(train_x, train_y, batch_size=128, epochs=10, callbacks=callbacks)

Train on 236240 samples
Epoch 1/10


KeyboardInterrupt: 

## Baseline #3: LSTM

In [70]:
lstm = Sequential()

lstm.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size))
lstm.add(GRU(256, input_shape=(vocab_size, emdedding_size), return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(GRU(256, return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(GRU(128))
lstm.add(Dropout(0.2))
lstm.add(Dense(vocab_size, activation='softmax'))

In [71]:
lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [72]:
filepath = "weights/lstm.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [73]:
lstm.fit(train_x, train_y, batch_size=128, epochs=10, callbacks=callbacks)

Train on 236240 samples
Epoch 1/10


KeyboardInterrupt: 

## Baseline #4: LSTM + Word2Vec 

In [74]:
lstm_w2v = Sequential()

lstm_w2v.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[w2v_abs_model.wv.vectors]))
lstm_w2v.add(LSTM(256, input_shape=(vocab_size, 100), return_sequences=True))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(LSTM(256, return_sequences=True))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(LSTM(128))
lstm_w2v.add(Dropout(0.2))
lstm_w2v.add(Dense(vocab_size, activation='softmax'))

In [75]:
lstm_w2v.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [76]:
filepath = "weights/lstm_w2v.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [77]:
lstm_w2v.fit(train_x, train_y, batch_size=128, epochs=10, callbacks=callbacks)

Train on 236240 samples
Epoch 1/10


KeyboardInterrupt: 

## Generation

In [78]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)

    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
  
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [79]:
def generate(text, num_generated=20):
    word_ids = [word_to_id(word) for word in text.lower().split()]
    
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_ids))
        id = sample(prediction[-1], temperature=0.5)
        word_ids.append(id)
    
    return ' '.join(id_to_word(id) for id in word_ids)

In [None]:
generate("In this paper")

## Evaluation