# Todo
1. Remove GloVe.
2. Decrease the embedding size and use word2vec.
3. Use [center loss](https://medium.com/mlreview/experiments-with-a-new-loss-term-added-to-the-standard-cross-entropy-85b080c42446).



In [31]:
import warnings
warnings.filterwarnings("ignore")
import src.dataset as ds
import numpy as np
import keras.backend as K
from src.embeddings import extract_embedding_weights
from keras.layers import Embedding, CuDNNLSTM, Bidirectional, Dense, CuDNNGRU
from keras.initializers import Constant
from keras import Sequential
from tensorflow.python.keras.callbacks import TensorBoard
from time import time
import tensorflow as tf

In [35]:
def perplexity(y_true, y_pred):
    """Keras metric: perplexity derived from the categorical cross-entropy.

    Args:
        y_true: target tensor, passed unchanged to
            ``K.categorical_crossentropy``.
        y_pred: prediction tensor, passed unchanged to
            ``K.categorical_crossentropy``.

    Returns:
        A tensor with ``exp(cross_entropy)`` for every cross-entropy value
        returned by the backend.
    """
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    # The backend cross-entropy is computed with the natural logarithm (nats),
    # so the matching perplexity is e**H. Using 2**H is only correct when the
    # entropy is measured in bits and would under-report the value here.
    return K.exp(cross_entropy)

In [36]:
# TensorBoard callback shared by every training run in this notebook:
# events go to ./logs, the graph is written, image summaries are off and
# histogram_freq is 0.
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=False,
)

In [37]:
# Load the tokenized dataset: model inputs X, targets y and the tokenizer.
# The tokenizer's word_index / texts_to_sequences are used further down
# (presumably a Keras Tokenizer — TODO confirm in src.dataset).
X, y, tokenizer = ds.load_tokenized_data()
# Pre-trained embedding weights; make_model() uses them to initialise its
# frozen Embedding layer.
embedding_matrix = extract_embedding_weights()

100%|███████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 334.64it/s]
100%|███████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 336.07it/s]


In [40]:
def make_model(rnn_units=50, bidirectional=True, rnn_type='lstm'):
    """Build the embedding -> recurrent -> softmax network and print its summary.

    Reads the notebook-level ``tokenizer`` and ``embedding_matrix``.

    Args:
        rnn_units: number of units handed to the recurrent layer.
        bidirectional: when true, the recurrent layer is wrapped in
            ``Bidirectional``.
        rnn_type: ``'lstm'`` (CuDNNLSTM) or ``'gru'`` (CuDNNGRU); any other
            key raises ``KeyError``.

    Returns:
        The assembled (not yet compiled) ``Sequential`` model.
    """
    # Resolve the layer class first so an unknown key fails before any layer
    # is constructed.
    recurrent_cls = {'lstm': CuDNNLSTM, 'gru': CuDNNGRU}[rnn_type]

    embedding_dim = 300
    vocab_size = len(tokenizer.word_index) + 1

    # Pre-trained vectors go into an Embedding layer that is kept fixed
    # during training (trainable=False).
    frozen_embeddings = Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=1,
        trainable=False,
    )

    model = Sequential()
    model.add(frozen_embeddings)

    recurrent = recurrent_cls(rnn_units)
    model.add(Bidirectional(recurrent) if bidirectional else recurrent)

    # One softmax output per vocabulary entry.
    model.add(Dense(vocab_size, activation='softmax'))
    model.summary()
    return model
    
    
def train(model, epochs=5, batch_size=32):
    """Compile ``model`` and fit it on the notebook-level ``X`` / ``y``.

    The model is compiled with categorical cross-entropy, the Adam optimizer
    and the ``perplexity`` metric, then fitted with shuffling, a 20%
    validation split and the shared ``tensorboard`` callback.

    Args:
        model: Keras model to compile and train.
        epochs: number of epochs passed to ``model.fit``.
        batch_size: batch size passed to ``model.fit``.

    Returns:
        Whatever ``model.fit`` returns (the Keras training history), so the
        per-epoch metrics can be inspected or plotted in the notebook instead
        of being discarded. End the calling cell with ``;`` to hide its repr.
    """
    # NOTE(review): compile() runs on every call, so calling train() twice on
    # the same model presumably re-creates the optimizer between runs —
    # confirm this is intended for the back-to-back training cells.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[perplexity])
    return model.fit(X, y,
                     epochs=epochs,
                     batch_size=batch_size,
                     verbose=1,
                     shuffle=True,
                     validation_split=0.2,
                     callbacks=[tensorboard])
    
    
def generate_seq(model, tokenizer, seed_text, n_words):
    """Generate a sequence from the model, one word at a time.

    Each step feeds the most recent word to the model and appends the
    predicted word to the result.

    Args:
        model: object exposing ``predict_classes(x, verbose=0)``.
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.
        seed_text: starting text; when it holds several known words only the
            last one conditions the first prediction, because the model takes
            a single word per sample.
        n_words: maximum number of words to generate.

    Returns:
        ``seed_text`` followed by the generated words, space-separated.
        Generation stops early when the current text contains no known word
        or the predicted index has no entry in ``tokenizer.word_index``.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = seed_text, seed_text
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if len(encoded) == 0:
            # nothing the model knows about to condition on
            break
        # One sample, one timestep: keep only the last word so a multi-word
        # seed does not turn into a batch of several predictions.
        model_input = np.array(encoded[-1:]).reshape(1, 1)
        # predict a word in the vocabulary
        yhat = model.predict_classes(model_input, verbose=0)
        predicted_index = int(np.asarray(yhat).ravel()[0])
        # map predicted word index to word
        out_word = index_to_word.get(predicted_index, '')
        if not out_word:
            # index without a vocabulary entry: stop rather than append blanks
            break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [42]:
# Build a GRU model with 150 units; bidirectional is left at its default (True).
model = make_model(rnn_units=150, rnn_type='gru')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1, 300)            2251800   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 300)               406800    
_________________________________________________________________
dense_4 (Dense)              (None, 7506)              2259306   
Total params: 4,917,906
Trainable params: 2,666,106
Non-trainable params: 2,251,800
_________________________________________________________________


In [44]:
# First training pass: 10 epochs with batch size 128.
train(model, epochs=10, batch_size=128)

Train on 154596 samples, validate on 38650 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Second training pass on the same model object: 10 more epochs with batch size 32.
# NOTE(review): train() calls compile() again before fitting — confirm that
# re-compiling between the two passes is intended.
train(model, epochs=10, batch_size=32)

Train on 154596 samples, validate on 38650 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Sample 10 words starting from the seed 's'.
# NOTE(review): this cell's execution count (In [20]) is lower than the cells
# that build and train `model`, so the displayed output may come from an
# earlier model — re-run after training to confirm.
generate_seq(model, tokenizer, 's', 10)

"s . i 'm a little bit . i 'm a"