In [14]:
import src.dataset as ds
import numpy as np
from src.embeddings import extract_embedding_weights
from keras.layers import Embedding, CuDNNLSTM, Bidirectional, Dense
from keras.initializers import Constant
from keras import Sequential

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Load the model inputs (X), the training targets (y) and the tokenizer whose
# word_index defines the vocabulary used by the cells below.
# NOTE(review): the shapes/encoding of X and y are decided inside src.dataset;
# the model below assumes one token index per sample and targets compatible
# with categorical_crossentropy — confirm against load_tokenized_data.
X, y, tokenizer = ds.load_tokenized_data()

100%|██████████████████████████████████████████████████████████████████| 615/615 [00:02<00:00, 289.37it/s]


In [16]:
# Weights used to initialise the Embedding layer in the next cell.
# NOTE(review): presumably one row per vocabulary index and EMBEDDING_DIM
# columns — confirm in src.embeddings.extract_embedding_weights.
embedding_matrix = extract_embedding_weights()

100%|██████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 367.06it/s]


In [17]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = True, so the pre-trained vectors are only the
# starting point and are fine-tuned together with the rest of the network
# (set trainable = False instead to keep the embeddings fixed)
# NOTE(review): EMBEDDING_DIM must equal the width of embedding_matrix — confirm
EMBEDDING_DIM = 300
# vocabulary size; the +1 presumably accounts for index 0, which word_index
# does not use — verify against the tokenizer
num_words = len(tokenizer.word_index) + 1
# input_length=1: every sample fed to the model is a single token index
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=1,
                            trainable=True)

In [18]:
# assemble the network in one go: embedding -> bidirectional LSTM (50 units
# per direction) -> softmax over every word index
model = Sequential([
    embedding_layer,
    Bidirectional(CuDNNLSTM(50)),
    Dense(num_words, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 300)            2251800   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100)               140800    
_________________________________________________________________
dense_4 (Dense)              (None, 7506)              758106    
Total params: 3,150,706
Trainable params: 3,150,706
Non-trainable params: 0
_________________________________________________________________


In [19]:
# configure the training objective, the optimiser and the reported metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# train on X / y; verbose=2 prints a single summary line per epoch
model.fit(x=X, y=y, epochs=100, verbose=2)

Epoch 1/100
 - 47s - loss: 5.0822 - acc: 0.1939
Epoch 2/100
 - 45s - loss: 4.4329 - acc: 0.2271
Epoch 3/100
 - 44s - loss: 4.1963 - acc: 0.2372
Epoch 4/100
 - 44s - loss: 4.0594 - acc: 0.2439
Epoch 5/100
 - 45s - loss: 3.9705 - acc: 0.2475
Epoch 6/100
 - 44s - loss: 3.9100 - acc: 0.2512
Epoch 7/100
 - 45s - loss: 3.8686 - acc: 0.2522
Epoch 8/100
 - 44s - loss: 3.8392 - acc: 0.2528
Epoch 9/100
 - 44s - loss: 3.8163 - acc: 0.2531
Epoch 10/100
 - 44s - loss: 3.8001 - acc: 0.2529
Epoch 11/100
 - 44s - loss: 3.7875 - acc: 0.2528
Epoch 12/100
 - 44s - loss: 3.7772 - acc: 0.2537
Epoch 13/100
 - 44s - loss: 3.7686 - acc: 0.2537
Epoch 14/100
 - 44s - loss: 3.7624 - acc: 0.2534
Epoch 15/100
 - 44s - loss: 3.7572 - acc: 0.2536
Epoch 16/100
 - 44s - loss: 3.7520 - acc: 0.2530
Epoch 17/100
 - 44s - loss: 3.7470 - acc: 0.2537
Epoch 18/100
 - 44s - loss: 3.7448 - acc: 0.2529
Epoch 19/100
 - 44s - loss: 3.7413 - acc: 0.2531
Epoch 20/100
 - 44s - loss: 3.7382 - acc: 0.2534
Epoch 21/100
 - 44s - loss: 3

<keras.callbacks.History at 0x1555a6c46d8>

In [21]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Greedily generate up to ``n_words`` words that continue ``seed_text``.

    At every step a single token index is fed to ``model`` and the word with
    the highest predicted score is appended to the result; that word then
    becomes the input of the next step.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns one row of
            per-word-index scores for each input row. Assumed to take one
            token index per sample (as a model built with ``input_length=1``
            does).
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``
            (a mapping from word to integer index).
        seed_text: text to continue. If it encodes to several tokens, only
            the last one is fed to the model, since the model sees a single
            token per step.
        n_words: maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the current input encodes to an
        empty sequence or when the predicted index has no entry in
        ``tokenizer.word_index``.
    """
    # invert the vocabulary once instead of scanning it for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if len(encoded) == 0:
            # nothing the model can be fed with; return what we have so far
            break
        # feed exactly one token as a (batch=1, length=1) array; passing every
        # seed token would be treated as a batch and yield several predictions
        last_token = np.array(encoded[-1:]).reshape(1, 1)
        # predict a word in the vocabulary (argmax over the per-index scores,
        # equivalent to the deprecated Sequential.predict_classes)
        scores = model.predict(last_token, verbose=0)
        yhat = int(np.argmax(scores, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat)
        if out_word is None:
            # predicted an index with no word attached; cannot continue
            break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [23]:
generate_seq(model, tokenizer, 'i', 10)

"i 'm gon na get it 's a little bit ."