In [2]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import Dense 
from keras.layers import Embedding, Bidirectional, CuDNNLSTM
from keras.initializers import Constant
from keras import Sequential
import src.dataset as ds
import pickle


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# --- Paths (relative to the notebook's working directory) ---
# Directory holding the pickled pre-trained embedding tables read by load_embeddings().
WORDS_VECTORS_DIR = 'word_vectors/'
# Directory holding the lyrics corpus.
LYRICS_DIR = 'Data/'
# NOTE(review): GLOVE_DIR is not referenced by any visible cell of this notebook.
GLOVE_DIR = os.path.join(WORDS_VECTORS_DIR, 'glove.6B')
# Text file used to fit the tokenizer vocabulary.
TEXT_DATA = os.path.join(LYRICS_DIR, 'unified_lyrics_dump.txt')

# --- Model hyper-parameters ---
# During each step of the training phase, the architecture receives one word
# of the lyrics as input, hence a sequence length of 1.
MAX_SEQUENCE_LENGTH = 1
# Width of every embedding vector (column count of embedding_matrix).
# Presumably must equal the dimensionality of the pickled vectors — confirm.
EMBEDDING_DIM = 300
# NOTE(review): VALIDATION_SPLIT is defined here but not passed to model.fit
# in the visible cells, so no validation data is currently held out.
VALIDATION_SPLIT = 0.2

In [4]:
def load_embeddings(embeddings='glove'):
    """Read a pickled table of pre-trained word vectors from disk.

    The file is looked up as
    ``<WORDS_VECTORS_DIR>/<embeddings>_embeddings.pickle``.

    Args:
        embeddings: Prefix of the pickle file name (default ``'glove'``).

    Returns:
        The unpickled object.  The rest of the notebook calls ``.get()`` on it
        with an encoded (bytes) word, i.e. treats it as a mapping from words
        to vectors.

    Note:
        ``pickle.load`` can execute arbitrary code while deserialising, so
        only point this at files from a trusted source.
    """
    pickle_name = f'{embeddings}_embeddings.pickle'
    pickle_path = os.path.join(WORDS_VECTORS_DIR, pickle_name)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# Load the default (GloVe) table once for the whole notebook.
pretrained_embeddings = load_embeddings()

In [5]:
# Load the (input, target) samples from the project's dataset module; both are
# later tokenized and fed to model.fit.
# NOTE(review): presumably X[i] is a word and y[i] the word that follows it in
# the lyrics — confirm against src.dataset.load_data.
X, y = ds.load_data()

100%|██████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 369.68it/s]


In [6]:
# Read the whole lyrics dump; it is only used to build the vocabulary.
with open(TEXT_DATA, 'r') as lyrics_file:
    text = lyrics_file.read()

# filters='' disables the tokenizer's default character filtering, so
# punctuation marks stay in the vocabulary as tokens.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text])

# Replace every sample by the integer id of its first token, turning X and y
# into flat lists of word ids (one id per sample).
# NOTE: X and y are overwritten in place, so this cell must be preceded by a
# fresh run of the data-loading cell when re-executed.
X = [token_ids[0] for token_ids in tokenizer.texts_to_sequences(X)]
y = [token_ids[0] for token_ids in tokenizer.texts_to_sequences(y)]

In [7]:
# prepare embedding matrix
# Build the (num_words x EMBEDDING_DIM) weight matrix for the Embedding layer.
word_index = tokenizer.word_index
# +1 because the tokenizer's ids start at 1; row 0 is never assigned below.
num_words = len(word_index) + 1
print('Vocabulary Size: {}'.format(num_words))

# Every row starts as zeros, so words without a pre-trained vector keep an
# all-zero embedding.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
not_found = []  # vocabulary words missing from the pre-trained table

# TODO: also try the capitalised form of each word (for word2vec)
for word, i in word_index.items():
    # The pre-trained table is queried with the encoded (bytes) form of the word.
    embedding_vector = pretrained_embeddings.get(word.encode())
    if embedding_vector is None:
        not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector

Vocabulary Size: 7506


In [8]:
# One-hot encode the target word ids (num_words classes), the label format
# expected by the categorical_crossentropy loss used when compiling the model.
# NOTE: y is overwritten in place; re-running this cell on its own would
# one-hot encode the already-encoded matrix, so re-run from tokenization.
y = to_categorical(y, num_classes=num_words)

In [9]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
# Embedding layer initialised from the pre-trained vectors in embedding_matrix.
# trainable=False freezes those weights so training leaves them untouched.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [10]:
# define model
# Next-word model: frozen embeddings -> bidirectional LSTM (50 units per
# direction) -> softmax over the whole vocabulary.
model = Sequential([
    embedding_layer,
    Bidirectional(CuDNNLSTM(50)),
    Dense(num_words, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 300)            2251800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               140800    
_________________________________________________________________
dense_1 (Dense)              (None, 7506)              758106    
Total params: 3,150,706
Trainable params: 898,906
Non-trainable params: 2,251,800
_________________________________________________________________


In [15]:
# compile network
# Compile the network: multi-class cross-entropy over the one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network.
# X is a plain Python list of word ids.  Keras may interpret a list as
# "one array per model input", so convert it explicitly to an ndarray of shape
# (n_samples, MAX_SEQUENCE_LENGTH) matching the Embedding layer's input.
# NOTE(review): VALIDATION_SPLIT from the config cell is not applied here.
# The History object is kept so the loss/accuracy curves can be inspected.
history = model.fit(np.asarray(X).reshape(-1, MAX_SEQUENCE_LENGTH), y, epochs=5, verbose=2)

Epoch 1/5
 - 45s - loss: 5.1930 - acc: 0.1833
Epoch 2/5
 - 38s - loss: 4.5662 - acc: 0.2135
Epoch 3/5
 - 38s - loss: 4.3401 - acc: 0.2237
Epoch 4/5
 - 38s - loss: 4.2154 - acc: 0.2281
Epoch 5/5
 - 38s - loss: 4.1370 - acc: 0.2308


<keras.callbacks.History at 0x19799264828>

In [16]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Generate text by repeatedly feeding the model its own last prediction.

    The model consumes a single word per step, so only the last token of the
    current text is used as input.  This also makes multi-word seeds work:
    generation simply continues from the seed's final word.

    Args:
        model: Trained model exposing ``predict_classes(x, verbose=...)``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text to start from; returned unchanged as the prefix of
            the result.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by up to ``n_words`` generated words, separated
        by single spaces.  Generation stops early when the current word is not
        in the tokenizer vocabulary or the predicted id maps to no word.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = seed_text, seed_text
    for _ in range(n_words):
        # Encode the current text as integer ids.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # Out-of-vocabulary (or empty) input: nothing to feed the model.
            break
        # Batch of one sample holding one time step: shape (1, 1).
        step_input = np.array([[encoded[-1]]])
        yhat = model.predict_classes(step_input, verbose=0)
        # Map the predicted id back to its word.
        out_word = index_to_word.get(int(np.ravel(yhat)[0]))
        if out_word is None:
            # The predicted id (e.g. 0) has no entry in the vocabulary.
            break
        # The prediction becomes the next input and is appended to the output.
        in_text, result = out_word, result + ' ' + out_word
    return result

In [24]:
# Generate 10 words with the trained model, starting from the seed word 'hey'.
generate_seq(model, tokenizer, 'hey', 10)

"hey little bit . i 'm gon na be . i"

In [None]:
# Find which words have no entry in the pre-trained embeddings
def check_our_corpus(our_words, embeddings=None):
    """Return the words that have no entry in the pre-trained embeddings.

    Args:
        our_words: Iterable of words (``str``) to look up.
        embeddings: Container keyed by the encoded (bytes) form of each word.
            Defaults to the notebook-level ``pretrained_embeddings`` table.

    Returns:
        List of the words from ``our_words``, in their original order, whose
        encoded form is not a key of ``embeddings``.
    """
    if embeddings is None:
        # Resolved at call time so the table loaded by this notebook is used.
        embeddings = pretrained_embeddings
    return [word for word in our_words if word.encode() not in embeddings]