In [59]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Bidirectional, CuDNNLSTM
from keras.models import Model
from keras.initializers import Constant
from keras import Sequential
import src.dataset as ds
import pickle
import itertools
import random

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# --- Filesystem layout -------------------------------------------------------
LYRICS_DIR = 'Data/'
WORDS_VECTORS_DIR = 'word_vectors/'
TEXT_DATA = os.path.join(LYRICS_DIR, 'unified_lyrics_dump.txt')
GLOVE_DIR = os.path.join(WORDS_VECTORS_DIR, 'glove.6B')

# --- Model / training hyper-parameters ---------------------------------------
# The architecture receives a single word of the lyrics at each training step,
# so every input sequence has length 1.
MAX_SEQUENCE_LENGTH = 1
# Width of each pretrained word vector (number of columns of the embedding matrix).
EMBEDDING_DIM = 300
# Presumably the fraction of samples reserved for validation.
VALIDATION_SPLIT = 0.2

In [61]:
def load_embeddings(embeddings='glove'):
    """Read a pickled embedding table from ``WORDS_VECTORS_DIR``.

    Parameters
    ----------
    embeddings : str
        Prefix of the pickle file name; the file that is read is
        ``<WORDS_VECTORS_DIR>/<embeddings>_embeddings.pickle``.

    Returns
    -------
    object
        Whatever object was stored in the pickle, returned unchanged.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserialising, so only
    point this at pickle files you created yourself or otherwise trust.
    """
    pickle_name = f'{embeddings}_embeddings.pickle'
    pickle_path = os.path.join(WORDS_VECTORS_DIR, pickle_name)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


# Default call loads the GloVe table used by the cells below.
pretrained_embeddings = load_embeddings()

In [62]:
# Fetch the (X, y) pair produced by the project's dataset module.
# NOTE(review): presumably X holds input words and y the matching next-word
# targets — confirm against src/dataset.py.
# The tokenisation cell below rebinds X and y to integer ids, so re-run this
# cell first whenever that cell needs to be re-run.
X, y = ds.load_data()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 615/615 [00:01<00:00, 364.23it/s]


In [53]:
# Read the whole lyrics dump; it is used below to fit the tokenizer vocabulary.
with open(TEXT_DATA, 'r') as lyrics_file:
    text = lyrics_file.read()

# filters='' disables the tokenizer's character filtering, so no characters
# are stripped from the lyrics before the vocabulary is built.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text])


def _first_token_ids(samples):
    """Return, for each sample, the first vocabulary id of its token sequence."""
    return [ids[0] for ids in tokenizer.texts_to_sequences(samples)]


# Replace every entry of X / y by a single integer vocabulary id.
X = _first_token_ids(X)
y = _first_token_ids(y)

In [54]:
# Build the matrix that initialises the Embedding layer: row i holds the
# pretrained vector of the word whose tokenizer index is i.
word_index = tokenizer.word_index
# +1 because Keras Tokenizer indices start at 1 (index 0 is reserved).
num_words = len(word_index) + 1
print('Vocabulary Size: {}'.format(num_words))

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
not_found = []
for word, i in word_index.items():
    # The lookup uses the UTF-8 encoded (bytes) form of the word.
    # TODO: also try the capitalised form of the word (for word2vec).
    embedding_vector = pretrained_embeddings.get(word.encode())
    if embedding_vector is None:
        # No pretrained vector: the row stays all-zeros and the word is recorded.
        # TODO: handle words missing from pretrained_embeddings (e.g. words containing ').
        not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector

Vocabulary Size: 7506


In [55]:
# One-hot encode the integer targets into num_words classes.
# NOTE: y is overwritten with its own encoded form, so this cell is not
# idempotent — rebuild the integer y before running it a second time.
y = to_categorical(y, num_classes=num_words)

In [56]:
# Wrap the pretrained vectors in an Embedding layer. The weights start from
# embedding_matrix, and trainable=False keeps them fixed during fitting.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [57]:
# Next-word model: frozen pretrained embeddings -> bidirectional LSTM ->
# softmax over the whole vocabulary.
model = Sequential()
model.add(embedding_layer)
# 50 units per direction. CuDNNLSTM is the GPU-only LSTM implementation, so
# this cell needs a CUDA-capable device.
model.add(Bidirectional(CuDNNLSTM(50)))
model.add(Dense(num_words, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1, 300)            2251800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100)               140800    
_________________________________________________________________
dense_4 (Dense)              (None, 7506)              758106    
Total params: 3,150,706
Trainable params: 898,906
Non-trainable params: 2,251,800
_________________________________________________________________
None


In [58]:
# Compile for multi-class next-word prediction (one-hot targets, softmax output).
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the network. The VALIDATION_SPLIT fraction from the config cell is now
# actually applied, so each epoch also reports val_loss / val_accuracy on
# held-out data instead of training metrics only.
# Keras takes the validation samples from the end of X / y, before shuffling.
model.fit(X, y, epochs=1000, verbose=2, validation_split=VALIDATION_SPLIT)

Epoch 1/1000


KeyboardInterrupt: 

In [None]:
# Check which words of our corpus are missing from the pretrained embeddings
def check_our_corpus(our_words, embeddings=None):
    """Return the words that have no entry in the embedding table.

    Parameters
    ----------
    our_words : iterable of str
        Words to look up.
    embeddings : container, optional
        Table to search; membership is tested with the UTF-8 encoded (bytes)
        form of each word. Defaults to the notebook-level
        ``pretrained_embeddings``.

    Returns
    -------
    list of str
        The words of ``our_words`` whose encoded form is not in the table,
        in their original order.
    """
    if embeddings is None:
        # The original referenced `embeddings_index`, a name that is not
        # defined in this notebook; the loaded table is `pretrained_embeddings`.
        embeddings = pretrained_embeddings
    return [word for word in our_words if word.encode() not in embeddings]