In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr

## Create embedding for words

In [3]:
text = pre.get_text("data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt")[:10000]

vocab = pre.Vocabulary(text)
word2index = vocab.word2index
index2word = vocab.index2word
VOCAB_SIZE = len(index2word)

# create embedding for words
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 0.03209195  0.06766261  0.04151145 ... -0.046448   -0.03842104
  -0.04822   ]
 [ 0.00029647  0.01313631 -0.0196403  ... -0.0001816   0.00719928
   0.00517092]
 [ 0.00825335  0.00466938 -0.03945393 ... -0.0567189  -0.02748226
   0.0531178 ]
 ...
 [-0.0196528   0.05993839 -0.09010126 ...  0.00900052  0.04102224
  -0.00091614]
 [-0.00165838 -0.00060007 -0.0861928  ...  0.03562126 -0.0017157
  -0.04269713]
 [ 0.03778261 -0.01219655 -0.05140489 ...  0.00514077 -0.03030926
  -0.04753494]]


In [4]:
print(VOCAB_SIZE)

595


## Create embedding for phonems

In [None]:
phonems = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")

vocab_phonem = pre.Vocabulary(phonems)
phonem2index = vocab_phonem.word2index
index2phonem = vocab_phonem.index2word
VOCAB_SIZE_PHONEM = len(index2phonem)

# create embedding for phonems
phonem_embedding_matrix = emb.get_phonem_embedding_matrix(phonem2index, VOCAB_SIZE_PHONEM)
print(phonem_embedding_matrix)

In [None]:
print(VOCAB_SIZE_PHONEM)

## Data Preparation: Split sentences of text

In [5]:
word_tokens = text.split()

TIMESTEPS = 16

str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

print( list( zip(str_data, str_labels) )[:5] )

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


In [6]:
encoder = tr.OneHotWordEncoder("1-Hot-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder("1-Hot-Word-Decoding", index2word, temperature=0.8)

data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

del str_labels
del str_data
del word_tokens

In [7]:
print(data.shape)
print(labels.shape)

(2084, 16, 595)
(2084, 595)


### TODO: Get seqlens of rap lyrics and phonem

## Tensorflow Model

In [8]:
PRE_TRAINED = True
EPOCHS = 10
BATCH_SIZE = 128

embedding_dimension = 64
embedding_dimension_phonem = 3

hidden_layer_size = 32
hidden_layer_size_phonem = 16

num_LSTM_layers = 4
num_LSTM_layers_phonem = 4

#times_steps = 16        # TODO max seqlen of rap text sentence
times_steps_phonem = 16 # TODO max seqlen of phonem sentence

## Create input placeholders

In [9]:
# create input placeholders
'''
_inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps])
embedding_placeholder = tf.placeholder(tf.float32, [VOCAB_SIZE, emb.GLOVE_SIZE])
_labels = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE])
_seqlens = tf.placeholder(tf.int32, shape=[BATCH_SIZE])

_inputs_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps_phonem])
embedding_placeholder_phonem = tf.placeholder(tf.float32, [VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE])
_labels_phonem = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE_PHONEM]) # TODO do we need this? label should be only one?
_seqlens_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
'''

'\n_inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps])\nembedding_placeholder = tf.placeholder(tf.float32, [VOCAB_SIZE, emb.GLOVE_SIZE])\n_labels = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE])\n_seqlens = tf.placeholder(tf.int32, shape=[BATCH_SIZE])\n\n_inputs_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps_phonem])\nembedding_placeholder_phonem = tf.placeholder(tf.float32, [VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE])\n_labels_phonem = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE_PHONEM]) # TODO do we need this? label should be only one?\n_seqlens_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE])\n'

### Word embedding

In [None]:
with tf.name_scope("embeddings"):
    if PRE_TRAINED:
        embeddings = tf.Variable(tf.constant(0.0, shape=[VOCAB_SIZE, emb.GLOVE_SIZE]), trainable=True)
        # If using pretrained embeddings, assign them to the embeddings variable
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings, _inputs)
    else:
        embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, embedding_dimension],
                                                   -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

### Phonem embedding

In [None]:
with tf.name_scope("embeddings_phonem"):
    if PRE_TRAINED:
        embeddings_phonem = tf.Variable(tf.constant(0.0, shape=[VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE]), trainable=True)
        # If using pretrained embeddings, assign them to the embeddings variable
        embedding_init_phonem = embeddings_phonem.assign(embedding_placeholder_phonem)
        embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)
    else:
        embeddings_phonem = tf.Variable(tf.random_uniform([VOCAB_SIZE_PHONEM, embedding_dimension_phonem],
                                                   -1.0, 1.0))
        embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)

### LSTM words

In [9]:
rnn_words = nn.MultiLayerRNN_v2(name="lstm-words")
rnn_words.build(num_LSTM_layers, hidden_layer_size, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0, embedding_dim=emb.GLOVE_SIZE)

sampler = lambda trainable, seed_text: tr.sample(seed_text, trainable, encoder, decoder, length=20)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


In [10]:
tr.train_model(rnn_words, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE, log_dir="logs/10-test-glove")

"""
with tf.variable_scope("lstm"):
    # Define a function that gives the output in the right shape
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)
    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],
                                       state_is_tuple=True)
    outputs, states = tf.nn.dynamic_rnn(cell, embed,
                                        sequence_length=_seqlens,
                                        dtype=tf.float32)
"""

Building model from scratch! 
 Saving into: '../logs/10-test-glove'


ValueError: Cannot feed value of shape (2084, 16, 595) for Tensor 'data:0', which has shape '(?, 16)'

### LSTM phonem

In [None]:
with tf.variable_scope("lstm_phonem"):
    # Define a function that gives the output in the right shape
    def lstm_cell_phonem():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size_phonem, forget_bias=1.0)
    cell_phonem = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell_phonem() for _ in range(num_LSTM_layers_phonem)],
                                       state_is_tuple=True)
    outputs_phonem, states_phonem = tf.nn.dynamic_rnn(cell_phonem, embed_phonem,
                                        sequence_length=_seqlens_phonem,
                                        dtype=tf.float32)