In [12]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr

## Create embedding for words

In [11]:
# Load the cleaned lyrics corpus and keep only the first 250000 elements of
# whatever `pre.get_text` returns.
text = pre.get_text(
    "data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt"
)[:250000]

# Build the word vocabulary and expose its two lookup tables.
vocab = pre.Vocabulary(text)
word2index, index2word = vocab.word2index, vocab.index2word
VOCAB_SIZE = len(index2word)

# Embedding matrix for the word vocabulary, printed for a quick visual check.
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 0.00153569  0.12661901  0.09518377 ...  0.01355811  0.01934354
  -0.0286953 ]
 [ 0.03209195  0.06766261  0.04151145 ... -0.046448   -0.03842104
  -0.04822   ]
 [-0.00257177  0.00306226 -0.00831606 ...  0.00347133  0.00291661
  -0.00081346]
 ...
 [-0.02687368  0.09554347 -0.04599568 ... -0.04568946  0.02053585
  -0.02226466]
 [-0.00257177  0.00306226 -0.00831606 ...  0.00347133  0.00291661
  -0.00081346]
 [ 0.03778261 -0.01219655 -0.05140489 ...  0.00514077 -0.03030926
  -0.04753494]]


In [13]:
# Size of the word vocabulary, i.e. len(index2word) computed in the cell above.
print(VOCAB_SIZE)

4231


## Create embedding for phonemes

In [14]:
# Load the phoneme transcription of the lyrics (no truncation here).
phonems = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")

# Build the phoneme vocabulary; it reuses the same Vocabulary class as the
# words, hence the `word2index` / `index2word` attribute names.
vocab_phonem = pre.Vocabulary(phonems)
phonem2index, index2phonem = vocab_phonem.word2index, vocab_phonem.index2word
VOCAB_SIZE_PHONEM = len(index2phonem)

# Embedding matrix for the phoneme vocabulary, printed for a quick visual check.
phonem_embedding_matrix = emb.get_phonem_embedding_matrix(
    phonem2index, VOCAB_SIZE_PHONEM
)
print(phonem_embedding_matrix)

[[-0.83106965 -0.53812933  0.14049865]
 [-0.57452571 -0.45615974  0.67958683]
 [-0.85238743 -0.27057332 -0.4474659 ]
 [-0.89024484 -0.43297064  0.14142321]
 [-0.889552   -0.42086163  0.17768735]
 [-0.84767091  0.20958452 -0.48736873]
 [-0.80071777 -0.59023607 -0.10233522]
 [-0.8041234  -0.43067229  0.40976447]
 [-0.85063279 -0.25427106 -0.46018487]
 [-0.83106846 -0.49368501 -0.25612572]
 [-0.77628517 -0.57696062 -0.25396407]
 [-0.85525501  0.34369221  0.38783291]
 [-0.72286534 -0.66972053  0.17011791]
 [-0.90681714  0.07853168 -0.41414431]
 [-0.84486431 -0.38346329 -0.37304166]
 [-0.73923445 -0.57153291 -0.35620561]
 [-0.94168699  0.08752931  0.32490653]
 [ 0.91624618 -0.1400506  -0.37533814]
 [-0.88760549 -0.45781818 -0.05058585]
 [-0.95192575  0.20271704 -0.22965866]
 [-0.88998002  0.08347418 -0.44829413]
 [-0.1288805  -0.28575739 -0.94959605]
 [-0.87844598 -0.46678743 -0.10218589]
 [-0.99973291  0.0226963   0.00435353]
 [-0.92469871  0.07278427  0.37367749]
 [-0.92872733  0.10954829

In [15]:
# Size of the phoneme vocabulary, i.e. len(index2phonem) computed in the cell above.
print(VOCAB_SIZE_PHONEM)

85


## Data preparation: split the text into fixed-length word windows with a next-word label

In [16]:
# Number of tokens in every input window handed to the model.
TIMESTEPS = 16

# Whitespace-tokenise the corpus, then cut it into (window, label) pairs.
word_tokens = text.split()
str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

# Show the first five (window, label) pairs as a sanity check.
print(list(zip(str_data, str_labels))[:5])

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


In [17]:
# Encoder turns word strings into model inputs via `word2index`; the decoder
# maps model outputs back to words via `index2word` (temperature 0.8).
encoder = tr.IndexWordEncoder("Index-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder("1-Hot-Word-Decoding", index2word, temperature=0.8)

# Encode the string windows and their labels.
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

# Drop the string-level intermediates now that the encoded arrays exist.
# NOTE: this makes the cell non-idempotent; re-running it requires re-running
# the data-preparation cell first.
del str_labels, str_data, word_tokens

In [18]:
# Shapes of the encoded inputs and labels, one per line.
print(data.shape, labels.shape, sep="\n")

(54607, 16)
(54607, 4231)


### TODO: Get the sequence lengths (seqlens) of the rap lyrics and the phonemes

## TensorFlow Model

In [19]:
# --- Training schedule ------------------------------------------------------
PRE_TRAINED = True  # selects the pre-trained branch in the embedding cells
EPOCHS = 10
BATCH_SIZE = 128

# --- Word-level network -----------------------------------------------------
embedding_dimension = 64  # only read by the non-pre-trained embedding branch
hidden_layer_size = 32
num_LSTM_layers = 4
#times_steps = 16        # TODO max seqlen of rap text sentence

# --- Phoneme-level network --------------------------------------------------
embedding_dimension_phonem = 3  # only read by the non-pre-trained branch
hidden_layer_size_phonem = 16
num_LSTM_layers_phonem = 4
times_steps_phonem = 16  # TODO max seqlen of phoneme sentence

## Create input placeholders

In [None]:
# Input placeholders for the word and phoneme networks.
# NOTE(review): the definitions below are wrapped in a string literal, so none
# of them is actually created. The embedding cells and the phoneme LSTM cell
# further down still reference `_inputs`, `embedding_placeholder`,
# `_inputs_phonem`, `embedding_placeholder_phonem` and `_seqlens_phonem`, and
# `times_steps` is itself commented out in the hyperparameter cell.
'''
_inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps])
embedding_placeholder = tf.placeholder(tf.float32, [VOCAB_SIZE, emb.GLOVE_SIZE])
_labels = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE])
_seqlens = tf.placeholder(tf.int32, shape=[BATCH_SIZE])

_inputs_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE, times_steps_phonem])
embedding_placeholder_phonem = tf.placeholder(tf.float32, [VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE])
_labels_phonem = tf.placeholder(tf.float32, shape=[BATCH_SIZE, VOCAB_SIZE_PHONEM]) # TODO do we need this? label should be only one?
_seqlens_phonem = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
'''

### Word embedding

In [None]:
# Word embedding table plus the lookup that feeds the word network.
# NOTE(review): `_inputs` and `embedding_placeholder` are only defined inside
# the disabled placeholder string, so this cell fails on a fresh kernel until
# those placeholders are restored.
with tf.name_scope("embeddings"):
    if not PRE_TRAINED:
        # Train the table from scratch, starting from uniform noise in [-1, 1).
        embeddings = tf.Variable(
            tf.random_uniform([VOCAB_SIZE, embedding_dimension], -1.0, 1.0)
        )
    else:
        # Zero-filled table; running `embedding_init` copies in the values fed
        # through `embedding_placeholder`.
        embeddings = tf.Variable(
            tf.constant(0.0, shape=[VOCAB_SIZE, emb.GLOVE_SIZE]),
            trainable=True,
        )
        embedding_init = embeddings.assign(embedding_placeholder)

    # Same lookup in both modes.
    embed = tf.nn.embedding_lookup(embeddings, _inputs)

### Phoneme embedding

In [None]:
# Phoneme embedding table plus the lookup that feeds the phoneme network.
# NOTE(review): `_inputs_phonem` and `embedding_placeholder_phonem` are only
# defined inside the disabled placeholder string, so this cell fails on a
# fresh kernel until those placeholders are restored.
with tf.name_scope("embeddings_phonem"):
    if not PRE_TRAINED:
        # Train the table from scratch, starting from uniform noise in [-1, 1).
        embeddings_phonem = tf.Variable(
            tf.random_uniform(
                [VOCAB_SIZE_PHONEM, embedding_dimension_phonem], -1.0, 1.0
            )
        )
    else:
        # Zero-filled table; running `embedding_init_phonem` copies in the
        # values fed through `embedding_placeholder_phonem`.
        embeddings_phonem = tf.Variable(
            tf.constant(0.0, shape=[VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE]),
            trainable=True,
        )
        embedding_init_phonem = embeddings_phonem.assign(embedding_placeholder_phonem)

    # Same lookup in both modes.
    embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)

### LSTM words

In [20]:
# Prompt used to prime the model whenever a sample is generated.
seed_text = "yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;"


def sampler(trainable, _):
    """Sample from `trainable` with `tr.sample`, primed with `seed_text`.

    Uses the notebook-level `encoder` and `decoder` and requests `length=20`.
    The second positional argument is accepted but ignored.
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)

In [21]:
# Word-level recurrent model: construct it, build it, then train it.
rnn_words = nn.MultiLayerRNN_v2(name="lstm-words")
rnn_words.build(
    num_LSTM_layers,
    hidden_layer_size,
    VOCAB_SIZE,
    TIMESTEPS,
    l2_reg=0.0,
    embedding_dim=emb.GLOVE_SIZE,
)

# Train on the encoded windows; `sampler` is handed to the training routine and
# run artefacts go to `log_dir`.
tr.train_model(
    rnn_words,
    data,
    labels,
    sampler,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    log_dir="logs/10-test-glove-2",
    retrain=False,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Building model from scratch! 
 Saving into: 'logs/10-test-glove-2'


Epoch 1/10
Loss:    	 5.897898197174072
Accuracy:	 10.864907264709473
------Sampling----------
seed: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rappin

In [14]:
"""
with tf.variable_scope("lstm"):
    # Define a function that gives the output in the right shape
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)
    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],
                                       state_is_tuple=True)
    outputs, states = tf.nn.dynamic_rnn(cell, embed,
                                        sequence_length=_seqlens,
                                        dtype=tf.float32)
"""

'\nwith tf.variable_scope("lstm"):\n    # Define a function that gives the output in the right shape\n    def lstm_cell():\n        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)\n    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],\n                                       state_is_tuple=True)\n    outputs, states = tf.nn.dynamic_rnn(cell, embed,\n                                        sequence_length=_seqlens,\n                                        dtype=tf.float32)\n'

### LSTM for phonemes

In [None]:
def lstm_cell_phonem():
    """Return a new LSTM cell with `hidden_layer_size_phonem` units."""
    return tf.nn.rnn_cell.LSTMCell(hidden_layer_size_phonem, forget_bias=1.0)


# Stack `num_LSTM_layers_phonem` cells and unroll them over the embedded
# phoneme inputs.
# NOTE(review): `_seqlens_phonem` is only defined inside the disabled
# placeholder string, so this cell fails on a fresh kernel until it is restored.
with tf.variable_scope("lstm_phonem"):
    cell_phonem = tf.contrib.rnn.MultiRNNCell(
        cells=[lstm_cell_phonem() for _ in range(num_LSTM_layers_phonem)],
        state_is_tuple=True,
    )
    outputs_phonem, states_phonem = tf.nn.dynamic_rnn(
        cell_phonem,
        embed_phonem,
        sequence_length=_seqlens_phonem,
        dtype=tf.float32,
    )