In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [13]:
import tools.processing as pre
import tools.embedding as emb

## Create embedding for words

In [2]:
text = pre.get_text("data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt")

vocab = pre.Vocabulary(text)
index2word_map = vocab.index2word_map
word2index_map = vocab._dict
vocabulary_size = len(index2word_map)

# create embedding for words
word_embedding_matrix = emb.get_embedding_matrix(word2index_map, vocabulary_size)
print(word_embedding_matrix)

[[ 0.0848636   0.04045071  0.07530911 ... -0.00327249 -0.03926468
   0.00752516]
 [ 0.00153569  0.12661901  0.09518377 ...  0.01355811  0.01934354
  -0.0286953 ]
 [ 0.00486193  0.14006345  0.08130748 ...  0.00033818  0.03776005
  -0.02884221]
 ...
 [ 0.09170229 -0.04923349 -0.01742387 ...  0.08653232 -0.0250071
  -0.02735833]
 [ 0.0044884   0.10705608  0.00423622 ...  0.13541482 -0.01943643
  -0.10687874]
 [ 0.03727657 -0.02300504  0.08994251 ...  0.02934486 -0.05716072
  -0.04860425]]


In [4]:
len(word_embedding_matrix)

8471

## Create embedding for phonems

In [5]:
phonems = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")

vocab_phonem = pre.Vocabulary(phonems)
index2phonem_map = vocab_phonem.index2word_map
phonem2index_map = vocab_phonem._dict
vocabulary_size_phonem = len(index2phonem_map)

# create embedding for phonems
phonem_embedding_matrix = emb.get_phonem_embedding_matrix(phonem2index_map, vocabulary_size_phonem)
print(phonem_embedding_matrix)

[[-0.83106965 -0.53812933  0.14049865]
 [-0.57452571 -0.45615974  0.67958683]
 [-0.85238743 -0.27057332 -0.4474659 ]
 [-0.89024484 -0.43297064  0.14142321]
 [-0.889552   -0.42086163  0.17768735]
 [-0.84767091  0.20958452 -0.48736873]
 [-0.80071777 -0.59023607 -0.10233522]
 [-0.8041234  -0.43067229  0.40976447]
 [-0.85063279 -0.25427106 -0.46018487]
 [-0.83106846 -0.49368501 -0.25612572]
 [-0.77628517 -0.57696062 -0.25396407]
 [-0.85525501  0.34369221  0.38783291]
 [-0.72286534 -0.66972053  0.17011791]
 [-0.90681714  0.07853168 -0.41414431]
 [-0.84486431 -0.38346329 -0.37304166]
 [-0.73923445 -0.57153291 -0.35620561]
 [-0.94168699  0.08752931  0.32490653]
 [ 0.91624618 -0.1400506  -0.37533814]
 [-0.88760549 -0.45781818 -0.05058585]
 [-0.95192575  0.20271704 -0.22965866]
 [-0.88998002  0.08347418 -0.44829413]
 [-0.1288805  -0.28575739 -0.94959605]
 [-0.87844598 -0.46678743 -0.10218589]
 [-0.99973291  0.0226963   0.00435353]
 [-0.92469871  0.07278427  0.37367749]
 [-0.92872733  0.10954829

In [6]:
len(phonem_embedding_matrix)

85

## Data Preparation: Split sentences of text

In [8]:
text_sentences = text.split(";")

In [12]:
text_sentences[22:]

[' yeah you know what this is nyc ',
 ' the triumphant return rakim allah ',
 ' rakim ',
 ' remember being introduced to rapping your first rhyme ',
 ' it is pivotal like a fiends first high ',
 ' hustlers first ride players first dime ',
 ' unforgettable like a ladies first time ',
 ' the world been waiting for euphoria ',
 ' the true for with a sixth sense new nausea ',
 ' true talk to the bomb so you forced me to go hard ',
 ' like the streets is the bomb ',
 ' and new york be the birth place of hip hop ',
 ' get it it is the model where swagger was born ',
 ' we set trends to follow ',
 ' the home of the god the go letters and my flow ',
 ' is not a city been so prolific since cairo ',
 ' hieroglyphic graffiti paint of view ',
 ' slang too just gaming a few ',
 ' still claiming through that thing that you do ',
 ' til you famous and just love for the game and this bangers for you ',
 ' euphoria euphoria natural high ',
 ' feeling good off this hip hop i supply ',
 ' time goes by st

### TODO: Get seqlens of rap lyrics and phonem

## Tensorflow Model

In [20]:
PRE_TRAINED = True
batch_size = 128

embedding_dimension = 64
embedding_dimension_phonem = 3

num_classes = len(word_embedding_matrix)
num_classes_phonem = len(phonem_embedding_matrix)

hidden_layer_size = 32
hidden_layer_size_phonem = 16

num_LSTM_layers = 1
num_LSTM_layers_phonem = 1

times_steps = 6        # TODO max seqlen of rap text sentence
times_steps_phonem = 6 # TODO max seqlen of phonem sentence

## Create input placeholders

In [18]:
# create input placeholders
_inputs = tf.placeholder(tf.int32, shape=[batch_size, times_steps])
embedding_placeholder = tf.placeholder(tf.float32, [vocabulary_size, emb.GLOVE_SIZE])
_labels = tf.placeholder(tf.float32, shape=[batch_size, num_classes])
_seqlens = tf.placeholder(tf.int32, shape=[batch_size])

_inputs_phonem = tf.placeholder(tf.int32, shape=[batch_size, times_steps_phonem])
embedding_placeholder_phonem = tf.placeholder(tf.float32, [vocabulary_size_phonem, emb.PHONEM_SIZE])
_labels_phonem = tf.placeholder(tf.float32, shape=[batch_size, num_classes_phonem]) # TODO do we need this? label should be only one?
_seqlens_phonem = tf.placeholder(tf.int32, shape=[batch_size])

### Word embedding

In [17]:
with tf.name_scope("embeddings"):
    if PRE_TRAINED:
        embeddings = tf.Variable(tf.constant(0.0, shape=[vocabulary_size, emb.GLOVE_SIZE]), trainable=True)
        # If using pretrained embeddings, assign them to the embeddings variable
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings, _inputs)
    else:
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_dimension],
                                                   -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, _inputs)

### Phonem embedding

In [19]:
with tf.name_scope("embeddings_phonem"):
    if PRE_TRAINED:
        embeddings_phonem = tf.Variable(tf.constant(0.0, shape=[vocabulary_size_phonem, emb.PHONEM_SIZE]), trainable=True)
        # If using pretrained embeddings, assign them to the embeddings variable
        embedding_init_phonem = embeddings_phonem.assign(embedding_placeholder_phonem)
        embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)
    else:
        embeddings_phonem = tf.Variable(tf.random_uniform([vocabulary_size_phonem, embedding_dimension_phonem],
                                                   -1.0, 1.0))
        embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)

### LSTM words

In [21]:
with tf.variable_scope("lstm"):
    # Define a function that gives the output in the right shape
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)
    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],
                                       state_is_tuple=True)
    outputs, states = tf.nn.dynamic_rnn(cell, embed,
                                        sequence_length=_seqlens,
                                        dtype=tf.float32)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


### LSTM phonem

In [22]:
with tf.variable_scope("lstm_phonem"):
    # Define a function that gives the output in the right shape
    def lstm_cell_phonem():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size_phonem, forget_bias=1.0)
    cell_phonem = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell_phonem() for _ in range(num_LSTM_layers_phonem)],
                                       state_is_tuple=True)
    outputs_phonem, states_phonem = tf.nn.dynamic_rnn(cell_phonem, embed_phonem,
                                        sequence_length=_seqlens_phonem,
                                        dtype=tf.float32)