In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr
import tools.phonetics as phon

## Create embedding for words

In [3]:
# Load the lyric corpus and keep only its first 250 elements
# (characters, assuming get_text returns a str -- TODO confirm).
corpus_path = "data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt"
text = pre.get_text(corpus_path)[:250]

# Build the word vocabulary and pull out its two lookup tables.
vocab = pre.Vocabulary(text)
word2index, index2word = vocab.word2index, vocab.index2word
VOCAB_SIZE = len(index2word)

# Embedding matrix for the word vocabulary.
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 0.03209195  0.06766261  0.04151145 ... -0.046448   -0.03842104
  -0.04822   ]
 [ 0.01169672  0.02784324 -0.02134047 ... -0.00254565  0.01072712
   0.00599443]
 [ 0.00825335  0.00466938 -0.03945393 ... -0.0567189  -0.02748226
   0.0531178 ]
 ...
 [-0.04762312  0.04288607 -0.09683317 ...  0.01158441  0.01227157
   0.01355943]
 [-0.02130833  0.059227   -0.10000064 ... -0.01137079  0.09158197
   0.0108908 ]
 [-0.0196528   0.05993839 -0.09010126 ...  0.00900052  0.04102224
  -0.00091614]]


In [4]:
# Sanity check: number of entries in index2word (the word vocabulary size).
print(VOCAB_SIZE)

35


## Create embedding for phonemes

In [5]:
# Load the full phoneme corpus.
phonem_path = "data/phonem-rap-lyrics/phonem_all.txt"
phonems = pre.get_text(phonem_path)

# Build the phoneme vocabulary and pull out its two lookup tables.
vocab_phonem = pre.Vocabulary(phonems)
phonem2index, index2phonem = vocab_phonem.word2index, vocab_phonem.index2word
VOCAB_SIZE_PHONEM = len(index2phonem)

# Embedding matrix for the phoneme vocabulary.
phonem_embedding_matrix = emb.get_phonem_embedding_matrix(
    phonem2index,
    VOCAB_SIZE_PHONEM,
)
print(phonem_embedding_matrix)

[[-0.83106965 -0.53812933  0.14049865]
 [-0.57452571 -0.45615974  0.67958683]
 [-0.85238743 -0.27057332 -0.4474659 ]
 [-0.89024484 -0.43297064  0.14142321]
 [-0.889552   -0.42086163  0.17768735]
 [-0.84767091  0.20958452 -0.48736873]
 [-0.80071777 -0.59023607 -0.10233522]
 [-0.8041234  -0.43067229  0.40976447]
 [-0.85063279 -0.25427106 -0.46018487]
 [-0.83106846 -0.49368501 -0.25612572]
 [-0.77628517 -0.57696062 -0.25396407]
 [-0.85525501  0.34369221  0.38783291]
 [-0.72286534 -0.66972053  0.17011791]
 [-0.90681714  0.07853168 -0.41414431]
 [-0.84486431 -0.38346329 -0.37304166]
 [-0.73923445 -0.57153291 -0.35620561]
 [-0.94168699  0.08752931  0.32490653]
 [ 0.91624618 -0.1400506  -0.37533814]
 [-0.88760549 -0.45781818 -0.05058585]
 [-0.95192575  0.20271704 -0.22965866]
 [-0.88998002  0.08347418 -0.44829413]
 [-0.1288805  -0.28575739 -0.94959605]
 [-0.87844598 -0.46678743 -0.10218589]
 [-0.99973291  0.0226963   0.00435353]
 [-0.92469871  0.07278427  0.37367749]
 [-0.92872733  0.10954829

In [6]:
# Sanity check: number of entries in index2phonem (the phoneme vocabulary size).
print(VOCAB_SIZE_PHONEM)

85


## Data Preparation: Split sentences of text into data and label

In [7]:
# Length of each input window handed to create_data_label_pairs.
TIMESTEPS = 16

# Whitespace-tokenise the corpus, then cut it into (window, next-word) pairs.
word_tokens = text.split()
str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

# Preview the first five (window, label) pairs.
pairs_preview = list(zip(str_data, str_labels))[:5]
print(pairs_preview)

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


## Data Preparation: Convert the word sequences into phonemes and keep the labels

### Update arpabet dictionary with unknown words

In [8]:
# Read the unknown-words file and turn it into a dict
# (presumably word -> pronunciation; verify against tools.phonetics).
unknown_words_path = "data/cleaned-rap-lyrics/unknown_words_dict.txt"
unknown_list = pre.get_text(unknown_words_path)
unknown_dict = phon.create_unknown_dict_from_text(unknown_list)

# Hand the extra entries to the arpabet dictionary, then report how many
# words phon.get_unknown_words() still returns.
phon.update_arpabet(unknown_dict)
remaining_unknown = phon.get_unknown_words()
print(len(remaining_unknown))

0


In [9]:
# For every word window, concatenate the phonemes of each word into one flat
# list, skipping the ";" separator token.
phonem_data = [
    [
        phonem
        for word in sent
        if word != ";"
        for phonem in phon.get_phonem(word)
    ]
    for sent in str_data
]

# Preview the first (phoneme sequence, label) pair.
print( list( zip(phonem_data, str_labels) )[:1] )

[(['Y', 'AE1', 'Y', 'UW1', 'N', 'OW1', 'W', 'AH1', 'T', 'DH', 'IH1', 'S', 'IH1', 'Z', 'N', 'IH', 'K', 'DH', 'AH0', 'T', 'R', 'AY0', 'AH1', 'M', 'F', 'AH0', 'N', 'T', 'R', 'IH0', 'T', 'ER1', 'N', 'R', 'AE', 'K', 'AH', 'M', 'AA1', 'L', 'AH0', 'R', 'AE', 'K', 'AH', 'M'], 'remember')]


In [10]:
# --- words: index encoder for inputs/labels, one-hot decoder for sampling ---
encoder = tr.IndexWordEncoder("Index-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder(
    "1-Hot-Word-Decoding",
    index2word,
    temperature=0.8,
)

# --- phonemes: index encoder only ---
encoder_phonem = tr.IndexWordEncoder("Index-Phonem-Encoding", phonem2index)

# Encode the word windows and their labels with the word encoder ...
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)
# ... and the phoneme sequences with the phoneme encoder.
data_phonem = encoder_phonem.encode(phonem_data)

### Delete intermediate variables to free memory

In [11]:
# The string-level intermediates are no longer needed once everything is
# encoded; drop the references in one statement.
del str_labels, str_data, phonem_data, word_tokens

In [11]:
# Report the shapes of the encoded word inputs, labels and phoneme inputs.
for encoded in (data, labels, data_phonem):
    print(encoded.shape)

# choose timesteps according to length of padded phoneme array
TIMESTEPS_PHONEM = len(data_phonem[0])

(31, 16)
(31, 35)
(31, 66)


### TODO: Get sequence lengths (seqlens) of rap lyrics and phonemes

## Tensorflow Model

In [16]:
# Hyper-parameters for the TensorFlow model.

# PRE_TRAINED selects the branch taken in the embedding cells below;
# EPOCHS and BATCH_SIZE are forwarded to tr.train_model.
PRE_TRAINED = True
EPOCHS = 10
BATCH_SIZE = 128

# Embedding widths; these are only read in the non-pre-trained branches of
# the embedding cells.
embedding_dimension = 64
embedding_dimension_phonem = 3

# Hidden sizes: hidden_layer_size is passed to rnn_words.build,
# hidden_layer_size_phonem is also used by the phoneme LSTM cell.
hidden_layer_size = 32
hidden_layer_size_phonem = 16

# Number of stacked LSTM layers for the word and phoneme networks.
num_LSTM_layers = 4
num_LSTM_layers_phonem = 4

## Create input placeholders

### Word embedding

In [None]:
# Word embedding table plus the lookup op for the word inputs.
# NOTE(review): `_inputs` and `embedding_placeholder` are not defined in any
# visible cell -- confirm they are created before this cell runs.
with tf.name_scope("embeddings"):
    if PRE_TRAINED:
        # Zero-initialised, trainable table; the pre-trained vectors are
        # loaded by running `embedding_init` with the placeholder fed.
        embeddings = tf.Variable(
            tf.constant(0.0, shape=[VOCAB_SIZE, emb.GLOVE_SIZE]),
            trainable=True,
        )
        embedding_init = embeddings.assign(embedding_placeholder)
    else:
        # No pre-trained vectors: start from uniform noise in [-1, 1).
        embeddings = tf.Variable(
            tf.random_uniform([VOCAB_SIZE, embedding_dimension], -1.0, 1.0)
        )

    # Both branches end with the same lookup.
    embed = tf.nn.embedding_lookup(embeddings, _inputs)

### Phoneme embedding

In [None]:
# Phoneme embedding table plus the lookup op for the phoneme inputs.
# NOTE(review): `_inputs_phonem` and `embedding_placeholder_phonem` are not
# defined in any visible cell -- confirm they are created before this runs.
with tf.name_scope("embeddings_phonem"):
    if PRE_TRAINED:
        # Zero-initialised, trainable table; the pre-trained vectors are
        # loaded by running `embedding_init_phonem` with the placeholder fed.
        embeddings_phonem = tf.Variable(
            tf.constant(0.0, shape=[VOCAB_SIZE_PHONEM, emb.PHONEM_SIZE]),
            trainable=True,
        )
        embedding_init_phonem = embeddings_phonem.assign(embedding_placeholder_phonem)
    else:
        # No pre-trained vectors: start from uniform noise in [-1, 1).
        embeddings_phonem = tf.Variable(
            tf.random_uniform(
                [VOCAB_SIZE_PHONEM, embedding_dimension_phonem], -1.0, 1.0
            )
        )

    # Both branches end with the same lookup.
    embed_phonem = tf.nn.embedding_lookup(embeddings_phonem, _inputs_phonem)

### LSTM words

In [20]:
# Fixed seed passed to tr.sample by the sampler below.
seed_text = "yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;"


def sampler(trainable, _):
    """Sample 20 tokens from `trainable`, always starting at the global seed_text.

    The second argument is accepted but ignored.
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)

In [22]:
# Build the combined word/phoneme network, then hand it to tr.train_model.
# With retrain=False the recorded output shows an existing model being
# restored from the log directory.
rnn_words = nn.MultiLayerRNN_2_embeddings(name="lstm-words-phonems")

# NOTE(review): the original call did not parse -- the comma after
# `l2_reg=0.0` was missing and positional arguments followed keyword
# arguments.  The arguments are now passed positionally in the order they were
# written, which assumes that order mirrors the signature of `build`;
# confirm against tools/architectures.py.
rnn_words.build(
    num_LSTM_layers,
    hidden_layer_size,
    VOCAB_SIZE,
    TIMESTEPS,
    emb.GLOVE_SIZE,            # was written as embedding_words=...
    0.0,                       # was written as l2_reg=...
    num_LSTM_layers_phonem,
    hidden_layer_size_phonem,
    VOCAB_SIZE_PHONEM,
    TIMESTEPS_PHONEM,
    embedding_phonems=emb.PHONEM_SIZE,
)

# NOTE(review): only the word-level `data` is passed here; `data_phonem` is
# not -- verify that the phoneme inputs reach the model some other way.
tr.train_model(
    rnn_words,
    data,
    labels,
    sampler,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    log_dir="logs/10-test-glove-2",
    retrain=False,
)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from logs/10-test-glove-2/model
Restoring an old model from 'logs/10-test-glove-2'


In [26]:
# Change the decoder temperature for this sampling run.
decoder.temperature = 0.7


def sampler(trainable, seed_text):
    """Sample 50 tokens from `trainable`, starting at the given `seed_text`."""
    return tr.sample( seed_text, trainable, encoder, decoder, length=50)


sampler(rnn_words, "killing people left and right \n use a gun cool homie \n that is right")

------Sampling----------
seed: 
killing people left and right 
 use a gun cool homie 
 that is right
-
result: 
killing people left and right 
 use a gun cool homie 
 that is right for effect ; scenery and mic words shallow ; giving to perfection to stomp a mic of it ; i stash you am i you you stop will not for this ; i know to heard to want to we came the crowd ; the world with because ya you


In [14]:
# Disabled word-level LSTM graph, kept as a bare string literal so it is never
# executed; evaluating the cell only echoes the string as its output.
"""
with tf.variable_scope("lstm"):
    # Define a function that gives the output in the right shape
    def lstm_cell():
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)
    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],
                                       state_is_tuple=True)
    outputs, states = tf.nn.dynamic_rnn(cell, embed,
                                        sequence_length=_seqlens,
                                        dtype=tf.float32)
"""

'\nwith tf.variable_scope("lstm"):\n    # Define a function that gives the output in the right shape\n    def lstm_cell():\n        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size, forget_bias=1.0)\n    cell = tf.contrib.rnn.MultiRNNCell(cells=[lstm_cell() for _ in range(num_LSTM_layers)],\n                                       state_is_tuple=True)\n    outputs, states = tf.nn.dynamic_rnn(cell, embed,\n                                        sequence_length=_seqlens,\n                                        dtype=tf.float32)\n'

### LSTM phonemes

In [None]:
# Stacked LSTM over the embedded phoneme sequence.
# NOTE(review): `_seqlens_phonem` is not defined in any visible cell --
# confirm it is created before this cell runs.
with tf.variable_scope("lstm_phonem"):

    def lstm_cell_phonem():
        """Return a fresh LSTMCell with hidden_layer_size_phonem units."""
        return tf.nn.rnn_cell.LSTMCell(hidden_layer_size_phonem, forget_bias=1.0)

    # One independent cell per layer, stacked into a single multi-layer cell.
    stacked_cells = [lstm_cell_phonem() for _ in range(num_LSTM_layers_phonem)]
    cell_phonem = tf.contrib.rnn.MultiRNNCell(
        cells=stacked_cells,
        state_is_tuple=True,
    )

    # Unroll the stack over the embedded phonemes.
    outputs_phonem, states_phonem = tf.nn.dynamic_rnn(
        cell_phonem,
        embed_phonem,
        sequence_length=_seqlens_phonem,
        dtype=tf.float32,
    )