# Combining GloVe and phonetic embeddings to get better accuracy in predicting new rap lyrics
## The idea is to combine our previous results with the phonetics of each word <br/> to get more detailed information about the structure of sentences that rhyme. 

In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr
import tools.phonetics as phon

## Create embedding for rap lyrics

We are only using part of the lyrics (the first 25000 characters, see the `[:25000]` slice below) because of memory issues

In [16]:
# Corpus location and the length of the prefix that is kept from it.
LYRICS_PATH = "data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt"
MAX_TEXT_CHARS = 25000

# Only a prefix of the loaded lyrics is used. The text is tokenised later with
# text.split(), so this slice counts characters rather than words.
text = pre.get_text(LYRICS_PATH)[:MAX_TEXT_CHARS]

# Word <-> index lookup tables built from the truncated text.
vocab = pre.Vocabulary(text)
word2index, index2word = vocab.word2index, vocab.index2word
VOCAB_SIZE = len(index2word)

# Embedding matrix for the words of the vocabulary.
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 3.20919529e-02  6.76626116e-02  4.15114537e-02 ... -4.64479961e-02
  -3.84210385e-02 -4.82199974e-02]
 [-2.10926658e-03  1.25741512e-02 -1.79006178e-02 ... -2.81850225e-05
   5.56626776e-03  3.17836367e-03]
 [ 8.25334620e-03  4.66938363e-03 -3.94539274e-02 ... -5.67189045e-02
  -2.74822619e-02  5.31177968e-02]
 ...
 [-3.14446986e-02 -1.60996541e-02 -1.04854435e-01 ... -3.80562693e-02
   1.62324663e-02  1.56729072e-02]
 [-1.65838236e-03 -6.00066036e-04 -8.61928016e-02 ...  3.56212556e-02
  -1.71570492e-03 -4.26971316e-02]
 [ 3.77826095e-02 -1.21965548e-02 -5.14048897e-02 ...  5.14076790e-03
  -3.03092636e-02 -4.75349426e-02]]


In [4]:
# Sanity check: number of entries in index2word (the word vocabulary).
print(VOCAB_SIZE)

35


## Create embedding for phonemes

In [5]:
# Location of the phoneme corpus.
PHONEM_PATH = "data/phonem-rap-lyrics/phonem_all.txt"

phonems = pre.get_text(PHONEM_PATH)

# Phoneme <-> index lookup tables, built with the same Vocabulary class as the words.
vocab_phonem = pre.Vocabulary(phonems)
phonem2index, index2phonem = vocab_phonem.word2index, vocab_phonem.index2word
VOCAB_SIZE_PHONEM = len(index2phonem)

# Embedding matrix for the phonemes of the vocabulary.
phonem_embedding_matrix = emb.get_phonem_embedding_matrix(phonem2index, VOCAB_SIZE_PHONEM)
print(phonem_embedding_matrix)

[[-0.83106965 -0.53812933  0.14049865]
 [-0.57452571 -0.45615974  0.67958683]
 [-0.85238743 -0.27057332 -0.4474659 ]
 [-0.89024484 -0.43297064  0.14142321]
 [-0.889552   -0.42086163  0.17768735]
 [-0.84767091  0.20958452 -0.48736873]
 [-0.80071777 -0.59023607 -0.10233522]
 [-0.8041234  -0.43067229  0.40976447]
 [-0.85063279 -0.25427106 -0.46018487]
 [-0.83106846 -0.49368501 -0.25612572]
 [-0.77628517 -0.57696062 -0.25396407]
 [-0.85525501  0.34369221  0.38783291]
 [-0.72286534 -0.66972053  0.17011791]
 [-0.90681714  0.07853168 -0.41414431]
 [-0.84486431 -0.38346329 -0.37304166]
 [-0.73923445 -0.57153291 -0.35620561]
 [-0.94168699  0.08752931  0.32490653]
 [ 0.91624618 -0.1400506  -0.37533814]
 [-0.88760549 -0.45781818 -0.05058585]
 [-0.95192575  0.20271704 -0.22965866]
 [-0.88998002  0.08347418 -0.44829413]
 [-0.1288805  -0.28575739 -0.94959605]
 [-0.87844598 -0.46678743 -0.10218589]
 [-0.99973291  0.0226963   0.00435353]
 [-0.92469871  0.07278427  0.37367749]
 [-0.92872733  0.10954829

In [6]:
# Sanity check: number of entries in index2phonem (the phoneme vocabulary).
print(VOCAB_SIZE_PHONEM)

85


## Data Preparation: Split sentences of rap lyrics into data and label

In [7]:
# Window length handed to create_data_label_pairs.
TIMESTEPS = 16

# Whitespace tokenisation of the lyrics.
word_tokens = text.split()

# Split the token stream into input windows (str_data) and their labels (str_labels).
str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

# Preview the first five (window, label) pairs.
first_pairs = list(zip(str_data, str_labels))[:5]
print(first_pairs)

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


## Data Preparation: Convert the previous data into phonemes and keep the labels

### Update arpabet dictionary of phonems with unknown words

In [8]:
# File with the entries for words that are missing from the arpabet dictionary.
UNKNOWN_WORDS_PATH = "data/cleaned-rap-lyrics/unknown_words_dict.txt"

unknown_list = pre.get_text(UNKNOWN_WORDS_PATH)
unknown_dict = phon.create_unknown_dict_from_text(unknown_list)
phon.update_arpabet(unknown_dict)

# Number of words still reported as unknown after the update.
remaining_unknown = phon.get_unknown_words()
print(len(remaining_unknown))

0


In [9]:
# For every input window, concatenate the phonemes of all its words into one
# flat list; the ";" separator token is skipped.
phonem_data = [
    [phonem for word in sent if word != ";" for phonem in phon.get_phonem(word)]
    for sent in str_data
]

# Preview the first (phoneme sequence, label) pair.
print(list(zip(phonem_data, str_labels))[:1])

[(['Y', 'AE1', 'Y', 'UW1', 'N', 'OW1', 'W', 'AH1', 'T', 'DH', 'IH1', 'S', 'IH1', 'Z', 'N', 'IH', 'K', 'DH', 'AH0', 'T', 'R', 'AY0', 'AH1', 'M', 'F', 'AH0', 'N', 'T', 'R', 'IH0', 'T', 'ER1', 'N', 'R', 'AE', 'K', 'AH', 'M', 'AA1', 'L', 'AH0', 'R', 'AE', 'K', 'AH', 'M'], 'remember')]


### The rap lyrics and phonemes are encoded so that their shapes fit the embedding lookup tables

In [10]:
# words
encoder = tr.IndexWordEncoder("Index-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder("1-Hot-Word-Decoding", index2word, temperature=0.8)
# phonem
encoder_phonem = tr.IndexWordEncoder("Index-Phonem-Encoding", phonem2index)

data = encoder.encode(str_data)
data_phonem = encoder_phonem.encode(phonem_data)
labels = encoder.encode_labels(str_labels)

### Delete unneeded variables to save space

In [11]:
# The string-level intermediates are no longer needed once everything is encoded.
del str_labels, str_data, phonem_data, word_tokens

### In order to get the correct shape for the phoneme input of the embedding lookup, we needed to pad missing entries <br/> The changes were made in the library file: training.py:IndexWordEncoder()

In [12]:
# Shapes of the encoded words, labels and phonemes, in that order.
for encoded in (data, labels, data_phonem):
    print(encoded.shape)

# choose timesteps according to length of padded phonem array
TIMESTEPS_PHONEM = len(data_phonem[0])

(31, 16)
(31, 35)
(31, 66)


## Tensorflow Model

In [13]:
# Training configuration
PRE_TRAINED = True  # NOTE(review): not referenced by any other cell of this notebook — confirm it is still needed
EPOCHS = 10
BATCH_SIZE = 128

# Embedding sizes are taken from the embedding module so they match the matrices built above
embedding_dimension = emb.GLOVE_SIZE
embedding_dimension_phonem = emb.PHONEM_SIZE

# Hidden layer size of the word stack and of the phoneme stack
hidden_layer_size = 32
hidden_layer_size_phonem = 16

# Number of stacked LSTM layers for the word stack and for the phoneme stack
num_LSTM_layers = 4
num_LSTM_layers_phonem = 4

### Build a model with both embeddings combined

In [14]:
# Fixed seed used for the samples produced during training.
seed_text = "yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;"


def sampler(trainable, _):
    """Call tr.sample on the fixed ``seed_text`` with length=20; the second argument is ignored."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)

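### [Error] We are getting an error during the build model process. <br/> We were unable to find a solution to that..
#### >> Trying to share variable lstm-words-phonems/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel, but specified shape (19, 64) and found shape (332, 128) <<
#### Likely cause: (332, 128) = (300 + 32, 4 · 32) matches the word LSTM kernel (embedding size + hidden size, 4 × hidden size) and (19, 64) = (3 + 16, 4 · 16) matches the phoneme LSTM kernel, so both LSTM stacks seem to be created under the same variable name. Building each stack inside its own `tf.variable_scope` in `MultiLayerRNN_more_embeddings.build` should avoid the clash.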

In [15]:
# Build the combined model. The positional arguments of build() come in two groups of five:
# first the word stack (layers, hidden size, vocabulary size, timesteps, embedding dimension),
# then the same five values for the phoneme stack.
# NOTE: as the recorded output shows, this cell currently fails with a ValueError
# (variable shape clash between the two LSTM stacks).
rnn = nn.MultiLayerRNN_more_embeddings(name="lstm-words-phonems")
rnn.build(num_LSTM_layers, hidden_layer_size, VOCAB_SIZE, TIMESTEPS, embedding_dimension,
          num_LSTM_layers_phonem, hidden_layer_size_phonem, VOCAB_SIZE_PHONEM, TIMESTEPS_PHONEM, embedding_dimension_phonem,
          l2_reg=0.0)

# Train on the encoded words and phonemes with both embedding matrices;
# log_dir presumably receives the training logs — confirm in tools/training.py.
tr.train_model_more_embeddings(rnn, data, data_phonem, labels,
                               sampler, epochs=EPOCHS, batch_size=BATCH_SIZE, 
                               embedding_matrix=word_embedding_matrix,
                               embedding_matrix_phonem=phonem_embedding_matrix,
                               log_dir="logs/10.2-glove-phonem", retrain=False)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


ValueError: Trying to share variable lstm-words-phonems/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel, but specified shape (19, 64) and found shape (332, 128).

## Now we can pass any given seed to the sampling function below and get newly generated rap lyrics
### (As long as the words of the seed are known words)

In [None]:
# Set the temperature on the decoder created earlier to 0.7 for this final sampling run.
decoder.temperature = 0.7

# Unlike the sampler used during training, this one takes the seed text as its second argument.
sampler = lambda trainable, seed_text: tr.sample( seed_text, trainable, encoder, decoder, length=50)
# `rnn` is the model object created in the build cell; the previously used name
# `rnn_words` is not defined anywhere in this notebook and would raise a NameError.
sampler(rnn, "killing people left and right \n use a gun cool homie \n that is right")