In [1]:
import itertools
import os
import zipfile

import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr

## Create embedding for words

We only use the first 250000 characters of the text (about half of it) because of memory issues.

In [3]:
# Read the lyrics corpus, then keep only its first 250000 elements
# (characters, given that `text` is later tokenised with str.split()).
text = pre.get_text("data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt")
text = text[:250000]

# Build the vocabulary from the truncated text and expose both lookup tables.
vocab = pre.Vocabulary(text)
word2index, index2word = vocab.word2index, vocab.index2word
VOCAB_SIZE = len(index2word)

# Embedding matrix for the vocabulary; printed as a quick sanity check.
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 0.00153569  0.12661901  0.09518377 ...  0.01355811  0.01934354
  -0.0286953 ]
 [ 0.03209195  0.06766261  0.04151145 ... -0.046448   -0.03842104
  -0.04822   ]
 [-0.00257177  0.00306226 -0.00831606 ...  0.00347133  0.00291661
  -0.00081346]
 ...
 [-0.02687368  0.09554347 -0.04599568 ... -0.04568946  0.02053585
  -0.02226466]
 [-0.00257177  0.00306226 -0.00831606 ...  0.00347133  0.00291661
  -0.00081346]
 [ 0.03778261 -0.01219655 -0.05140489 ...  0.00514077 -0.03030926
  -0.04753494]]


In [4]:
# VOCAB_SIZE is len(index2word), computed when the vocabulary was built.
print(VOCAB_SIZE)

4231


## Data Preparation: Split sentences of text into data and label

In [7]:
# Split the corpus on whitespace to obtain the token sequence.
word_tokens = text.split()

# Window length handed to create_data_label_pairs; the preview printed below
# shows 16-token contexts, each paired with the token that follows it.
TIMESTEPS = 16

str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

# Preview the first five (context, label) pairs. islice stops after five
# items, so the full list of pairs is never materialised just for this print.
print(list(itertools.islice(zip(str_data, str_labels), 5)))

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


In [8]:
# Encoder is built from word2index, decoder from index2word; the decoder
# starts with a sampling temperature of 0.8.
encoder = tr.IndexWordEncoder("Index-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder(
    "1-Hot-Word-Decoding",
    index2word,
    temperature=0.8,
)

# Turn the string windows and their string labels into the arrays used for
# training (their shapes are printed in the following cell).
data, labels = encoder.encode(str_data), encoder.encode_labels(str_labels)

# Drop the string-level intermediates to release memory. After this the cell
# cannot be re-run without first re-running the data preparation cell.
del str_labels, str_data, word_tokens

In [9]:
# Show the shapes of the encoded inputs and labels, one per line.
print(data.shape, labels.shape, sep="\n")

(54607, 16)
(54607, 4231)


## Tensorflow Model

In [21]:
# Hyper-parameters for the word-level model and its training run.

# NOTE(review): PRE_TRAINED is not referenced by any cell visible in this
# notebook; tr.train_model is called with a literal retrain=True. Confirm
# whether this flag was meant to drive that argument.
PRE_TRAINED = True
# Passed to tr.train_model as epochs= and batch_size= respectively.
EPOCHS = 20
BATCH_SIZE = 128

# NOTE(review): not referenced by any visible cell; rnn_words.build receives
# embedding_dim=emb.GLOVE_SIZE instead. Confirm whether this is still needed.
embedding_dimension = 64

# Second positional argument of rnn_words.build.
hidden_layer_size = 32

# First positional argument of rnn_words.build.
num_LSTM_layers = 4

## Create input placeholders

All input placeholders are created inside the `architectures.MultiLayerRNN_v2` class.

### Create embedding

The word embedding is also created inside the `architectures.MultiLayerRNN_v2` class.

## Build and train the model

In [22]:
# Fixed seed used for the sample generated during training.
seed_text = "yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;"


def sampler(trainable, _):
    """Sampling callback handed to tr.train_model.

    The second argument is ignored: the sample always starts from the
    module-level ``seed_text`` and is requested with ``length=20``, using the
    notebook's ``encoder`` and ``decoder``.

    Args:
        trainable: the model object forwarded to ``tr.sample``.
        _: unused.

    Returns:
        Whatever ``tr.sample`` returns.
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)

In [23]:
# Instantiate the word-level network and build it. The embedding dimension
# comes from emb.GLOVE_SIZE, not from the embedding_dimension constant.
rnn_words = nn.MultiLayerRNN_v2(name="lstm-words")
rnn_words.build(
    num_LSTM_layers,
    hidden_layer_size,
    VOCAB_SIZE,
    TIMESTEPS,
    l2_reg=0.0,
    embedding_dim=emb.GLOVE_SIZE,
)

# Train on the encoded data. Per the recorded output, retrain=True restores
# the model already stored in log_dir and trains it further.
# NOTE(review): retrain is hard-coded here; the PRE_TRAINED flag defined with
# the other hyper-parameters is not consulted. Confirm which one is intended.
tr.train_model(
    rnn_words,
    data,
    labels,
    sampler,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    embedding_matrix=word_embedding_matrix,
    log_dir="logs/10-test-glove-3",
    retrain=True,
)

INFO:tensorflow:Restoring parameters from logs/10-test-glove-3/model
Restoring an old model from 'logs/10-test-glove-3'
and training it further..


Epoch 1/20
Loss:    	 5.153051376342773
Accuracy:	 16.671855926513672
------Sampling----------
seed: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;
-
result: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ; i give and show the mind is i know and it the time ; the king because and i people


Epoch 2/20
Loss:    	 5.090422630310059
Accuracy:	 17.567344665527344
------Sampling----------
seed: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;
-
result: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ; the we wa

## Now we can pass any seed text to the function below and get newly generated rap lyrics
### (As long as all words of the seed are known words, i.e. part of the vocabulary)

In [24]:
# Change the decoder's temperature from the 0.8 it was constructed with.
decoder.temperature = 0.7


def sampler(trainable, seed_text):
    """Generate text from an arbitrary seed with the trained model.

    Replaces the training-time ``sampler``: here the seed is supplied by the
    caller and ``tr.sample`` is requested with ``length=50``, using the
    notebook's ``encoder`` and ``decoder``.

    Args:
        trainable: the model object forwarded to ``tr.sample``.
        seed_text: the seed string to continue.

    Returns:
        Whatever ``tr.sample`` returns.
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=50)


sampler(rnn_words, "killing people left and right \n use a gun cool homie \n that is right")

------Sampling----------
seed: 
killing people left and right 
 use a gun cool homie 
 that is right
-
result: 
killing people left and right 
 use a gun cool homie 
 that is right ; do your til yo a rhyme to can not be back ; from shooting the kids sex it time ; you was one them with the crowd to self and the whole oh way ; i am him you make it the trick ; and from give not the
