In [1]:
import os
import zipfile
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

In [2]:
import tools.processing as pre
import tools.embedding as emb
import tools.architectures as nn
import tools.training as tr

## Create embedding for words

We now use even less than half of the words. (At the moment, we do not have the computational power to process more.)

In [3]:
# Keep only the leading slice of the corpus so that training stays tractable.
text = pre.get_text("data/prepped/cleanrakim.txt")[:100001]

# The line count is obtained by splitting the text on ';'.
print("We only train with {} rap lines from cleanrakim as we have limited computational power".format(
    len(text.split(';'))))

We only train with 2390 rap lines from cleanrakim as we have limited computational power


In [4]:
# Build the vocabulary from the truncated text and pull out both lookup tables.
vocab = pre.Vocabulary(text)
word2index, index2word = vocab.word2index, vocab.index2word
VOCAB_SIZE = len(index2word)

# Embedding matrix for the vocabulary
# (presumably one row per word index -- TODO confirm against tools.embedding).
word_embedding_matrix = emb.get_embedding_matrix(word2index, VOCAB_SIZE)
print(word_embedding_matrix)

[[ 0.04659269 -0.02430996  0.0380847  ...  0.10669128  0.01932387
   0.00729941]
 [-0.0027586   0.00629138 -0.01284484 ...  0.00282254  0.00429476
   0.00170374]
 [-0.0027586   0.00629138 -0.01284484 ...  0.00282254  0.00429476
   0.00170374]
 ...
 [-0.00165838 -0.00060007 -0.0861928  ...  0.03562126 -0.0017157
  -0.04269713]
 [-0.0027586   0.00629138 -0.01284484 ...  0.00282254  0.00429476
   0.00170374]
 [ 0.03778261 -0.01219655 -0.05140489 ...  0.00514077 -0.03030926
  -0.04753494]]


In [5]:
# Number of entries in index2word; also passed to the model as vocab_size.
print(VOCAB_SIZE)

2706


## Data Preparation: Split sentences of text into data and label

In [6]:
# Length of the token window the model sees before predicting the next token.
TIMESTEPS = 16

# Whitespace tokenisation; ';' shows up as its own token in the previewed pairs below.
word_tokens = text.split()

str_data, str_labels = pre.create_data_label_pairs(word_tokens, TIMESTEPS)

# Preview the first five (window, next-token) pairs.
print(
    list(zip(str_data, str_labels))[:5]
)

[(['yeah', 'you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';'], 'remember'), (['you', 'know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember'], 'being'), (['know', 'what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being'], 'introduced'), (['what', 'this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced'], 'to'), (['this', 'is', 'nyc', ';', 'the', 'triumphant', 'return', 'rakim', 'allah', ';', 'rakim', ';', 'remember', 'being', 'introduced', 'to'], 'rapping')]


In [7]:
# Encoder maps words to indices; the decoder maps model output back to words.
encoder = tr.IndexWordEncoder("Index-Word-Encoding", word2index)
decoder = tr.OneHotWordDecoder("1-Hot-Word-Decoding", index2word, temperature=0.8)

data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

# The string versions are no longer needed; drop them to free memory.
del str_labels, str_data, word_tokens

In [8]:
# Shapes of the encoded inputs and labels, one per line.
print(data.shape, labels.shape, sep="\n")

(21912, 16)
(21912, 2706)


## Build and train the model

In [9]:
seed_text = (
    "yeah you know what this is nyc ; the triumphant return rakim allah ; "
    "rakim ; remember being introduced to rapping your first rhyme ;"
)


def sampler(trainable, _):
    """Sample with length=20 from `trainable`, starting at the global `seed_text`.

    The second argument is accepted but ignored.
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)

In [10]:
class PretrainedEmbeddedMultiLayerRNN(nn.Trainable):
    """Stacked-LSTM next-word classifier on top of a pretrained word embedding.

    The embedding variable is created zero-filled and is meant to be loaded by
    running `self.embedding_init` with the pretrained matrix fed into
    `self.embedding_placeholder`. It is trainable, so it is updated during training.
    """

    def __init__(self, name):
        super().__init__(name)

    def build(self, num_layers, hidden_layer_size, vocab_size, embedding_dim, time_steps, l2_reg=0.0):
        """Build the TensorFlow graph for the model.

        Resets the default graph, so any previously built graph is discarded.

        Args:
            num_layers: number of stacked LSTM layers.
            hidden_layer_size: size passed to `nn.lstm_layer` for each layer.
            vocab_size: number of words; width of the labels and of the output layer.
            embedding_dim: dimensionality of the pretrained embedding vectors.
            time_steps: number of word indices per input sequence.
            l2_reg: coefficient of an L2 penalty on the output projection weights.
                With the default 0.0 no penalty op is added and the loss is the
                plain cross entropy.

        Raises:
            ValueError: if `l2_reg` is negative.
        """
        if l2_reg < 0:
            raise ValueError("l2_reg must be non-negative, got {}".format(l2_reg))

        tf.reset_default_graph()

        self.time_steps = time_steps
        self.vocab_size = vocab_size

        # Inputs: word indices per time step, and one label row of width vocab_size.
        self.X = tf.placeholder(tf.int32, shape=[None, time_steps], name="data")
        self.Y = tf.placeholder(tf.int16, shape=[None, vocab_size], name="labels")
        # Not used in this graph; kept because code outside this class may reference it.
        self._seqlens = tf.placeholder(tf.int32, shape=[None])

        # Pretrained embedding: zero-initialised variable plus an assign op that
        # loads the real matrix from the placeholder.
        self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])
        embeddings = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=True)
        self.embedding_init = embeddings.assign(self.embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings, self.X)

        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):

            self.stacked_cells = nn.lstm_layer(num_layers, hidden_layer_size)

            self.outputs, self.states = tf.nn.dynamic_rnn(
                self.stacked_cells, embed, sequence_length=None, dtype=tf.float32)

            # State of the top layer; index 1 is presumably the LSTM hidden state h
            # (assumes nn.lstm_layer yields LSTM state tuples -- TODO confirm).
            self.last_rnn_output = self.states[num_layers - 1][1]

            self.final_output, W_out, b_out = nn.full_layer(self.last_rnn_output, vocab_size)

            # NOTE(review): these lists are appended to on every build() call; whether
            # they are cleared between builds depends on nn.Trainable -- verify.
            self.weights.append(W_out)
            self.biases.append(b_out)

            # Despite its name, `self.softmax` holds the per-example cross entropy.
            self.softmax = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.final_output, labels=self.Y)
            self.cross_entropy_loss = tf.reduce_mean(self.softmax)

            self.loss = self.cross_entropy_loss
            if l2_reg:
                # Honour l2_reg, which was previously accepted but ignored.
                # W_out is used directly (not self.weights) so that only tensors
                # from the current graph take part in the loss.
                self.loss = self.loss + l2_reg * tf.nn.l2_loss(W_out)

            self.optimizer = tf.train.AdamOptimizer()
            self.train_step = self.optimizer.minimize(self.loss)

            # Accuracy in percent: fraction of rows whose arg-max logit matches the label.
            self.correct_prediction = tf.equal(tf.argmax(self.Y, 1), tf.argmax(self.final_output, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32)) * 100

In [11]:
# Training schedule.
EPOCHS = 15
BATCH_SIZE = 256

# Network size.
hidden_layer_size = 512
num_LSTM_layers = 2

rnn_words = PretrainedEmbeddedMultiLayerRNN(name="lstm-words")
rnn_words.build(
    num_layers=num_LSTM_layers,
    hidden_layer_size=hidden_layer_size,
    vocab_size=VOCAB_SIZE,
    embedding_dim=emb.GLOVE_SIZE,
    time_steps=TIMESTEPS,
    l2_reg=0.0,
)

# Train on the encoded data, passing the embedding matrix and the sampler defined earlier.
tr.train_model(
    rnn_words, data, labels, sampler,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    embedding_matrix=word_embedding_matrix,
    log_dir="logs/10-test-glove-5",
    retrain=True,
)

Building model from scratch! 
 Saving into: 'logs/10-test-glove-5'


Epoch 1/15
Loss:    	 5.743573188781738
Accuracy:	 11.77893352508545
------Sampling----------
seed: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;
-
result: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ; that clear will ; a dangerous strong not picture i ; much mind tell mind a is you ; like


Epoch 2/15
Loss:    	 5.522245407104492
Accuracy:	 14.042533874511719
------Sampling----------
seed: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ;
-
result: 
yeah you know what this is nyc ; the triumphant return rakim allah ; rakim ; remember being introduced to rapping your first rhyme ; of i i is the of universe ; i was borough your can style be and and before ; before

KeyboardInterrupt: 

## Now we can pass any seed to the sampling function below and get newly generated rap lyrics
### (As long as all words of the seed are known words)

In [25]:
# Adjust the decoder temperature for this sample.
decoder.temperature = 0.4


def sampler(trainable, seed_text):
    """Sample with length=50 from `trainable`, starting at the given `seed_text`."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=50)


sampler(rnn_words, "brother i walk down the street ; no where to see no where to be ; walk up walk down")

------Sampling----------
seed: 
brother i walk down the street ; no where to see no where to be ; walk up walk down
-
result: 
brother i walk down the street ; no where to see no where to be ; walk up walk down to your mind ; i am a streets that is the same ; and i am untouchable ; i am her her at the rhythm ; and i am a same of same dead ; and i am a sign but i am the mic ; i am untouchable and


In [39]:
# Adjust the decoder temperature for this sample.
decoder.temperature = 0.5


def sampler(trainable, seed_text):
    """Sample with length=50 from `trainable`, starting at the given `seed_text`."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=50)


sampler(rnn_words, "do you know what is up ; well nothing ; because nothing is up ; funny is not it")

------Sampling----------
seed: 
do you know what is up ; well nothing ; because nothing is up ; funny is not it
-
result: 
do you know what is up ; well nothing ; because nothing is up ; funny is not it is not soul ; keep that i am a encore you can not build ; i am untouchable ; so i am back to the mind and i am a new story ; now i am untouchable ; my once is a her i still untouchable ; so i am


In [None]:
# Same seed and temperature as the previous cell.
decoder.temperature = 0.5


def sampler(trainable, seed_text):
    """Sample with length=50 from `trainable`, starting at the given `seed_text`."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=50)


sampler(rnn_words, "do you know what is up ; well nothing ; because nothing is up ; funny is not it")

## Alright, we see that our rapper is clearly overfitting and has a strong tendency to be "untouchable".
- This might be an indicator that we are not using enough data.
- But we cannot use more, because we have limited computational power.