# 04 - embedding lstm

Up to this point, we always fed the 1-hot-encoded data into our RNN cell. Now we want to embed our data first.

We will map our 1-hot-encoded data into an embedding space, which has the dimension **EMBEDDING_SIZE**.

In [1]:
import tools.processing as pre
import re

# Number of consecutive tokens that form one input sequence.
TIMESTEPS = 40

# Only a single lyrics file is loaded for now to avoid a memory error.
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac_.txt")

# The vocabulary is built from the text as loaded, before newlines are rewritten.
vocab = pre.Vocabulary(text)

# Replace every real newline by a literal backslash-n token surrounded by
# spaces; this avoids a null error in the tensorboard projection.
text = text.replace("\n", " \\n ")

# Splitting on runs of spaces removes extra spacing between tokens; the last
# element of the split is discarded.
tokens = re.split(" +", text)[:-1]

str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# Show the first five (sequence, label) pairs.
print(list(zip(str_data, str_labels))[:5])

[(['as', 'real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated', 'schemes', '\\n', 'to', 'get', 'us', 'locked', 'up', 'shot', 'up', 'back', 'in', 'chains', '\\n', 'to', 'deny', 'us', 'of', 'the', 'future', 'rob', 'our', 'names', '\\n', 'kept', 'my', 'history'], 'of'), (['real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated', 'schemes', '\\n', 'to', 'get', 'us', 'locked', 'up', 'shot', 'up', 'back', 'in', 'chains', '\\n', 'to', 'deny', 'us', 'of', 'the', 'future', 'rob', 'our', 'names', '\\n', 'kept', 'my', 'history', 'of'], 'mystery'), (['as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated', 'schemes', '\\n', 'to', 'get', 'us', 'locked', 'up', 'shot', 'up', 'back', 'in', 'chains', '\\n', 'to', 'deny', 'us', 'of', 'the', 'future', 'rob', 'our', 'names', '\\n', 'kept', 'my', 'history', 'of', 'mystery'], 'but'), (['it'

In [2]:
import tools.architectures as nn
import tensorflow as tf

class EmbeddedSingleLayerRNN(nn.Trainable):
    """
    Single LSTM layer that works on embedded word indices.

    The input is a batch of index sequences. Every index is looked up in a
    trainable embedding matrix, the embedded sequence is run through one LSTM
    cell and the output of the last time step is projected onto the vocabulary.
    """

    def __init__(self, name):
        super().__init__(name)

    def build(self, hidden_layer_size, vocab_size, embedding_dimension, time_steps, l2_reg=0.0):
        """
        Creates placeholders, embedding, LSTM, loss, optimizer and accuracy.

        hidden_layer_size:   number of units of the LSTM cell
        vocab_size:          number of rows of the embedding matrix and width
                             of the output layer / label vectors
        embedding_dimension: length of one embedding vector
        time_steps:          number of indices per input sequence
        l2_reg:              factor of the L2 penalty on the tensors collected
                             in self.weights; with 0.0 (default) the loss is
                             the plain cross entropy
        """
        self.time_steps = time_steps
        self.vocab_size = vocab_size

        # one row of word indices per sequence
        self.X = tf.placeholder(tf.int32, shape=[None, time_steps], name="data")

        # remaps our indices to embedding space; kept on the instance so the
        # learned matrix stays reachable after build()
        self.embeddings = tf.Variable(
            tf.random_uniform([vocab_size, embedding_dimension], -1.0, 1.0)
        )

        # [batch, time_steps, embedding_dimension]
        embed = tf.nn.embedding_lookup(self.embeddings, self.X)

        # 1-hot encoded labels
        self.Y = tf.placeholder(tf.int16, shape=[None, vocab_size], name="labels")

        # static_rnn takes a list with one [batch, embedding_dimension] tensor
        # per time step, so the time axis is unpacked
        step_inputs = tf.unstack(embed, num=time_steps, axis=1)

        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            self.rnn_cell = tf.nn.rnn_cell.LSTMCell(hidden_layer_size)

            self.outputs, _ = tf.contrib.rnn.static_rnn(self.rnn_cell, step_inputs, dtype=tf.float32)
            # only the output of the final time step is used for the prediction
            self.last_rnn_output = self.outputs[-1]

            self.final_output, W_out, b_out = nn.full_layer(self.last_rnn_output, vocab_size)

            self.weights.append(W_out)
            self.biases.append(b_out)

            # NOTE: despite its name this holds the per-example cross entropy
            self.softmax = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.final_output, labels=self.Y
            )
            self.cross_entropy_loss = tf.reduce_mean(self.softmax)

            self.loss = self.cross_entropy_loss
            if l2_reg:
                # l2_reg used to be ignored; the graph is only extended when a
                # non-zero factor is requested.
                # assumes every entry of self.weights is a TF tensor/variable
                # (W_out comes from nn.full_layer) - TODO confirm
                l2_penalty = tf.add_n([tf.nn.l2_loss(w) for w in self.weights])
                self.loss = self.loss + l2_reg * l2_penalty

            self.optimizer = tf.train.AdamOptimizer()
            self.train_step = self.optimizer.minimize(self.loss)

            # percentage of rows whose arg-max prediction equals the label
            self.correct_prediction = tf.equal(tf.argmax(self.Y, 1), tf.argmax(self.final_output, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32)) * 100

### Problem: Our embedding can look up indices of words, but not 1-hot encodings
$=>$ We therefore create an **IndexWordEncoder**

In [3]:
import numpy as np
import tools.training as tr

class IndexWordEncoder(tr.Encoder):
    """
    Encodes sequences of words to sequences of vocabulary indices.

    Only the labels are encoded as 1-Hot vectors (see encode_labels).
    """

    def __init__(self, name, word2index):
        """
        name:       handed on to tr.Encoder
        word2index: mapping from a word to its integer index
        """
        super().__init__(name)
        self.word2index = word2index

    def encode(self, sequences):
        """
        Encodes our sequences of words to sequences of indices

        Returns an array with one row of indices per sequence.
        Raises a KeyError for a word that is missing in word2index.
        """
        lookup = self.word2index
        return np.array([[lookup[word] for word in seq] for seq in sequences])

    def encode_raw(self, text):
        """
        Encodes a text to sequences of indices (needed for sampling)

        Newlines become the literal token \\n and the text is split on
        spaces. Empty pieces caused by repeated, leading or trailing spaces
        are dropped, so only real words end up in the result.

        Returns an integer array of shape (1, number of words).
        Raises a KeyError for a word that is missing in word2index.
        """
        text = text.replace("\n", " \\n ")

        # str.replace does not understand regular expressions, so repeated
        # spaces are handled by filtering the empty strings that split(" ")
        # produces instead of trying to collapse them first
        words = [word for word in text.split(" ") if word]

        lookup = self.word2index
        # explicit int dtype: indices must not become floats, not even for
        # an empty text
        return np.array([[lookup[word] for word in words]], dtype=int)

    def encode_labels(self, labels):
        """
        Encodes the labels (sequences of one word)

        Returns an integer array of shape (number of labels, vocabulary size)
        with a single 1 per row at the index of the label.
        Raises a KeyError for a label that is missing in word2index.
        """
        indices = np.array([self.word2index[label] for label in labels], dtype=int)

        one_hot = np.zeros((len(indices), len(self.word2index)), dtype=int)
        # set one entry per row in a single vectorised assignment
        one_hot[np.arange(len(indices)), indices] = 1

        return one_hot

In [4]:
import tools.training as tr

# Encoder for the network input; the decoder is created with temperature 0.8
# (presumably it turns 1-hot model output back into words - see tools.training).
encoder = IndexWordEncoder(name="Index-Word-Encoding", word2index=vocab.word2index)
decoder = tr.OneHotWordDecoder(
    "1-Hot-Word-Decoding",
    vocab.index2word,
    temperature=0.8,
)

# The first training sequence, joined back into text with real newlines,
# is kept as the seed for sampling.
my_seed = " ".join(str_data[0])
my_seed = my_seed.replace("\\n", "\n")

data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

# The string versions are deleted once everything is encoded.
del str_data
del str_labels

In [5]:
import tools.processing as pre
import tools.architectures as nn

# Model dimensions.
HIDDEN_LAYER_SIZE = 512
EMBEDDING_SIZE = 256
VOCAB_SIZE = len(vocab.word2index)

# Training schedule.
EPOCHS = 30
BATCH_SIZE = 256

rnn = EmbeddedSingleLayerRNN(name="multi-2pac")
rnn.build(
    hidden_layer_size=HIDDEN_LAYER_SIZE,
    vocab_size=VOCAB_SIZE,
    embedding_dimension=EMBEDDING_SIZE,
    time_steps=TIMESTEPS,
    l2_reg=0.0,
)


def sampler(trainable, _):
    """Call tr.sample on ``trainable`` with ``my_seed`` and length=50; the second argument is ignored."""
    return tr.sample(my_seed, trainable, encoder, decoder, length=50)


tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/30
Loss:    	 5.662169933319092
Accuracy:	 13.070219993591309
------Sampling----------
seed: 	as real as it seems the american dream 
 ain't nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny us of the future rob our names 
 kept my history
-
result:as real as it seems the american dream 
 ain't nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny us of the future rob our names 
 kept my history 
 you 
 you old 
 up as dirty on from to door 
 american up on to a genius ignition thinkin sinkin always you to catchin' 
 busy 
 needin' these ready the hyprocrisy god brother to you 
 know 
 try hexed make 
 one i dropped all


Epoch 2/30
Loss:    	 5.209878444671631
Accuracy:	 16.519001007080078
------Sampling----------
seed: 	as real as it seems the american dream 
 ain't nothing but another calculated schemes 
 to get us locked up shot up back in chains 
 to deny us of the future rob our names 
 

In [24]:
# Sample with temperature 0.3 (the decoder was created with 0.8).
decoder.temperature = 0.3


def sampler(seed_text):
    """Call tr.sample on the trained ``rnn`` with ``seed_text`` and length=20."""
    return tr.sample(seed_text, rnn, encoder, decoder, length=20)


# Adjacent literals are joined by the parser into one seed string.
sampler(
    "while i go down the street"
    "\nwhile i go down the street"
    "\n you was lookin' at me "
    "\n is this even good or is it just bad "
    "\n is this even good or is it mad"
)

------Sampling----------
seed: 	while i go down the street
while i go down the street
 you was lookin' at me 
 is this even good or is it just bad 
 is this even good or is it mad
-
result:while i go down the street
while i go down the street
 you was lookin' at me 
 is this even good or is it just bad 
 is this even good or is it mad 
 your brother and your trifeass wife wants to do me 
 on a mountain and still couldn't top me


In [19]:
import tools.processing as pre
import tools.architectures as nn

# NOTE(review): in the recorded run this cell stopped right here with a
# FileNotFoundError for this path, so none of the lines below were executed.
# The cell also rebinds text, vocab, TIMESTEPS, data, labels and rnn, which
# the cells above rely on.
text = pre.get_text("data/cleaned-rap-lyrics/cleanrakim_.txt")

vocab = pre.Vocabulary(text)

# Settings for this experiment.
# NOTE(review): HIDDEN_LAYER_SIZE is defined but not passed to any call in
# this cell; NUM_LAYERS is what is handed to rnn.build() as first argument.
NUM_LAYERS = 1
HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()
TIMESTEPS = 50

EPOCHS = 20
BATCH_SIZE = 256

EMBEDDING_SIZE = 150


# Data and labels come straight from the vocabulary helper here instead of
# the IndexWordEncoder defined in this notebook.
data, labels = vocab.making_embedded_one_hot(text, TIMESTEPS)

# NOTE(review): "LeanableEmbedding" looks like a misspelling of "Learnable";
# confirm the actual class name in tools.architectures.
embedding = nn.LeanableEmbedding(name = "learnable-embedding")
embedding.build(VOCAB_SIZE, EMBEDDING_SIZE)

# The embedding object is handed to the RNN through the embedding keyword.
rnn = nn.SingleLayerRNN(name = "single-rakim-100")
rnn.build(NUM_LAYERS, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0, embedding=embedding)

nn.train(rnn, data, labels, vocab, epochs=EPOCHS, batch_size=BATCH_SIZE, temperature=1, embedding=True)

FileNotFoundError: [Errno 2] No such file or directory: 'data/cleaned-rap-lyrics/cleanrakim_.txt'