# 02 - Generating Word by Word

This time we don't predict letter by letter, but word by word!

In [5]:
import tools.processing as pre
import re

# Work on the first 20002 characters only, to avoid a memory error for now.
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac_.txt")[:20002]
vocab = pre.Vocabulary(text)

TIMESTEPS = 10

# Turn every newline into its own space-separated token. The token is the
# escaped two-character form (backslash + n), which avoids a null error in
# the tensorboard projection.
text = text.replace("\n", " \\n ")

# Collapse runs of spaces, split on single spaces and drop the final token.
tokens = re.sub(pattern=" +", repl=" ", string=text).split(" ")[:-1]

# Pair each window of TIMESTEPS tokens with the token that follows it.
str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# Show the first five (input window, next word) pairs.
print(list(zip(str_data, str_labels))[:5])

[(['as', 'real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't"], 'nothing'), (['real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing'], 'but'), (['as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but'], 'another'), (['it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another'], 'calculated'), (['seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated'], 'schemes')]


In [6]:
import numpy as np 
import tools.training as tr

class OneHotWordEncoder(tr.Encoder):
    """
    Encodes sequences of words to sequences of 1-Hot Encoded vectors.

    Every word is looked up in ``word2index``; the resulting index is the
    position set to 1 in a zero vector of length ``len(word2index)``.
    A word that is missing from ``word2index`` raises a ``KeyError``.
    """

    def __init__(self, name, word2index):
        """
        :param name: name passed on to the ``tr.Encoder`` constructor
        :param word2index: mapping from a word to its position in the 1-Hot vector
        """
        super(OneHotWordEncoder, self).__init__(name)
        self.word2index = word2index

    def encode(self, sequences):
        """
        Encodes our sequences of words to sequences of 1-Hots.

        :param sequences: iterable of word sequences
        :return: numpy array holding one (len(seq), len(word2index)) matrix
                 per input sequence
        """
        encoded_sequences = []
        for seq in sequences:
            # one row per word, one column per vocabulary entry
            encoded = np.zeros((len(seq), len(self.word2index)))

            for idx, word in enumerate(seq):
                encoded[idx][self.word2index[word]] = 1

            encoded_sequences.append(encoded)

        return np.array(encoded_sequences)

    def encode_raw(self, text):
        """
        Encodes a text to sequences of 1-Hots (needed for sampling).

        Newlines become the escaped backslash-n token, the text is split on
        spaces and empty tokens are dropped, so the result has exactly one
        row per word.

        :param text: raw text, words separated by spaces and/or newlines
        :return: numpy array of shape (1, number_of_words, len(word2index))
        """
        text = text.replace("\n", " \\n ")

        # str.replace() is literal, not a regular expression, so the former
        # text.replace(" +", " ") never collapsed repeated spaces. The empty
        # tokens that split(" ") produces for them used to stay in the result
        # as all-zero rows; they are filtered out here instead.
        words = [word for word in text.split(" ") if word != ""]

        encoded = np.zeros((len(words), len(self.word2index)))

        for idx, word in enumerate(words):
            encoded[idx][self.word2index[word]] = 1

        # wrap in a list to get a batch containing this single sequence
        return np.array([encoded])

    def encode_labels(self, labels):
        """
        Encodes the labels (sequences of one word).

        :param labels: iterable of words
        :return: numpy integer array with one 1-Hot row per label
        """
        encoded = []

        for label in labels:
            one_hot_vec = np.zeros(len(self.word2index), dtype=int)
            one_hot_vec[self.word2index[label]] = 1
            encoded.append(one_hot_vec)

        return np.array(encoded)
    
class OneHotWordDecoder(tr.Decoder):
    """
    Turns a predicted vector back into a single word.
    """
    def __init__(self, name, index2word, temperature=0.5):
        """
        :param name: forwarded to the ``tr.Decoder`` constructor
        :param index2word: lookup from vector position to word
        :param temperature: handed to ``tr.sample_from_distribution`` on decode
        """
        super(OneHotWordDecoder, self).__init__(name)
        self.index2word = index2word
        self.temperature = temperature

    def decode(self, predicted):
        """
        Picks a word for the prediction and returns it with a leading space.

        The prediction is first passed through ``tr.sample_from_distribution``
        with this decoder's temperature; the position of the largest value in
        the result selects the word. An escaped backslash-n token is turned
        back into a real newline.
        """
        sampled = tr.sample_from_distribution(predicted, temperature=self.temperature)
        word = self.index2word[np.argmax(sampled)]
        return " " + word.replace("\\n", "\n")

In [7]:
import tools.training as tr

# Build the word-level encoder/decoder pair from the vocabulary lookups.
encoder = OneHotWordEncoder(
    name="1-Hot-Word-Encoding",
    word2index=vocab.word2index,
)
decoder = OneHotWordDecoder(
    name="1-Hot-Word-Decoding",
    index2word=vocab.index2word,
    temperature=0.8,
)

# 1-Hot encode the training windows and their next-word labels.
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

In [8]:
import tools.architectures as nn

HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()

EPOCHS = 20
BATCH_SIZE = 256

# Build the single layer RNN for the current vocabulary size and TIMESTEPS.
rnn = nn.SingleLayerRNN(name = "basic")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)


def sampler(trainable, seed_text):
    """
    Samples from `trainable`, starting from `seed_text`.

    Thin wrapper around ``tr.sample`` with ``length=20``. The notebook-level
    `encoder` and `decoder` are looked up when the function is called, so a
    later cell that rebinds them also changes what this function uses.
    A named function is used instead of a lambda bound to a name (PEP 8).
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)


# `sampler` is handed to the training routine together with the data.
tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/20
Loss:    	 6.189930438995361
Accuracy:	 10.659217834472656
------Sampling----------
seed: 
as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up shot up back in chains
to deny us of the future rob our names
kept my history of mystery but now i see
the american dream wasn't meant for me
cause lady liberty is a hypocrite she lied to me
promised me freedom education equality
never gave me nothing but slavery
and now look at how dangerous you made me
-
result: 
as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up shot up back in chains
to deny us of the future rob our names
kept my history of mystery but now i see
the american dream wasn't meant for me
cause lady liberty is a hypocrite she lied to me
promised me freedom education equality
never gave me nothing but slavery
and now look at how dangerous you made me time time leave day next get weapon tried fiend grabbed run them 

### Let's try to sample on completely random sentences that we made up!

In [None]:
# Seed the sampler with made-up lines instead of text from the training data.
sampler(trainable=rnn, seed_text="while i go down the street \n you was lookin' at me \n is this even good \n why")

### Alright, but far from good. Now let's increase TIMESTEPS and decrease the temperature

In [None]:
# Longer context: every training window is now built with TIMESTEPS = 20.
TIMESTEPS = 20
str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# Fresh encoder/decoder pair; the decoder temperature is lowered to 0.6.
encoder = OneHotWordEncoder(
    name="1-Hot-Word-Encoding",
    word2index=vocab.word2index,
)
decoder = OneHotWordDecoder(
    name="1-Hot-Word-Decoding",
    index2word=vocab.index2word,
    temperature=0.6,
)

# 1-Hot encode the new windows and their next-word labels.
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

In [None]:
import tools.architectures as nn

HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()

EPOCHS = 20
BATCH_SIZE = 256

# Build a new single layer RNN for the current vocabulary size and TIMESTEPS.
rnn = nn.SingleLayerRNN(name = "basic")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)


def sampler(trainable, seed_text):
    """
    Samples from `trainable`, starting from `seed_text`.

    Thin wrapper around ``tr.sample`` with ``length=20``. The notebook-level
    `encoder` and `decoder` are looked up when the function is called.
    A named function is used instead of a lambda bound to a name (PEP 8).
    """
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)


# `sampler` is handed to the training routine together with the data.
tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# Same made-up seed as before, extended with a repeated line.
sampler(trainable=rnn, seed_text="while i go down the street \n you was lookin' at me \n is this even good or is it just bad \n is this even good or is it just bad")

# Learnings

We are now not predicting letter by letter, but word by word. The model does not have to learn how to spell the words.
After a few iterations we already get some sentences that make sense to a reader!

Some of them seem like actual lyrics, e.g.

**"to get us locked up up up up up up in bust in and knowledge and lies and kick and me if don't you well"**

We now export the new encoder and decoder to our module **tools.training**.

Let's see how a multi-layered LSTM performs in the next notebook.