In [1]:
import tools.processing as pre
import re

# only the first 20002 characters are used for now, to avoid a memory error
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac_.txt")
text = text[:20002]

vocab = pre.Vocabulary(text)

TIMESTEPS = 10

# every line break becomes its own token; the backslash is doubled
# to avoid a null error in the tensorboard projection
text = " \\n ".join(text.split("\n"))

# split on runs of spaces (removes the extra spacing) and drop the final token
tokens = re.split(" +", text)[:-1]

str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# show the first five (input window, next word) pairs
preview = list(zip(str_data, str_labels))[:5]
print(preview)

[(['as', 'real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't"], 'nothing'), (['real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing'], 'but'), (['as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but'], 'another'), (['it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another'], 'calculated'), (['seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated'], 'schemes')]


In [2]:
import numpy as np 
import tools.training as tr

class OneHotWordEncoder(tr.Encoder):
    """
    Encodes sequences of words to sequences of 1-Hot Encoded vectors

    Every 1-hot vector has len(word2index) entries; word2index maps a word
    to the position that is set to 1. A word that is missing from
    word2index raises a KeyError in all encode methods.
    """

    def __init__(self, name, word2index):
        """
        name:       passed on to the tr.Encoder base class
        word2index: mapping from word to its position in the 1-hot vector
        """
        super(OneHotWordEncoder, self).__init__(name)
        self.word2index = word2index

    def encode(self, sequences):
        """
        Encodes our sequences of words to sequences of 1-Hots

        sequences: iterable of word sequences
        returns:   np.array built from one matrix per sequence, each of
                   shape (len(sequence), len(word2index))
        """
        encoded_sequences = []
        for seq in sequences:

            # one row per word, one column per vocabulary entry
            encoded = np.zeros( ( len(seq), len(self.word2index) ) )

            for idx, word in enumerate(seq):
                encoded[idx][ self.word2index[word] ] = 1

            encoded_sequences.append(encoded)

        return np.array(encoded_sequences)

    def encode_raw(self, text):
        """
        Encodes a text to sequences of 1-Hots (needed for sampling)

        Line breaks are turned into their own token (a backslash followed
        by the letter n), then the text is split on spaces. Empty tokens
        caused by leading, trailing or repeated spaces are dropped, so
        every row of the result is the 1-hot vector of a real word.

        text:    raw string
        returns: np.array of shape (1, number of words, len(word2index))
        """
        text = text.replace("\n", " \\n ")

        # str.replace matches " +" literally (it is not a regex), so it cannot
        # collapse repeated spaces; instead split on single spaces and discard
        # the empty strings, which would otherwise become all-zero rows
        words = [ word for word in text.split(" ") if word != "" ]

        encoded = np.zeros( ( len(words), len(self.word2index) ) )

        for idx, word in enumerate(words):
            encoded[idx][ self.word2index[word] ] = 1

        # wrapped in a list so the result has a leading axis of size 1
        return np.array( [encoded] )

    def encode_labels(self, labels):
        """
        Encodes the labels (sequences of one word)

        labels:  iterable of words
        returns: np.array with one integer 1-hot row per label
        """

        encoded = []

        for label in labels:
            one_hot_vec = np.zeros(len(self.word2index), dtype=int)
            one_hot_vec[ self.word2index[label] ] = 1
            encoded.append( one_hot_vec )

        return np.array(encoded)
    
class OneHotWordDecoder(tr.Decoder):
    """
    Turns a predicted 1-Hot style vector back into a word
    """

    def __init__(self, name, index2word, temperature=0.5):
        """
        name:        passed on to the tr.Decoder base class
        index2word:  lookup from vocabulary position to word
        temperature: forwarded to tr.sample_from_distribution on every decode
        """
        super(OneHotWordDecoder, self).__init__(name)
        self.index2word = index2word
        self.temperature = temperature

    def decode(self, predicted):
        """
        Samples from the prediction and returns the chosen word with a
        leading space; the line-break token is turned back into a real
        line break.
        """
        sampled = tr.sample_from_distribution(predicted, temperature=self.temperature)
        word = self.index2word[ np.argmax(sampled) ]
        return " " + word.replace("\\n", "\n")

In [3]:
import tools.training as tr

# word-level 1-hot encoder / decoder pair built from the vocabulary
encoder = OneHotWordEncoder(name="1-Hot-Word-Encoding", word2index=vocab.word2index)
decoder = OneHotWordDecoder(
    name="1-Hot-Word-Decoding",
    index2word=vocab.index2word,
    temperature=0.8,
)

# 1-hot encode the input windows and their next-word labels
data = encoder.encode(sequences=str_data)
labels = encoder.encode_labels(labels=str_labels)

In [4]:
import tools.architectures as nn

# model size
VOCAB_SIZE = vocab.get_size()
HIDDEN_LAYER_SIZE = 512

# training schedule
BATCH_SIZE = 256
EPOCHS = 20

rnn = nn.SingleLayerRNN(name="basic")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)


def sampler(trainable, seed_text):
    """Calls tr.sample on seed_text with the current encoder and decoder (length=20)."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)


tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/20
Loss:    	 5.7401323318481445
Accuracy:	 3.666443109512329
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up
result:as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up want climb i i i and turn to beat i strong my for the get 
 to the i 



Epoch 2/20
Loss:    	 5.576021194458008
Accuracy:	 10.775765419006348
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up
result:as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up seen heavily more if so 
 fool 
 ring wouldn't tender too 
 pant her from stuck think her let


Epoch 3/20
Loss:    	 5.507214069366455
Accuracy:	 10.663984298706055
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us

### Let's try to sample on completely random sentences that we made up!

In [27]:
# made-up seed text; lines are separated by " \n "
seed_text = "while i go down the street \n you was lookin' at me \n is this even good \n why"
sampler(rnn, seed_text)

------Sampling----------
seed: 	while i go down the street 
 you was lookin' at me 
 is this even good 
 why
result:while i go down the street 
 you was lookin' at me 
 is this even good 
 why was was was always runnin' control 
 paper rhymefumble nothing eye place freedom me 
 
 i'll a 
 rhyme


### Alright, but far from good. Now let's increase TIMESTEPS and decrease the temperature

In [29]:
# longer input windows than in the first run
TIMESTEPS = 20

str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# same encoder as before, decoder now with a lower temperature
encoder = OneHotWordEncoder(name="1-Hot-Word-Encoding", word2index=vocab.word2index)
decoder = OneHotWordDecoder(
    name="1-Hot-Word-Decoding",
    index2word=vocab.index2word,
    temperature=0.6,
)

# 1-hot encode the new windows and their next-word labels
data = encoder.encode(sequences=str_data)
labels = encoder.encode_labels(labels=str_labels)

In [30]:
import tools.architectures as nn

# model size
VOCAB_SIZE = vocab.get_size()
HIDDEN_LAYER_SIZE = 512

# training schedule
BATCH_SIZE = 256
EPOCHS = 20

rnn = nn.SingleLayerRNN(name="basic")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)


def sampler(trainable, seed_text):
    """Calls tr.sample on seed_text with the current encoder and decoder (length=20)."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=20)


tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/20
Loss:    	 6.764334678649902
Accuracy:	 10.687877655029297
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up
result:as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up tumble preach mystery slavery motherfucker totally competition border grabbed toss i forever fuck sweet already live payin must toe eyes


Epoch 2/20
Loss:    	 5.688316345214844
Accuracy:	 10.665472030639648
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up
result:as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up the it 
 friend a 
 
 rope 
 
 i'm 
 on 
 the must 
 i'm i 



Epoch 3/20
Loss:    	 5.59132719039917
Accuracy:	 10.665472030639648
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but anot

In [39]:
# made-up seed text; lines are separated by " \n "
seed_text = "while i go down the street \n you was lookin' at me \n is this even good or is it just bad \n is this even good or is it just bad"
sampler(rnn, seed_text)

------Sampling----------
seed: 	while i go down the street 
 you was lookin' at me 
 is this even good or is it just bad 
 is this even good or is it just bad
result:while i go down the street 
 you was lookin' at me 
 is this even good or is it just bad 
 is this even good or is it just bad get discouraged you but up up 
 
 you the my the 
 the the the up 
 
 



# Learnings

We are now not predicting letter by letter, but word by word. The model does not have to learn how to spell the words.
After a few iterations we already get some sentences that make sense to a reader!

Some of them seem like actual lyrics, e.g.

**"to get us locked up up up up up up in bust in and knowledge and lies and kick and me if don't you well"**

We now export the new encoder and decoder to our module **tools.training**.

Let's see how a multi-layered LSTM performs in the next notebook.