In [1]:
import tools.processing as pre
import re

# Only the first 5000 characters of the lyrics file are used.
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac_.txt")[:5000]

# The vocabulary is built from the text as loaded, before line breaks are rewritten.
vocab = pre.Vocabulary(text)

# Number of tokens in each input window handed to create_data_label_pairs.
TIMESTEPS = 10

# Each line break becomes a standalone, space-delimited literal backslash-n token.
# Per the original author's note, the escaped form avoids a null error in the
# TensorBoard projection.
text = text.replace("\n", " \\n ")

# Collapse runs of spaces, split on single spaces, then drop the final piece.
single_spaced = re.sub(" +", " ", text)
tokens = single_spaced.split(" ")
tokens = tokens[:-1]

str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# Show the first five (window, next word) pairs as a sanity check.
pairs = list(zip(str_data, str_labels))
print(pairs[:5])

[(['as', 'real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't"], 'nothing'), (['real', 'as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing'], 'but'), (['as', 'it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but'], 'another'), (['it', 'seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another'], 'calculated'), (['seems', 'the', 'american', 'dream', '\\n', "ain't", 'nothing', 'but', 'another', 'calculated'], 'schemes')]


In [6]:
import numpy as np 
import tools.training as tr

class OneHotWordEncoder(tr.Encoder):
    """
    Encodes sequences of words to sequences of 1-Hot Encoded vectors.

    Every vector has ``len(word2index)`` slots; the slot selected by
    ``word2index[word]`` is set to 1 and all others stay 0. A word that is
    missing from ``word2index`` makes the lookup fail (``KeyError`` when
    ``word2index`` is a plain dict).
    """

    def __init__(self, name, word2index):
        """
        :param name: forwarded unchanged to the ``tr.Encoder`` constructor
        :param word2index: mapping from a word to its vector slot
        """
        super(OneHotWordEncoder, self).__init__(name)
        self.word2index = word2index

    def encode(self, sequences):
        """
        One-hot encode already tokenised sequences.

        :param sequences: iterable of word sequences
        :return: ``np.array`` built from one ``(len(seq), len(word2index))``
                 float matrix per input sequence
        """
        encoded_sequences = []
        for seq in sequences:

            encoded = np.zeros((len(seq), len(self.word2index)))

            for idx, word in enumerate(seq):
                encoded[idx][self.word2index[word]] = 1

            encoded_sequences.append(encoded)

        return np.array(encoded_sequences)

    def encode_raw(self, text):
        """
        Tokenise a raw string and one-hot encode it as a single sequence.

        Line breaks are rewritten to a standalone literal backslash-n token,
        the text is split on spaces, and empty pieces are discarded.

        :param text: raw text; words are separated by spaces and/or newlines
        :return: array of shape ``(1, number_of_words, len(word2index))``
        """
        text = text.replace("\n", " \\n ")

        # str.replace(" +", " ") only matches the literal two characters
        # space-plus, so it never collapsed repeated spaces. Dropping the empty
        # pieces here does that job and keeps all-zero rows out of the result.
        words = [word for word in text.split(" ") if word]

        encoded = np.zeros((len(words), len(self.word2index)))

        for idx, word in enumerate(words):
            encoded[idx][self.word2index[word]] = 1

        return np.array([encoded])

    def encode_labels(self, labels):
        """
        One-hot encode a flat collection of label words.

        :param labels: iterable of words
        :return: integer array with one ``len(word2index)``-long row per label
        """
        encoded = []

        for label in labels:
            one_hot_vec = np.zeros(len(self.word2index), dtype=int)
            one_hot_vec[self.word2index[label]] = 1
            encoded.append(one_hot_vec)

        return np.array(encoded)
    
class OneHotWordDecoder(tr.Decoder):
    """
    Turns a prediction vector back into a word.

    The prediction is first passed through ``tr.sample_from_distribution``
    with this decoder's temperature; the index of the largest entry of the
    result is then looked up in ``index2word``.
    """
    def __init__(self, name, index2word, temperature=0.5):
        """
        :param name: forwarded unchanged to the ``tr.Decoder`` constructor
        :param index2word: lookup from a vector slot to its word
        :param temperature: value handed to ``tr.sample_from_distribution``
        """
        super(OneHotWordDecoder, self).__init__(name)
        self.index2word = index2word
        self.temperature = temperature

    def decode(self, predicted):
        """
        :param predicted: prediction for a single step
        :return: the chosen word prefixed with one space; the literal
                 backslash-n token is turned back into a real line break
        """
        sampled = tr.sample_from_distribution(predicted, temperature=self.temperature)
        chosen_word = self.index2word[np.argmax(sampled)]
        return " " + chosen_word.replace("\\n", "\n")

In [7]:
import tools.training as tr

# Encoder and decoder share the vocabulary's two lookup tables.
encoder = OneHotWordEncoder(name="1-Hot-Word-Encoding", word2index=vocab.word2index)
decoder = OneHotWordDecoder(name="1-Hot-Word-Decoding", index2word=vocab.index2word)

# One-hot encode the input windows and their matching next-word labels.
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

In [8]:
import tools.architectures as nn

# Model size
HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()

# Training schedule
EPOCHS = 20
BATCH_SIZE = 256


def sampler(trainable, seed_text):
    """Sample from `trainable` starting at `seed_text`, with length=100, using the notebook's encoder/decoder."""
    return tr.sample(seed_text, trainable, encoder, decoder, length=100)


rnn = nn.SingleLayerRNN(name="basic")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)

# `sampler` is handed to train_model as a callback taking (trainable, seed_text).
tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/20
Loss:    	 5.780169486999512
Accuracy:	 12.285455703735352
------Sampling----------
seed: 	as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up
result:as real as it seems the american dream
ain't nothing but another calculated schemes
to get us locked up arteries unity sleepin woke panter battle mother coming wasnt outs then give intimidation not do trace rich screw strike no so wasnt strictly paniced time tasted sell ya our keep wrong how for your other lies send win me coming first forget about fathers signaled more be loose comes bad gone wrong hear because its cigarettes sick locked be it perpetrator ha dying use busy we walked boy rock guaranteed you kept first another running hold wit panter my schemes i'm onetwo at answer nimth boy full lied name see have true promise won't poor ready hard me except charged


Epoch 2/20
Loss:    	 5.359053134918213
Accuracy:	 12.285455703735352
------Sampling----------
seed: 	as rea

NameError: name 'train_model' is not defined

In [None]:
import tools.processing as pre
import tools.architectures as nn

# NOTE(review): this cell has never been executed (In [None]). It rebinds
# `text`, `vocab`, `TIMESTEPS`, `data` and `labels` from the cells above and
# uses `vocab.making_one_hot` / `nn.train` rather than the encoder-based
# `tr.train_model` path -- confirm these calls still exist before running.
text = pre.get_text("data/cleaned-rap-lyrics/cleanrakim_.txt")
vocab = pre.Vocabulary(text)

# Model size
HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = vocab.get_size()
TIMESTEPS = 20

# Training schedule
EPOCHS = 20
BATCH_SIZE = 256

data, labels = vocab.making_one_hot(text, TIMESTEPS)

rnn = nn.SingleLayerRNN(name="basic-rakim")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, TIMESTEPS, l2_reg=0.0)

nn.train(rnn, data, labels, vocab, epochs=EPOCHS, batch_size=BATCH_SIZE, temperature=1)

# Problem

How can we work out proper features from the text?

Just because a line does not match the original one 100% does not mean that it is bad.