In [1]:
import tools.processing as pre
import re

# use less text for now to avoid memory error
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac_.txt")

# Passed to create_data_label_pairs below (the printed pairs show 6-word inputs).
TIMESTEPS = 6

# The vocabulary is built from the raw text, before newlines are rewritten.
vocab = pre.Vocabulary(text)

# Replace every real newline with the literal two-character token \n
# (backslash + n), padded with spaces so that it splits off as its own token.
# Per the original note this avoids a null error in the tensorboard projection.
text = text.replace("\n", " \\n ")

# Tokenise on runs of spaces. Collecting the non-space runs directly never
# produces empty strings, so nothing needs trimming afterwards. The previous
# `split(" ")[:-1]` dropped the final real token whenever the text did not end
# in a space/newline, and kept an empty first token when it started with one.
tokens = re.findall(r"[^ ]+", text)

str_data, str_labels = pre.create_data_label_pairs(tokens, TIMESTEPS)

# Preview the first five (input words, label) pairs. Slicing before zipping
# avoids materialising every pair just to display five of them.
print(list(zip(str_data[:5], str_labels[:5])))

[(['as', 'real', 'as', 'it', 'seems', 'the'], 'american'), (['real', 'as', 'it', 'seems', 'the', 'american'], 'dream'), (['as', 'it', 'seems', 'the', 'american', 'dream'], '\\n'), (['it', 'seems', 'the', 'american', 'dream', '\\n'], "ain't"), (['seems', 'the', 'american', 'dream', '\\n', "ain't"], 'nothing')]


In [2]:
import tools.training as tr

# Word <-> index converters, both backed by the vocabulary mappings.
encoder = tr.IndexWordEncoder(
    "Index-Word-Encoding",
    vocab.word2index,
)
decoder = tr.OneHotWordDecoder(
    "1-Hot-Word-Decoding",
    vocab.index2word,
    temperature=0.8,
)

# Encoded inputs and labels that the training cells consume.
data = encoder.encode(str_data)
labels = encoder.encode_labels(str_labels)

# Seed text for sampling: the first input window joined with spaces, with the
# literal \n token turned back into a real newline.
my_seed = " ".join(str_data[0]).replace("\\n", "\n")

# The string pairs are not needed past this point; drop the references.
# (Re-running this cell therefore requires re-running the cell that creates them.)
del str_data, str_labels

In [3]:
import tools.architectures as nn

# Model dimensions
VOCAB_SIZE = len(vocab.word2index)
HIDDEN_LAYER_SIZE = 512
EMBEDDING_SIZE = 256

# Training schedule
EPOCHS = 12
BATCH_SIZE = 256

rnn = nn.EmbeddedSingleLayerRNN(name="multi-pac")
rnn.build(
    HIDDEN_LAYER_SIZE,
    VOCAB_SIZE,
    EMBEDDING_SIZE,
    TIMESTEPS,
    l2_reg=0.0,
)


def sampler(trainable, _):
    """Sampling callback handed to tr.train_model.

    Calls tr.sample with the fixed `my_seed`, the given `trainable`, and the
    notebook-level `encoder` / `decoder` (looked up at call time), using
    length=50. The second positional argument is accepted but ignored.
    """
    return tr.sample(my_seed, trainable, encoder, decoder, length=50)


tr.train_model(
    rnn,
    data,
    labels,
    sampler,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Building model from scratch! 
 Saving into: 'logs/train_model'


Epoch 1/12
Loss:    	 5.619588375091553
Accuracy:	 13.8704252243042
------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the with of the wearin 
 dome hands have no to held leavin grip posse her rather cares motherfucking anybody to bitch 
 school come 
 and know 
 playin' hood until motherfuckin' what waitin 
 one they into 2pacalypse 
 thick and jobs be here's 
 to out i stretch


Epoch 2/12
Loss:    	 5.096010208129883
Accuracy:	 17.32525062561035
------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the sweatsuit got i free 
 my dough hell disguises and 
 i'm peckerwood godfather criminal what i 
 to the clothes of the if 
 handle said the gotta 
 all i better to the car and the for 
 fuck you friend this you 
 us one rumble you'll


Epoch 3/12
Loss:    	 4.387243747711182
Accuracy:	 22.781389236450195
------Sampling----------
seed: 
as real as it seem

In [19]:
# Rebind the notebook-level decoder to the temperature-0.8 one-hot decoder.
decoder = tr.OneHotWordDecoder(
    "1-Hot-Word-Decoding",
    vocab.index2word,
    temperature=0.8,
)


def sampler(seed_text):
    """Call tr.sample on `seed_text` with the current rnn/encoder/decoder, length=40."""
    return tr.sample(seed_text, rnn, encoder, decoder, length=40)


sampler("as real as it seems the")

------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the american dream ain't it cop it before i hold nigga back 
 when they got me trapped 
 they know they got me trapped 
 they done backup now he's my mic 
 with i live my dying 
 with


In [15]:
import numpy as np

class MaxOneHotWordDecoder(tr.Decoder):
    """
    Greedy word decoder: maps a prediction vector to the word stored under the
    position of its largest entry, so the same prediction always yields the
    same word.
    """

    def __init__(self, name, index2word, temperature=0.5):
        """
        Store the index -> word lookup.

        `temperature` is kept on the instance but is not read by `decode`,
        which only takes the argmax.
        """
        super(MaxOneHotWordDecoder, self).__init__(name)
        self.index2word = index2word
        self.temperature = temperature

    def decode(self, predicted):
        """
        Return the word for the arg-max position of `predicted`, prefixed with
        a single space; a literal backslash-n inside the word is converted
        into a real newline character.
        """
        best_index = np.argmax(predicted)
        word = self.index2word[best_index]
        return " " + word.replace("\\n", "\n")

In [17]:
# Rebind the notebook-level decoder to the arg-max decoder.
decoder = MaxOneHotWordDecoder(
    "Max-1-Hot-Word-Decoding",
    vocab.index2word,
)


def sampler(seed_text):
    """Call tr.sample on `seed_text` with the current rnn/encoder/decoder, length=40."""
    return tr.sample(seed_text, rnn, encoder, decoder, length=40)


sampler("as real as it seems the")

------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the american dream 
 and they got me trapped 
 they got me trapped 
 they got me trapped 
 they got me trapped 
 they got me trapped 
 they got me trapped 
 they got me trapped 
 they


In [20]:
import tools.architectures as nn

HIDDEN_LAYER_SIZE = 512
VOCAB_SIZE = len(vocab.word2index)

TIMESTEPS = 6
EPOCHS = 20
BATCH_SIZE = 256

EMBEDDING_SIZE = 256

# Bind the sampling decoder explicitly instead of relying on whichever
# `decoder` an earlier-run cell left in the kernel. Other cells rebind it
# (one of them to the arg-max-only MaxOneHotWordDecoder), so without this line
# the per-epoch samples depend on cell execution order. It is created before
# the model, matching the order in which the recorded run executed.
decoder = tr.OneHotWordDecoder("1-Hot-Word-Decoding", vocab.index2word, temperature=0.8)

# NOTE(review): `data` / `labels` are still the pairs encoded from
# clean2_pac_.txt earlier in the notebook, while this model is named
# "multi-rakim" -- confirm which corpus this run is meant to train on.
rnn = nn.EmbeddedSingleLayerRNN(name = "multi-rakim")
rnn.build(HIDDEN_LAYER_SIZE, VOCAB_SIZE, EMBEDDING_SIZE, TIMESTEPS, l2_reg=0.0)


def sampler(trainable, _):
    """Sampling callback handed to tr.train_model.

    Calls tr.sample with the fixed `my_seed`, the given `trainable`, and the
    notebook-level `encoder` / `decoder`, using length=50. The second
    positional argument is accepted but ignored.
    """
    return tr.sample(my_seed, trainable, encoder, decoder, length=50)


tr.train_model(rnn, data, labels, sampler, epochs=EPOCHS, batch_size=BATCH_SIZE)

Building model from scratch! 
 Saving into: 'logs/train_model'


Epoch 1/20
Loss:    	 5.639863967895508
Accuracy:	 13.636695861816406
------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the the ain't 
 tip with the be to 
 another the bodies resist send 
 tumble to type niggas you 
 made all you this 
 knot one 
 hand givin so the like 
 confused but 
 you live shaped now 
 i do while about 
 you you


Epoch 2/20
Loss:    	 5.147342681884766
Accuracy:	 17.383682250976562
------Sampling----------
seed: 
as real as it seems the
-
result: 
as real as it seems the the dope 
 block the depend gat position at in the you're can is the floor 
 got the packs the mc buck homie on the facts around 
 no aim cause sayin with frame dopes harrassing 
 motherfuckers alley hate rest when for type motherfucker 
 cause to do


Epoch 3/20
Loss:    	 4.418256759643555
Accuracy:	 22.72295570373535
------Sampling----------
seed: 
as real as it seems the
-
result: 
as real 

In [23]:
def sampler(seed_text):
    """Call tr.sample on `seed_text` with the current rnn/encoder/decoder, length=40.

    `decoder` is read from the notebook globals at call time, so the result
    depends on which decoder-defining cell was executed last.
    """
    return tr.sample(seed_text, rnn, encoder, decoder, length=40)


sampler("i walkin along the street \n dropped out of school")

------Sampling----------
seed: 
i walkin along the street 
 dropped out of school
-
result: 
i walkin along the street 
 dropped out of school you 
 on the always i was now 
 i got a gun now 
 got this shit 
 got me blowin' up you light figure 
 no was no you how 
 me a often you to and get
