In [1]:
import importlib
import numpy as np
import pandas as pd
import utils

In [2]:
# Re-import utils so that edits made to utils.py take effect without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/stacked_lstm_statefull/utils.py'>

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization, Bidirectional
from keras.preprocessing.sequence import pad_sequences
# from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
# Token <-> index lookup tables supplied by the project's utils module.
# NOTE(review): presumably character-level, since implement() splits its seed with list() — confirm in utils.
word2ind, ind2word = utils.load_index_word_map()

In [5]:
# Vocabulary size: used as the Embedding input_dim and as the width of the softmax output layer.
vocab_size = len(word2ind)

In [6]:
# Headerless table whose first column becomes the index; the index values are the candidate
# lyrics lengths, and column 1 is later read as the batch size for the chosen length.
batchs = pd.read_csv('data/batch_sizes.csv', index_col=0, header=None)

In [13]:
# Pick one lyrics length at random from the table's index; it decides which train_<len>.csv
# is loaded. The draw is unseeded, so a re-run may select a different training file.
lyrics_len = np.random.choice(batchs.index.tolist(), size=1)[0]
lyrics_len

806

In [14]:
# Training data for the chosen lyrics length: every column but the last is fed to fit() as
# input and the last column is used as the target.
dataset = pd.read_csv('data/train_{}.csv'.format(lyrics_len))

In [15]:
# Preview the first rows of the integer-encoded training data.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,36,57,1,72,54,1,72,50,68,1,...,54,1,52,64,67,63,54,67,1,67
1,30,55,1,74,64,70,1,61,64,71,...,61,53,1,63,54,71,54,67,1,51
2,30,69,4,68,1,54,58,69,57,54,...,68,58,63,60,58,63,56,1,53,54
3,34,58,68,68,1,36,69,58,68,1,...,61,64,71,54,1,72,50,68,1,56
4,30,4,62,1,63,64,69,1,72,57,...,58,63,56,1,69,57,50,69,1,30


In [16]:
# (rows, columns) of the training table.
dataset.shape

(27324, 301)

In [17]:
emb_dim = 128  # size of each learned embedding vector (Embedding output_dim)
input_length = 300  # number of tokens in each input sequence fed to the model
# Batch size recorded for this lyrics length; the stateful model below fixes it through
# batch_input_shape.
batch_size = batchs.loc[lyrics_len, 1]

In [18]:
# Show the batch size looked up for the chosen lyrics length.
batch_size

54

In [19]:
# Training network: embedding -> two stacked bidirectional stateful LSTMs (each followed by
# batch normalisation and dropout) -> softmax over the vocabulary.
# batch_input_shape pins the batch size because the LSTM layers are stateful.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(batch_size, input_length),
    ),
    # First recurrent layer returns the full sequence so the second one can consume it.
    Bidirectional(
        LSTM(200, return_sequences=True, dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    # Second recurrent layer only emits its final output.
    Bidirectional(
        LSTM(100, dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    Dense(vocab_size, activation='softmax'),
])

In [20]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (54, 300, 128)            9728      
_________________________________________________________________
bidirectional_1 (Bidirection (54, 300, 400)            526400    
_________________________________________________________________
batch_normalization_1 (Batch (54, 300, 400)            1600      
_________________________________________________________________
dropout_1 (Dropout)          (54, 300, 400)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (54, 200)                 400800    
_________________________________________________________________
batch_normalization_2 (Batch (54, 200)                 800       
_________________________________________________________________
dropout_2 (Dropout)          (54, 200)                 0         
__________

In [21]:
# Warm start: copy the weights of the previously saved model into the freshly built one.
# The .h5 file must already exist and its weight arrays must match this model's shapes.
model.set_weights(load_model('model_only_lstm.h5').get_weights())

In [22]:
# sparse_categorical_crossentropy takes integer class ids as targets (no one-hot encoding),
# which matches the single target column passed to fit().
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [23]:
# Train for up to 5 single-epoch passes, stopping early once training accuracy exceeds 0.55.
#
# The LSTM layers are stateful: the state left by sample i of one batch is the initial state
# for sample i of the next batch. Keras shuffles samples by default, which would scramble
# that correspondence, so shuffle=False is required.
# NOTE(review): this assumes the CSV rows are already ordered so that consecutive batches
# continue each other — confirm against the script that writes data/train_<len>.csv.
features = dataset.iloc[:, :-1].values   # every column except the last: input token ids
targets = dataset.iloc[:, [-1]].values   # last column, kept 2-D: next-token id

for epoch in range(5):
    print(epoch)
    # Start every pass over the data from a clean recurrent state.
    model.reset_states()
    model_hist = model.fit(
        features,
        targets,
        epochs=1,
        batch_size=batch_size,
        shuffle=False,
    )
    # Older Keras releases report the metric as 'acc', newer ones as 'accuracy'.
    history = model_hist.history
    train_acc = history['acc' if 'acc' in history else 'accuracy'][0]
    if train_acc > .55:
        break

0
Epoch 1/1
1
Epoch 1/1
2
Epoch 1/1
3
Epoch 1/1
4
Epoch 1/1


In [24]:
# Save the full model, overwriting the same file the weights were loaded from earlier.
model.save('model_only_lstm.h5')

In [25]:
# Inference network: same layers as the training model, but built with a batch size of 1
# (stateful layers fix the batch size at construction time) so that a single sequence can
# be fed to predict().
pred_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(1, input_length),
    ),
    Bidirectional(
        LSTM(200, return_sequences=True, dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    Bidirectional(
        LSTM(100, dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    Dense(vocab_size, activation='softmax'),
])

In [26]:
# Transfer the trained weights; this works because both models have identical layers and
# differ only in the batch size fixed by batch_input_shape.
pred_model.set_weights(model.get_weights())

In [27]:
def _sample_top_k(probs, k):
    """Draw one index from the ``k`` most probable entries of ``probs``.

    The selected probabilities are renormalised in float64 before sampling: the model
    emits float32, and renormalising at that precision can leave a sum that is not close
    enough to 1 for ``np.random.choice`` to accept.
    """
    top = np.argsort(probs)[-k:]
    weights = np.asarray(probs, dtype=np.float64)[top]
    return np.random.choice(top, p=weights / weights.sum())


def implement(seed_text, maxlen=100, must_stop=2000, n_likely=5):
    """Generate text by repeatedly sampling the next token from ``pred_model``.

    Relies on the notebook globals ``pred_model``, ``word2ind``, ``ind2word`` and ``utils``.

    Parameters
    ----------
    seed_text : str
        Text that starts the generation; it is split into single characters before
        being handed to ``utils.tokenise``.
    maxlen : int
        Length of the window fed to the model at each step. It must equal the input
        length ``pred_model`` was built with (300 in this notebook); note that the
        default of 100 does not match that model.
    must_stop : int
        Number of tokens to generate.
    n_likely : int
        At each step, sample only among this many most probable tokens.

    Returns
    -------
    str
        The seed followed by the generated tokens, detokenised and joined.
    """
    # NOTE(review): utils.tokenise presumably maps each character to its integer id —
    # confirm in utils.
    res_tokens = list(utils.tokenise(word2ind, list(seed_text)))

    # The model is stateful: clear any state left over from a previous generation run.
    pred_model.reset_states()

    for _ in range(must_stop):
        # Feed the most recent `maxlen` tokens; pad_sequences left-pads with zeros while
        # fewer than `maxlen` tokens are available.
        window = pad_sequences([res_tokens[-maxlen:]], maxlen=maxlen)
        probs = pred_model.predict(window, batch_size=1)[0]
        res_tokens.append(_sample_top_k(probs, n_likely))

    return ''.join(utils.detokenise(ind2word, res_tokens))

In [28]:
# Generate 600 tokens from the seed; maxlen=300 matches the input_length the model was built with.
res = implement('We are friends, good friends', 300, 600, 5)

In [29]:
# Show the seed followed by the generated continuation.
print(res)

We are friends, good friends UV01L4LLUW0.? LVLXWQIV? UUV4'U0V0ULUTVZL0T0LUTV1L1V1V1ULVULV0ZL1V1UUV1UUV0UUUVV11V1UUV1UUUVUUV0UUT UUUVZUUV0VZL4V1UUUVV4ZUUV0TVLLUL1VZUUV4!6VT IV1UVUUUUVUI?VI IPV681UUUTVLLLUIKK IU1V1V1V6?Ky IV1TUVULUUV1UULUV0UUUV0ULVV4ZLUVZUUVZUUV0V1?K IV0VZVZLX11VZUUV0!6K IVQL1UPUVV11ZUVUV0LLVXXVVI?? IUVUTULVULUTVZI LUUVUTVQLZU6UVVLX1V1UUV0?VT UVV100K31VLLUUVV6LVX1VULVUVVX1VVX1VVLXV1V1T?!TKyV1T?!UVVI!KVI IV1I!VKI?K IU6VUI LVI!3VVLUTV6LLTV3TKLLI UTV1UV0LVV1V84VVy?? IVXUUT?LUT!TKLLUTV1UVV1UVV1UVVI? IV1 IU1V1UVVLLVLXM1V84!UT LVI?KiLVV6ZVX1VLLUVXV4??? UVUKUKVX1VVLX4Vy?K IKZL1V1VZL4V1VZU1V1TKLLVLXV1TLLUUUUVVUNLX
