In [1]:
import importlib
import numpy as np
import pandas as pd
import utils

In [2]:
# Re-import the project-local `utils` module so that edits made to utils.py
# take effect without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/stacked_lstm_statefull/utils.py'>

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization, Bidirectional
from keras.preprocessing.sequence import pad_sequences
# from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
# Load the two vocabulary lookup tables from the project-local `utils` module.
# Presumably token -> index and index -> token at character level (the seed
# text is split with list() before tokenising) — confirm in utils.py.
word2ind, ind2word = utils.load_index_word_map()

In [5]:
# Number of vocabulary entries; used as the Embedding input_dim and as the
# width of the final softmax layer of both networks.
vocab_size = len(word2ind)

In [6]:
# Headerless lookup table whose first CSV column becomes the index. It is
# later indexed by the chosen lyrics length, with column 1 supplying the
# batch size for that length.
batchs = pd.read_csv('data/batch_sizes.csv', index_col=0, header=None)

In [14]:
# Pick one of the index values of the batch-size table at random; the value
# selects which train_<value>.csv file is loaded below.
# NOTE(review): no random seed is set in the visible cells, so each run may
# select a different training file — confirm this is intended.
lyrics_len = np.random.choice(batchs.index.tolist(), size=1)[0]
lyrics_len

542

In [15]:
# Load the training table that corresponds to the sampled lyrics length.
dataset = pd.read_csv('data/train_{}.csv'.format(lyrics_len))

In [16]:
# Preview the first rows of the training table.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,35,64,72,1,58,69,4,68,1,69,...,56,57,69,1,1,0,24,61,64,68
1,41,57,54,1,61,54,50,71,54,68,...,54,65,69,54,62,51,54,67,1,1
2,30,1,53,64,63,4,69,1,52,50,...,1,68,64,62,54,53,50,74,7,1
3,33,64,72,1,50,63,53,1,61,64,...,64,64,1,1,0,46,64,70,4,67
4,24,50,63,4,69,1,67,70,63,1,...,0,41,57,54,1,68,72,54,54,69


In [17]:
# (rows, columns) of the training table; all columns but the last are fed to
# the network as input and the last one is the prediction target.
dataset.shape

(9438, 301)

In [18]:
# Hyper-parameters shared by the training and the prediction network.
emb_dim = 128  # size of each embedding vector (Embedding output_dim)
input_length = 300  # number of tokens per input sample
# Batch size registered for the chosen lyrics length (column 1 of the
# batch-size table); it is baked into the stateful training model's input shape.
batch_size = batchs.loc[lyrics_len, 1]

In [19]:
# Show the batch size that was looked up for this lyrics length.
batch_size

39

In [20]:
# Training network: an embedding followed by two
# (Conv1D -> max-pool -> dropout -> stateful bidirectional LSTM -> batch-norm)
# stages and a softmax over the vocabulary.
# batch_input_shape fixes the batch size, which stateful LSTMs require.
# NOTE(review): both LSTMs are given activation='softmax' rather than the
# layer's default activation; this looks unintended — confirm before
# retraining (the prediction network must be changed in the same way).
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(batch_size, input_length),
    ),
    # Stage 1: convolutional features, sequence length halved by pooling.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(
            50,
            activation='softmax',
            return_sequences=True,
            dropout=.2,
            recurrent_dropout=.2,
            stateful=True,
        )
    ),
    BatchNormalization(),
    # Stage 2: same pattern; the LSTM returns only its final output.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(
            100,
            activation='softmax',
            dropout=.2,
            recurrent_dropout=.2,
            stateful=True,
        )
    ),
    BatchNormalization(),
    Dropout(.2),
    # One probability per vocabulary entry for the next token.
    Dense(vocab_size, activation='softmax'),
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (39, 300, 128)            9728      
_________________________________________________________________
conv1d_3 (Conv1D)            (39, 300, 256)            131328    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (39, 150, 256)            0         
_________________________________________________________________
dropout_4 (Dropout)          (39, 150, 256)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (39, 150, 100)            122800    
_________________________________________________________________
batch_normalization_3 (Batch (39, 150, 100)            400       
_________________________________________________________________
conv1d_4 (Conv1D)            (39, 150, 256)            102656    
__________

In [22]:
# Warm start: copy the weights of the previously saved model into the freshly
# built network.
# NOTE(review): 'model_alter.h5' is the same file this notebook writes after
# training, so on a first run (no checkpoint yet) this cell presumably
# fails — confirm and guard if a from-scratch run is needed.
model.set_weights(load_model('model_alter.h5').get_weights())

In [23]:
# The target is a single column of integer token ids, hence the sparse
# categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [24]:
# Train for at most 5 passes over the data, one epoch per fit() call so the
# LSTM states can be reset between passes; stop early once training accuracy
# exceeds 0.55.
#
# shuffle=False is required because the LSTMs are stateful: Keras carries the
# state of sample i in one batch over to sample i of the next batch, so the
# row order of `dataset` must be preserved. fit() shuffles by default, which
# would pair each carried-over state with an unrelated sample.
for epoch in range(5):  # a named variable avoids shadowing IPython's `_`
    print(epoch)
    model.reset_states()
    model_hist = model.fit(
        dataset.iloc[:, :-1].values,   # every column but the last: input tokens
        dataset.iloc[:, [-1]].values,  # last column: token to predict
        epochs=1,
        batch_size=batch_size,
        shuffle=False,
    )
    # Older Keras reports the metric as 'acc', newer releases as 'accuracy'.
    history = model_hist.history
    epoch_acc = history['acc'] if 'acc' in history else history['accuracy']
    if epoch_acc[0] > .55:
        break

0
Epoch 1/1
1
Epoch 1/1
2
Epoch 1/1
3
Epoch 1/1
4
Epoch 1/1


In [25]:
# Save the trained model to the same path the warm-start cell loads from, so
# the next run of the notebook continues from these weights.
model.save('model_alter.h5')

In [26]:
# Prediction network: the same layer stack as the training network, but built
# with a batch size of 1 so that a single sequence can be fed at a time
# (a stateful model's batch size is fixed by batch_input_shape).
# NOTE(review): the layer arguments must stay in step with the training
# network, including the LSTMs' activation='softmax', which looks
# unintended — confirm.
pred_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(1, input_length),
    ),
    # Stage 1: convolutional features, sequence length halved by pooling.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(
            50,
            activation='softmax',
            return_sequences=True,
            dropout=.2,
            recurrent_dropout=.2,
            stateful=True,
        )
    ),
    BatchNormalization(),
    # Stage 2: same pattern; the LSTM returns only its final output.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(
            100,
            activation='softmax',
            dropout=.2,
            recurrent_dropout=.2,
            stateful=True,
        )
    ),
    BatchNormalization(),
    Dropout(.2),
    # One probability per vocabulary entry for the next token.
    Dense(vocab_size, activation='softmax'),
])

In [27]:
# Copy the trained weights into the batch-size-1 prediction network; both
# networks are built from the same layer stack, so the weight lists line up.
pred_model.set_weights(model.get_weights())

In [28]:
def implement(seed_text, maxlen=100, must_stop=2000, n_likely=5):
    """Continue `seed_text` by sampling tokens from `pred_model` one at a time.

    Args:
        seed_text: Text to start from; it is split into single characters
            before being tokenised with `word2ind`.
        maxlen: Length the token window is padded/truncated to before each
            prediction (passed to `pad_sequences`).
        must_stop: Number of tokens to generate.
        n_likely: How many of the most probable tokens are kept as sampling
            candidates at each step.

    Returns:
        The seed text followed by the generated tokens, detokenised with
        `ind2word` and joined into a single string.
    """
    window = utils.tokenise(word2ind, list(seed_text))
    generated = list(window)

    # Start from a clean recurrent state; it is then carried across steps.
    pred_model.reset_states()

    remaining = must_stop
    while remaining > 0:
        batch = pad_sequences([window], maxlen=maxlen)
        distribution = pred_model.predict(batch, batch_size=1)[0]

        # Keep the n_likely most probable tokens and renormalise their
        # probabilities so they sum to one.
        candidates = np.argsort(distribution)[-n_likely:]
        weights = distribution[candidates]
        weights = weights / np.sum(weights)
        next_token = np.random.choice(candidates, p=weights)

        # The padded window plus the new token becomes the next input.
        window = batch[0].tolist()
        window.append(next_token)
        generated.append(next_token)
        remaining -= 1

    return ''.join(utils.detokenise(ind2word, generated))

In [29]:
# Generate 800 tokens from the seed, sampling among the 5 most likely tokens
# at each step; maxlen=300 matches the networks' input_length.
res = implement('We are friends, good friends', 300, 800, 5)

In [30]:
# print() is used so that the newlines in the generated text are rendered.
print(res)

We are friends, good friends ale oh


(  

( 

(e


(e
(
(
(  

(e
(( 

(eh I heao ( oneee((((
((
((eouu(((
((((((((
(((ouueueieueiuueueeieeeeuuuieeeeuuueeeueieuieeueeieeyeyeeiieeyeeeuiueiiieeiueeeeeieieeeeyeieeuiuiieeieee eieiuiieeeueieeeeoeieeyy oeeeeeeeeeieiuieueeeeeoeeuieeieueeueeeuuieueieeeeeeieiueiueieiieeueieuieeeeeeiieeeeiieieeeyeyeeeeiieueieeeaiueyeeeeeeueiieieuieeyoeeuynninuuieiiennnnnnnnnnnnnnnnnnuuunrrrrrraurninnoatniitniitnnntnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnroniaanaainiiiniiiniiiniiinoainiainaiini cniiandiano inna nniinnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnntntnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnntniiiii iniiin-rrrarrr-rrrarrr-rrmrrrriai--] --mmmp

--

--

--eem

ne 
yy 
en 
y e
yeen
e
e
ep
 e
 nee
ynee
e
e

 eeen  en 
e  y na
n een  a
y y e
 ye 
eeene

yyeeenne

yee 
