In [1]:
import importlib
import numpy as np
import pandas as pd
import utils

In [2]:
# Re-import utils so that edits made to utils.py are picked up without
# restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/stacked_lstm_statefull/utils.py'>

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization
from keras.preprocessing.sequence import pad_sequences
# from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
# Load the two lookup tables that are later passed to utils.tokenise and
# utils.detokenise.
# NOTE(review): presumably word2ind maps token -> integer id and ind2word is
# the inverse mapping; confirm in utils.py.
word2ind, ind2word = utils.load_index_word_map()

In [5]:
# Number of entries in the token table; used below as the Embedding input
# dimension and as the width of the softmax output layer.
vocab_size = len(word2ind)

In [6]:
# Load the training table (path is relative to the notebook's working
# directory). The training cell uses the first 100 columns as the input
# token ids and column 100 as the target token id.
dataset = pd.read_csv('train.csv')

In [7]:
# Preview the first five rows to sanity-check the parsed token ids.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,44,57,54,63,1,30,1,72,50,68,...,54,1,56,64,63,54,1,1,0,1
1,36,63,54,1,53,50,74,1,64,55,...,1,68,57,50,67,54,1,74,64,70
2,30,1,72,50,63,69,1,69,64,1,...,1,74,64,70,4,67,54,1,64,63
3,25,64,1,74,64,70,1,54,71,54,...,71,67,74,51,64,53,74,4,68,1
4,39,64,64,69,68,1,35,50,69,69,...,67,50,63,52,57,54,68,1,1,0


In [8]:
# (rows, columns) of the loaded frame.
dataset.shape

(59092, 101)

In [9]:
# Width of each learned token-embedding vector.
emb_dim = 100
# Number of token ids fed to the network per sample.
input_length = 100
# The stateful layers below are built with a fixed batch size; 79 divides the
# 59092 training rows evenly (748 full batches), so no partial batch is left.
batch_size = 79

In [10]:
# Training network: token embedding -> two Conv1D/MaxPooling stages that shrink
# the sequence from 100 to 25 steps -> two stacked stateful LSTMs -> softmax
# over the vocabulary. The batch size is fixed in the input shape because the
# LSTM layers are stateful.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(batch_size, input_length),
    ),
    # First convolutional stage; pooling halves the sequence length (100 -> 50).
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    # Second convolutional stage; pooling halves the length again (50 -> 25).
    Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(100, return_sequences=True, dropout=.2, recurrent_dropout=.2, stateful=True),
    BatchNormalization(),
    # Second LSTM returns only its final output vector.
    LSTM(100, dropout=.2, recurrent_dropout=.2, stateful=True),
    BatchNormalization(),
    Dropout(.2),
    # One probability per vocabulary entry for the next token.
    Dense(vocab_size, activation='softmax'),
])

In [11]:
# Print the per-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (79, 100, 100)            7600      
_________________________________________________________________
conv1d_1 (Conv1D)            (79, 100, 256)            102656    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (79, 50, 256)             0         
_________________________________________________________________
dropout_1 (Dropout)          (79, 50, 256)             0         
_________________________________________________________________
conv1d_2 (Conv1D)            (79, 50, 32)              32800     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (79, 25, 32)              0         
_________________________________________________________________
lstm_1 (LSTM)                (79, 25, 100)             53200     
__________

In [12]:
# The sparse variant of categorical cross-entropy is used because the targets
# are integer token ids rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [13]:
# Slice the inputs and targets once instead of on every epoch: the first
# `input_length` columns are the input token ids, the next column is the
# target token id (kept 2-D via the list index).
train_inputs = dataset.iloc[:, :input_length].values
train_targets = dataset.iloc[:, [input_length]].values

for _ in range(5):
    # Stateful LSTMs keep their state from one batch to the next, so clear it
    # at the start of every pass over the data.
    model.reset_states()
    # shuffle=False is required for stateful RNNs: sample i of a batch
    # continues from the state left by sample i of the previous batch, which
    # is meaningless if fit() reorders the rows (its default).
    # NOTE(review): this assumes train.csv is laid out so that row
    # i + batch_size continues row i -- confirm against the script that
    # produced the file.
    model.fit(train_inputs, train_targets, epochs=1, batch_size=batch_size, shuffle=False)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [14]:
# Persist the trained model to model.h5 in the working directory (Keras
# stores the architecture, weights and optimizer state in this file).
model.save('model.h5')

In [15]:
# Inference twin of `model`, built for a batch size of 1 so that a single seed
# sequence can be fed to the stateful LSTMs. The two standalone Dropout layers
# of the training network are left out; they hold no weights, so the weight
# list of `model` still lines up layer-for-layer with this network.
pred_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(1, input_length),
    ),
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(100, return_sequences=True, dropout=.2, recurrent_dropout=.2, stateful=True),
    BatchNormalization(),
    # Second LSTM returns only its final output vector.
    LSTM(100, dropout=.2, recurrent_dropout=.2, stateful=True),
    BatchNormalization(),
    # One probability per vocabulary entry for the next token.
    Dense(vocab_size, activation='softmax'),
])

In [16]:
# Copy the trained weights into the batch-size-1 network. This works although
# pred_model omits the Dropout layers, because Dropout layers have no weights.
pred_model.set_weights(model.get_weights())

In [17]:
def implement(seed_text, maxlen=100, must_stop=2000, top_k=3):
    """Continue ``seed_text`` by repeatedly sampling the next token from ``pred_model``.

    The seed is split into single characters and tokenised with
    ``utils.tokenise``. At every step the last ``maxlen`` tokens are
    left-padded to ``maxlen`` and fed to ``pred_model``; the next token is
    drawn at random from the ``top_k`` most probable candidates, weighted by
    their renormalised probabilities.

    Parameters
    ----------
    seed_text : str
        Text the generation starts from.
    maxlen : int
        Length of the context window passed to the model. Must match the
        input length ``pred_model`` was built with.
    must_stop : int
        Number of tokens to generate; values <= 0 generate nothing.
    top_k : int
        How many of the most probable tokens are eligible at each step
        (default 3, the previously hard-coded value).

    Returns
    -------
    str
        The output of ``utils.detokenise`` for the seed tokens followed by
        the generated tokens, joined into one string.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError('top_k must be at least 1')

    # pred_model is stateful: without this reset every call would start from
    # the recurrent state left behind by the previous call, making the output
    # depend on the notebook's execution history.
    pred_model.reset_states()

    res_tokens = list(utils.tokenise(word2ind, list(seed_text)))
    remaining = must_stop
    while remaining > 0:
        # pad_sequences left-pads with zeros up to maxlen; slicing first keeps
        # only the most recent maxlen tokens as context.
        window = pad_sequences([res_tokens[-maxlen:]], maxlen=maxlen)
        probs = pred_model.predict(window, batch_size=1)[0]
        # Restrict sampling to the top_k most likely tokens and renormalise
        # their probabilities so they sum to 1.
        candidates = np.argsort(probs)[-top_k:]
        weights = probs[candidates] / np.sum(probs[candidates])
        predicted = int(np.random.choice(candidates, p=weights))
        res_tokens.append(predicted)
        remaining -= 1
    detokenised = utils.detokenise(ind2word, res_tokens)
    return ''.join(detokenised)

In [18]:
# Generate 200 tokens after the seed, using a 100-token context window
# (the same length the prediction model was built with).
res = implement('We are friends, good friends', 100, 200)

In [19]:
# print() rather than a bare expression, so the line breaks in the generated
# text are rendered instead of being shown as escape sequences.
print(res)

We are friends, good friends the world the wattenet  
I've got to ba be the to wast the time  
Any the wat all the tilled there'de stast and and the say thin't to see  
And and  
And thite  
So  
And to to bangen  
She to so tha
