In [4]:
import importlib
from glob import glob
import numpy as np
import pandas as pd
import utils

In [5]:
# Re-import the local `utils` module so edits to utils.py take effect
# without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/stacked_lstm_statefull/utils.py'>

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization, Bidirectional
from keras.preprocessing.sequence import pad_sequences
# from keras.optimizers import Adam

Using TensorFlow backend.


In [6]:
# Load the two lookup tables returned by utils.load_index_word_map().
# Presumably word2ind maps token -> integer id and ind2word is the inverse
# (they are passed to utils.tokenise / utils.detokenise) -- confirm in utils.py.
word2ind, ind2word = utils.load_index_word_map()

In [7]:
# Number of entries in the token map; used as the embedding's input_dim and
# as the width of the final softmax layer.
vocab_size = len(word2ind)

In [9]:
# Pick one file at random among those matching data/train_*.csv.
# NOTE(review): no random seed is set, so a re-run may select a different file
# (and therefore a different batch_size downstream).
train_filename = np.random.choice(glob('data/train_*.csv'), size=1)[0]
train_filename

'data/train_1022.csv'

In [11]:
# Read the selected training file. Its row count becomes the batch size, so
# that one batch holds exactly one window from every row of the file.
before_scan = pd.read_csv(train_filename)
batch_size = len(before_scan)
batch_size

47

In [12]:
def scanning(array, input_length):
    """Cut a 2-D array into overlapping column windows and stack them row-wise.

    For every offset ``i`` in ``range(array.shape[1] - input_length)`` the
    slice ``array[:, i:i + input_length + 1]`` is taken. The slices are then
    concatenated along axis 0 in offset order, so the first ``array.shape[0]``
    rows of the result hold window 0 of every input row, the next
    ``array.shape[0]`` rows hold window 1, and so on.

    Parameters
    ----------
    array : numpy.ndarray
        2-D array; each row is treated as one sequence.
    input_length : int
        Number of leading columns in each window. Every window carries one
        extra trailing column, giving a width of ``input_length + 1``.

    Returns
    -------
    numpy.ndarray
        Array of shape
        ``(array.shape[0] * (array.shape[1] - input_length), input_length + 1)``.

    Raises
    ------
    ValueError
        If ``array`` has no more columns than ``input_length``, i.e. not even
        one full window fits.
    """
    text_length = array.shape[1]
    n_windows = text_length - input_length
    if n_windows <= 0:
        # Without this check np.concatenate would fail on an empty list with
        # a message that does not point at the real cause.
        raise ValueError(
            'array has {} columns but input_length={} needs at least {}'.format(
                text_length, input_length, input_length + 1))
    return np.concatenate(
        [array[:, start:start + input_length + 1] for start in range(n_windows)])

In [13]:
# Model hyper-parameters:
# emb_dim      -- size of each embedding vector (Embedding output_dim).
# input_length -- number of tokens the network sees per sample.
emb_dim = 128
input_length = 300

In [15]:
# Expand every row of the loaded file into overlapping windows of
# input_length + 1 columns (see scanning).
dataset = scanning(before_scan.values, input_length)

In [17]:
# Sanity check: expect (rows * number_of_windows, input_length + 1).
dataset.shape

(33934, 301)

In [18]:
# Training network: embedding -> two (Conv1D, max-pool, dropout, bidirectional
# LSTM, batch-norm) stages -> softmax over the vocabulary.
# The LSTMs are stateful, so the batch size is fixed via batch_input_shape.
# NOTE(review): activation='softmax' replaces the LSTM's default tanh
# activation -- confirm this is intentional before retraining.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(batch_size, input_length),
    ),
    # Stage 1: local n-gram features, halve the time axis, then a sequence-
    # returning recurrent layer so stage 2 still receives a time series.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(50, activation='softmax', return_sequences=True,
             dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    # Stage 2: same pattern, but the LSTM returns only its final output.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(100, activation='softmax',
             dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    # One probability per vocabulary entry.
    Dense(vocab_size, activation='softmax'),
])

In [19]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (47, 300, 128)            9728      
_________________________________________________________________
conv1d_1 (Conv1D)            (47, 300, 256)            131328    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (47, 150, 256)            0         
_________________________________________________________________
dropout_1 (Dropout)          (47, 150, 256)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (47, 150, 100)            122800    
_________________________________________________________________
batch_normalization_1 (Batch (47, 150, 100)            400       
_________________________________________________________________
conv1d_2 (Conv1D)            (47, 150, 256)            102656    
__________

In [22]:
# Warm start: load the model stored in model_alter.h5 and copy its weights into
# the freshly built network.
# NOTE(review): the file must already exist (a later cell writes it with
# model.save), so this cell fails on a first run in a clean directory.
model.set_weights(load_model('model_alter.h5').get_weights())

In [20]:
# Adam optimiser with the sparse cross-entropy loss, which takes class indices
# as targets rather than one-hot vectors; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [22]:
# Train one epoch at a time so the recurrent state can be cleared before each
# pass over the data; stop early once training accuracy passes the target.
max_epochs = 5
target_acc = .55
for epoch in range(max_epochs):
    print(epoch)
    model.reset_states()
    # shuffle=False is required for the stateful LSTMs: sample i of a batch
    # continues from sample i of the previous batch, which is how the rows of
    # `dataset` are ordered. fit() shuffles by default and would break that.
    model_hist = model.fit(
        dataset[:, :-1],      # first input_length columns are the inputs
        dataset[:, [-1]],     # last column is the next-token target
        epochs=1,
        batch_size=batch_size,
        shuffle=False,
    )
    # Older Keras reports the metric under 'acc', newer releases use 'accuracy'.
    epoch_history = model_hist.history
    epoch_acc = epoch_history.get('acc', epoch_history.get('accuracy'))[0]
    if epoch_acc > target_acc:
        break

0
Epoch 1/1
 3337/33934 [=>............................] - ETA: 193s - loss: 3.8755 - acc: 0.2266

KeyboardInterrupt: 

In [25]:
# Save the full model (architecture, weights, optimiser state) to model_alter.h5,
# the same file the warm-start cell reads back with load_model.
model.save('model_alter.h5')

In [26]:
# Inference network: same layer stack as the training model but with a batch
# size of 1, so a single sequence can be fed at a time during generation.
# The layer layout has to mirror the training model for the weights to fit.
# NOTE(review): activation='softmax' replaces the LSTM's default tanh
# activation -- confirm this is intentional.
pred_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=emb_dim,
        input_length=input_length,
        batch_input_shape=(1, input_length),
    ),
    # Stage 1: convolution, pooling, dropout, sequence-returning recurrent layer.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(50, activation='softmax', return_sequences=True,
             dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    # Stage 2: same pattern; the LSTM returns only its final output.
    Conv1D(filters=256, kernel_size=4, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    Dropout(.2),
    Bidirectional(
        LSTM(100, activation='softmax',
             dropout=.2, recurrent_dropout=.2, stateful=True)
    ),
    BatchNormalization(),
    Dropout(.2),
    # One probability per vocabulary entry.
    Dense(vocab_size, activation='softmax'),
])

In [27]:
# Copy the trained weights into the batch-size-1 inference network; this relies
# on both networks having the same layers in the same order.
pred_model.set_weights(model.get_weights())

In [28]:
def implement(seed_text, maxlen=100, must_stop=2000, n_likely=5):
    """Generate text by repeatedly sampling the next token from ``pred_model``.

    The seed is split into single characters, tokenised with
    ``utils.tokenise`` and extended one token per step. At each step the
    current window is padded or truncated to ``maxlen`` with
    ``pad_sequences``, and the next token is drawn at random from the
    ``n_likely`` most probable indices, weighted by their renormalised
    probabilities.

    Parameters
    ----------
    seed_text : str
        Text the generation starts from; it is included in the result.
    maxlen : int
        Window length fed to the model.
        NOTE(review): this presumably has to equal the input length
        ``pred_model`` was built with; the default of 100 differs from the
        notebook's ``input_length`` -- confirm.
    must_stop : int
        Number of tokens to generate.
    n_likely : int
        Number of top-probability candidates to sample from.

    Returns
    -------
    str
        The detokenised seed followed by the generated tokens, joined
        without a separator.
    """
    seed_tokens = utils.tokenise(word2ind, list(seed_text))
    generated = list(seed_tokens)
    window = seed_tokens

    # Clear recurrent state once; it then carries over between predict calls.
    pred_model.reset_states()

    remaining = must_stop
    while remaining > 0:
        model_input = pad_sequences([window], maxlen=maxlen)
        distribution = pred_model.predict(model_input, batch_size=1)[0]

        # argsort is ascending, so the tail holds the most probable indices.
        candidates = np.argsort(distribution)[-n_likely:]
        candidate_mass = distribution[candidates]
        weights = candidate_mass / np.sum(candidate_mass)
        next_token = np.random.choice(candidates, p=weights)

        # Next window is the padded window plus the new token; pad_sequences
        # trims it back to maxlen on the following pass.
        window = model_input[0].tolist()
        window.append(next_token)
        generated.append(next_token)
        remaining -= 1

    return ''.join(utils.detokenise(ind2word, generated))

In [29]:
# Generate 800 tokens from the seed, using a 300-token window (matching
# input_length) and sampling among the 5 most probable candidates per step.
res = implement('We are friends, good friends', 300, 800, 5)

In [30]:
# print() is used so the newlines in the generated text render as line breaks.
print(res)

We are friends, good friends ale oh


(  

( 

(e


(e
(
(
(  

(e
(( 

(eh I heao ( oneee((((
((
((eouu(((
((((((((
(((ouueueieueiuueueeieeeeuuuieeeeuuueeeueieuieeueeieeyeyeeiieeyeeeuiueiiieeiueeeeeieieeeeyeieeuiuiieeieee eieiuiieeeueieeeeoeieeyy oeeeeeeeeeieiuieueeeeeoeeuieeieueeueeeuuieueieeeeeeieiueiueieiieeueieuieeeeeeiieeeeiieieeeyeyeeeeiieueieeeaiueyeeeeeeueiieieuieeyoeeuynninuuieiiennnnnnnnnnnnnnnnnnuuunrrrrrraurninnoatniitniitnnntnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnroniaanaainiiiniiiniiiniiinoainiainaiini cniiandiano inna nniinnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnntntnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnntniiiii iniiin-rrrarrr-rrrarrr-rrmrrrriai--] --mmmp

--

--

--eem

ne 
yy 
en 
y e
yeen
e
e
ep
 e
 nee
ynee
e
e

 eeen  en 
e  y na
n een  a
y y e
 ye 
eeene

yyeeenne

yee 
