Based on: 

- https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
- https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/

In [1]:
import json
import random
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
# load data
# Read the file explicitly as UTF-8: the lyrics contain non-ASCII characters
# (e.g. the typographic apostrophe), so relying on the platform's default
# encoding can fail or garble the text on some systems.
with open("../data/songs.json", encoding="utf-8") as f:
    songs = json.load(f)

# cleanup & consolidate: drop carriage returns, lowercase every lyric and
# concatenate everything into one corpus string. A single join avoids
# re-building the growing string once per song.
text = "".join(song['lyrics'].replace('\r', '').lower() for song in songs)

print('corpus length:', len(text))

corpus length: 658180


In [6]:
# Tokenize text
# Fit a word-level tokenizer on the whole corpus and encode the corpus as one
# flat list of integer word ids. The characters listed in `filters` are
# stripped before splitting into words.
# NOTE(review): '\n' is not part of `filters`; if the tokenizer only splits on
# spaces, words on either side of a line break may end up fused into a single
# token — confirm this is intended.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t')
tokenizer.fit_on_texts([text])
encoded = tokenizer.texts_to_sequences([text])[0]

# One more than the number of known words.
# NOTE(review): presumably word ids start at 1 and id 0 is reserved — confirm.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 20531


In [8]:
# Build the training pairs: every two adjacent word ids in the encoded corpus
# form one [current word, next word] sequence.
sequences = [encoded[pos - 1:pos + 1] for pos in range(1, len(encoded))]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 114333


In [9]:
# Convert the list of pairs to a 2-column array and separate the columns:
# column 0 holds the input word id, column 1 the id of the word that follows.
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [10]:
# one hot encode outputs
# Each target word id becomes a vector with `vocab_size` entries.
# NOTE(review): this builds a dense (len(y), vocab_size) array; with the
# sequence count and vocabulary size printed by the earlier cells that is a
# very large allocation — integer targets with a sparse categorical loss
# would avoid it.
# NOTE(review): the cell rebinds `y`, so running it a second time without
# re-running the split cell would encode already-encoded targets.
y = to_categorical(y, num_classes=vocab_size)

In [11]:
# Word-level next-word model, declared as one list of layers:
# a single input word id -> 10-dimensional embedding -> 50-unit LSTM ->
# softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(50),
    Dense(vocab_size, activation='softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             205310    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 20531)             1047081   
Total params: 1,264,591
Trainable params: 1,264,591
Non-trainable params: 0
_________________________________________________________________


In [12]:
# compile network
# Categorical cross-entropy is paired with the one-hot targets in `y`;
# accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
# Five passes over the (input word, next word) pairs; with verbose=2 the log
# below shows one summary line per epoch.
model.fit(X, y, epochs=5, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
 - 59s - loss: 7.6193 - accuracy: 0.0485
Epoch 2/5
 - 57s - loss: 7.1009 - accuracy: 0.0493
Epoch 3/5
 - 58s - loss: 6.8332 - accuracy: 0.0579
Epoch 4/5
 - 60s - loss: 6.5959 - accuracy: 0.0806
Epoch 5/5
 - 59s - loss: 6.3829 - accuracy: 0.1016


<keras.callbacks.callbacks.History at 0x7f96dfa2d110>

In [16]:
# evaluate: print a seed word followed by the word the model ranks as most
# likely to come next
in_text = 'how'
print(in_text)

# Keep the encoded seed under its own name: rebinding `encoded` here would
# overwrite the encoded corpus that the sequence-building cell reads from,
# so re-running that cell afterwards would silently use the wrong data.
# The ids are shaped (n_tokens, 1): one single-word input row per seed token.
seed_ids = np.array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)

# argmax over the softmax output yields the predicted word id for each row
# (the same result the older Sequential.predict_classes helper returned).
yhat = np.argmax(model.predict(seed_ids, verbose=0), axis=-1)

# Invert the tokenizer's word -> id mapping once instead of scanning the
# whole vocabulary for the predicted id.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
for predicted_id in yhat:
    predicted_word = index_to_word.get(int(predicted_id))
    # An id without a vocabulary entry prints nothing, as before.
    if predicted_word is not None:
        print(predicted_word)

how
i


In [19]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Greedily extend ``seed_text`` one word at a time.

    The model predicts the next word from a single word of context, so each
    step feeds it only the most recent known word and appends the
    highest-probability prediction.

    Args:
        model: object exposing ``predict(x, verbose=0)`` that returns one row
            of class probabilities per input row.
        tokenizer: object exposing ``word_index`` (word -> integer id) and
            ``texts_to_sequences``.
        seed_text: starting text; only its last known word is used as
            context for the first prediction.
        n_words: maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, space-separated.
        Generation stops early when the current context contains no known
        word or the predicted id has no vocabulary entry.
    """
    # Build the id -> word lookup once; scanning word_index on every step
    # costs a full vocabulary pass per generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer ids
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # Nothing the model knows about: predicting on an empty batch
            # would fail, so stop with what has been generated so far.
            break
        # Only the last word is used. Passing every seed word would create a
        # batch of several predictions, which cannot be mapped to one word.
        context = np.array([[encoded[-1]]])
        # predict a word in the vocabulary (argmax of the class probabilities)
        probabilities = model.predict(context, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        if not out_word:
            break
        # the predicted word becomes the context for the next step
        in_text, result = out_word, result + ' ' + out_word
    return result

In [27]:
# Generate 70 words starting from the seed 'here'. Generation always picks
# the single most likely next word, which is why the output below falls into
# a repeating loop.
generate_seq(model, tokenizer, 'here', 70)

'here in the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of the road of'

In [3]:
# Character vocabulary: the distinct characters of the corpus in sorted
# order, with lookup tables in both directions (character <-> integer id).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 64


In [4]:
# Slide a window of `maxlen` characters over the corpus, moving `step`
# characters at a time. Each window is one training input; the character
# immediately after the window is its target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 219380


In [5]:
print('Vectorization...')
# One-hot encode the training data:
#   x[i, t, c] is True when character c occurs at position t of window i,
#   y[i, c]    is True when character c is the one following window i.
# The builtin `bool` is used as dtype: `np.bool` was only an alias for it,
# was deprecated in NumPy 1.20 and raises AttributeError on newer releases.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [6]:
# build the model: a single LSTM
# Input is a window of `maxlen` one-hot encoded characters; output is a
# softmax distribution over the character vocabulary.
print('Build model...')
optimizer = RMSprop(learning_rate=0.01)
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [7]:
def sample(preds, temperature=1.0):
    """Sample one index from a probability array after temperature scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    renormalised with a softmax and then used for a single multinomial draw.
    Temperatures below 1 sharpen the distribution, temperatures above 1
    flatten it.

    Args:
        preds: 1-D array-like of class probabilities.
        temperature: positive scaling factor applied in log space.

    Returns:
        The index of the sampled class.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    # Work in float64 for the renormalisation below.
    preds = np.asarray(preds).astype('float64')
    # A probability of exactly 0 is legitimate: log(0) = -inf maps back to a
    # probability of 0 after exp(), so the divide-by-zero warning is noise.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. The softmax is unchanged, but at low
    # temperatures this prevents every term from underflowing to 0 (0/0 = NaN).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    """Print text generated by the current model at the end of an epoch.

    A random seed of ``maxlen`` characters is cut from the corpus. For each
    diversity value the same seed is printed and 400 further characters are
    generated one at a time, each sampled from the model's prediction for the
    most recent ``maxlen`` characters.

    Args:
        epoch: index of the epoch that just finished (used in the header).
        _: second callback argument; not used.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same seed is reused for every diversity so the results are comparable.
    seed_begin = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_begin: seed_begin + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Sliding context: always the most recent `maxlen` characters.
        window = seed
        for _step in range(400):
            # One-hot encode the context in one indexed assignment.
            x_pred = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(window))
            char_ids = [char_indices[ch] for ch in window]
            x_pred[0, positions, char_ids] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end in a Keras callback so it runs after every training epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [8]:
# Train the character-level model for 60 epochs in batches of 128 windows;
# `print_callback` prints sample text after each epoch.
model.fit(
    x,
    y,
    batch_size=128,
    epochs=60,
    callbacks=[print_callback])

Epoch 1/60

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "s
yea! heavy and a bottle of bread
yea! "
s
yea! heavy and a bottle of bread
yea! i’m gonna be the wide a findly the brand

what i got a mone the said the mone

i never said the book a stroin
in the boor a long the back the trimber the word
what i was a changed to the blown
where the boon and i too book a long

the boon all the the world your chain
they know i’m gonna do in the the would do a blood

the bear a far the shood the brown
in the the was a money, i did the brood and 
----- diversity: 0.5
----- Generating with seed: "s
yea! heavy and a bottle of bread
yea! "
s
yea! heavy and a bottle of bread
yea! and the think is man fars
that know that the this in what it do my head

know who are the brownd the the wolld a dound
and i all will be on my were the door
and the the mood the black one the long
i’ll you me the road in whee her would done

we hands in my blood to the boon
drown your 

  after removing the cwd from sys.path.


ough herembileadin’, ayowelce ’em bat, mude, baby, lying i wonde’s gettin’ me up one teruste
the war toivis gites named dark on, little or
we peiden ax-agatws around, i’m 
Epoch 19/60

----- Generating text after Epoch: 18
----- diversity: 0.2
----- Generating with seed: "undred wives
well, ev’rybody’s got somet"
undred wives
well, ev’rybody’s got something
i was a hold of the day
i was the streets and the way i was the stone

well, i was the stone of the way

i was a shot of stranger in the stream like i see
i was a shot of stranger and he was the stream
when the dead is an arrife and the wind

if i was a hard of the story
and the things and the world of the conceor
and the stream like a song baby
i was the rain of the street

well, i was a han
----- diversity: 0.5
----- Generating with seed: "undred wives
well, ev’rybody’s got somet"
undred wives
well, ev’rybody’s got something

in the high wedway comes on the boss with the morning kind
to the shadows of the world
is they stop to see

<keras.callbacks.callbacks.History at 0x7f7d58186550>

In [9]:
# Save the model
# Writes the model currently bound to `model` to `model1.h5`; the relative
# path is resolved against the notebook's working directory.
model.save('model1.h5')

In [12]:
import keras

# Recreate the exact same model purely from the file
# Reads back the `model1.h5` file written by the save cell, so that cell must
# have run first (or the file must already exist in the working directory).
new_model = keras.models.load_model('model1.h5')