In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import random


%matplotlib inline

print('Modules imported.')

Using TensorFlow backend.


Modules imported.


# Reading in the dataset

The dataset consists of the lyrics of 57,650 songs. The data was acquired from LyricsFreak by scraping. Some very basic cleaning was then applied to remove inconvenient data: non-English lyrics, extremely short and extremely long lyrics, and lyrics with non-ASCII symbols. The dataset contains 4 columns:

* Artist
* Song Name
* Link to a webpage with the song (for reference). This is to be concatenated with http://www.lyricsfreak.com to form a real URL.
* Lyrics of the song, unmodified.

In [2]:
# Load only the lyrics column; no other column is requested from the CSV.
lyrics_csv = './dataset/songdata.csv'
data = pd.read_csv(lyrics_csv, usecols=['text'])

In [3]:
# Concatenate every song's lyrics into one lowercase string,
# with a newline between consecutive songs.
lyrics = data['text']
text = lyrics.str.cat(sep='\n').lower()
print('Corpus length:', len(text))
print('Example text:', text[:300])

Corpus length: 68056106
Example text: look at her face, it's a wonderful face  
and it means something special to me  
look at the way that she smiles when she sees me  
how lucky can one fellow be?  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, with


# Preprocessing

In [4]:
# Training on the whole corpus takes too long,
# so only a leading slice of it is kept.
corpus_limit = 1000000
text = text[:corpus_limit]
print('Truncated corpus length:', len(text))

Truncated corpus length: 1000000


In [5]:
# Character vocabulary: each distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('Total chars:', len(chars))
print(chars)

Total chars: 50
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Two-way lookup tables between a character and its position in `chars`.
char_indices = {char: position for position, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [35]:
# Sliding-window configuration.
maxlen = 40  # number of characters in each input window
step = 3  # offset, in characters, between the starts of consecutive windows
# Accumulators for the extracted windows and their following characters.
sentences, next_chars = [], []

In [36]:

# Slide a window of `maxlen` characters over the corpus, advancing `step`
# characters each time. Each window is one training input and the character
# immediately after it is the prediction target.
#
# Both arrays are built from fresh comprehensions rather than by appending to
# the global lists: this cell rebinds `sentences` / `next_chars` to ndarrays,
# so an append-based version fails with AttributeError when the cell is re-run.
window_starts = range(0, len(text) - maxlen, step)
sentences = np.array([text[i: i + maxlen] for i in window_starts])
next_chars = np.array([text[i + maxlen] for i in window_starts])
print('Shape of sentences:', sentences.shape)
print(sentences)
print(next_chars)

Shape of sentences: (333320,)
["look at her face, it's a wonderful face "
 "k at her face, it's a wonderful face  \na"
 "t her face, it's a wonderful face  \nand " ...,
 "t that we're less worse  \n  \ntears are n"
 "hat we're less worse  \n  \ntears are not "
 " we're less worse  \n  \ntears are not eno"]
[' ' 'n' 'i' ..., 'o' 'e' 'u']


In [25]:
# convert each training example to a categorical number
# NOTE(review): LabelEncoder assigns one integer per distinct value, and the
# values in `sentences` are whole window strings, so each window becomes its
# own category (hence the six-digit ids printed below) rather than being
# encoded character by character.
sentence_encoder = LabelEncoder()
integer_encoded_X = sentence_encoder.fit_transform(sentences)

# The targets get their own encoder: refitting a single shared encoder on
# `next_chars` would overwrite the mapping that was just fitted on `sentences`.
label_encoder = LabelEncoder()
integer_encoded_y = label_encoder.fit_transform(next_chars)

print('Integer encoded X:', integer_encoded_X)
print('Integer encoded y:', integer_encoded_y)

Integer encoded X: [192992 181993 264233 ..., 266428 157152  73624]
Integer encoded y: [ 1 37 32 ..., 38 28 44]


In [26]:
# one-hot encode each categorical number
# NOTE(review): this cell ended with MemoryError in the recorded run.
# `sparse=False` asks for a dense array, and `integer_encoded_X` holds one id
# per window, so the dense result presumably needs one column per distinct
# window id -- confirm before reusing this approach.
onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder expects a 2-D input, so the 1-D id vector is reshaped to a single column.
integer_encoded_X = integer_encoded_X.reshape(len(integer_encoded_X), 1)
onehot_encoded_X = onehot_encoder.fit_transform(integer_encoded_X)

# The same encoder object is refit here on the target ids.
integer_encoded_y = integer_encoded_y.reshape(len(integer_encoded_y), 1)
onehot_encoded_y = onehot_encoder.fit_transform(integer_encoded_y)


print(onehot_encoded_X, onehot_encoded_y)

# Short aliases for the encoded inputs and targets.
X = onehot_encoded_X
y = onehot_encoded_y

MemoryError: 

In [27]:
# one-hot encoding the input values with a generator, since the dataset length causes the memory error (on my machine)
def generator(sentences, next_chars, batch_size, char_to_index=None, window=None):
    """Yield one-hot encoded ``(X, y)`` training batches indefinitely.

    Samples are consumed in order; when the remaining samples cannot fill a
    whole batch, iteration restarts from the first sample.

    Args:
        sentences: Indexable sequence of character windows (strings).
        next_chars: Sequence aligned with ``sentences``; ``next_chars[k]`` is
            the target character for ``sentences[k]``.
        batch_size: Number of samples per yielded batch.
        char_to_index: Optional mapping from character to one-hot column.
            Its indices are expected to cover ``0 .. len(char_to_index) - 1``.
            Defaults to the module-level ``char_indices``.
        window: Optional number of time steps reserved per sample.
            Defaults to the module-level ``maxlen``.

    Yields:
        Tuple ``(X, y)`` of boolean arrays: ``X`` has shape
        ``(batch_size, window, vocab_size)`` and ``y`` has shape
        ``(batch_size, vocab_size)``. Fresh arrays are allocated for every
        batch, so a previously yielded batch is never modified afterwards.

    Raises:
        ValueError: If ``batch_size`` is not positive or there are fewer
            samples than ``batch_size``.
    """
    if char_to_index is None:
        char_to_index = char_indices
    if window is None:
        window = maxlen
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    length = len(sentences)
    if length < batch_size:
        raise ValueError('need at least batch_size samples to form a batch')

    vocab_size = len(char_to_index)
    index = 0
    while True:
        # Wrap around only when a full batch would run past the end; a batch
        # that ends exactly on the last sample is still used.
        if index + batch_size > length:
            index = 0
        # Plain `bool` instead of the `np.bool` alias, which modern NumPy removed.
        X = np.zeros((batch_size, window, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            for t, char in enumerate(sentences[index]):
                X[i, t, char_to_index[char]] = True
            # The target must use the global sample position (`index`), not
            # the position inside the batch (`i`), to stay aligned with X.
            y[i, char_to_index[next_chars[index]]] = True
            index += 1
        yield X, y

# Network Model

In [31]:
# build the model: a single LSTM followed by two dense layers and a softmax
print('Build model...')
n_chars = len(chars)
model = Sequential([
    # NOTE(review): the LSTM overrides the layer's default activation with
    # 'relu' -- confirm this is intentional.
    LSTM(128, input_shape=(maxlen, n_chars), activation='relu', kernel_initializer='random_uniform'),
    Dropout(0.2),
    # NOTE(review): no activation is given here, so the two Dense layers are
    # stacked without a non-linearity in between -- confirm this is intended.
    Dense(n_chars * 2),
    Dense(n_chars),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

print("Compiling model complete...")
model.summary()

Build model...
Compiling model complete...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               12900     
_________________________________________________________________
dense_4 (Dense)              (None, 50)                5050      
_________________________________________________________________
activation_3 (Activation)    (None, 50)                0         
Total params: 109,598
Trainable params: 109,598
Non-trainable params: 0
_________________________________________________________________


In [32]:
# training checkpoints
# The file name embeds the epoch number and the training loss; with
# save_best_only=True a checkpoint is written only when the monitored
# training loss improves (mode='min').
filepath = "weights{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [33]:
# training
print('Training...')
batch_size = 128

# NOTE(review): 10000 steps of 128 samples is about 1.28M samples per epoch,
# while the recorded run produced 333,320 windows, so each "epoch" cycles
# through the data several times -- confirm this is intended.
batches = generator(sentences, next_chars, batch_size)
history = model.fit_generator(
    batches,
    steps_per_epoch=10000,
    epochs=10,
    callbacks=callbacks_list,
)

Training...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
