In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.models import load_model
import random

# to later delete "optimizer_weights" part of the model weights
# due to errors that occur when training on gpu and testing on cpu
import h5py


%matplotlib inline

print('Modules imported.')

Using TensorFlow backend.


Modules imported.


# Reading in the dataset

The dataset consists of the lyrics of 57650 songs. The data was acquired from LyricsFreak through scraping. Some very basic cleaning was then done to remove inconvenient data: non-English lyrics, extremely short and extremely long lyrics, and lyrics with non-ASCII symbols. The dataset contains 4 columns:

* Artist
* Song Name
* Link to a webpage with the song (for reference). This is to be concatenated with http://www.lyricsfreak.com to form a real URL.
* Lyrics of the song, unmodified.

In [2]:
# Read the song dataset, loading only the 'text' column (the lyrics);
# `usecols` keeps the other CSV columns out of memory.
data = pd.read_csv('./dataset/songdata.csv', usecols=['text'])

In [3]:
# Glue all lyrics into one long lowercase string, one song per
# newline-separated chunk, and show a short preview of the result.
lyrics = data['text']
joined_lyrics = lyrics.str.cat(sep='\n')
corpus = joined_lyrics.lower()
print('Corpus length:', len(corpus))
print('Example text:', corpus[:300])

Corpus length: 68056106
Example text: look at her face, it's a wonderful face  
and it means something special to me  
look at the way that she smiles when she sees me  
how lucky can one fellow be?  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, with


# Preprocessing

In [4]:
# Truncate the corpus to its first 1,000,000 characters:
# training on the full text would take too long.
# (Slicing an already truncated corpus again is a no-op, so re-running is safe.)
corpus = corpus[:1000000]
print('Truncated corpus length:', len(corpus))

Truncated corpus length: 1000000


In [5]:
# Character vocabulary: every distinct character of the corpus, in sorted
# order so that the character <-> index mapping is deterministic.
chars = sorted(set(corpus))
print('Total chars:', len(chars))
print(chars)

Total chars: 50
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Two-way lookup tables between a character and its position in `chars`.
char_to_ind = {symbol: position for position, symbol in enumerate(chars)}
ind_to_char = dict(enumerate(chars))

In [7]:
# Sliding-window settings used to cut the corpus into training samples.
maxlen = 40  # the window size
step = 3  # step of the window
# Two independent, initially empty containers: windows and their next characters.
sentences, next_chars = [], []

In [8]:
# sentences as features, next chars as labels
#
# Both arrays are built from scratch here rather than appended to lists
# created in an earlier cell: once this cell has run, `sentences` and
# `next_chars` are NumPy arrays (which have no `.append`), so an
# append-based loop would raise AttributeError when the cell is re-run.
window_starts = range(0, len(corpus) - maxlen, step)
# each feature is the `maxlen` characters starting at a window offset
sentences = np.array([corpus[start: start + maxlen] for start in window_starts])
# each label is the single character immediately following that window
next_chars = np.array([corpus[start + maxlen] for start in window_starts])
print('Shape of sentences:', sentences.shape)
print(sentences)
print(next_chars)

Shape of sentences: (333320,)
["look at her face, it's a wonderful face "
 "k at her face, it's a wonderful face  \na"
 "t her face, it's a wonderful face  \nand " ...,
 "t that we're less worse  \n  \ntears are n"
 "hat we're less worse  \n  \ntears are not "
 " we're less worse  \n  \ntears are not eno"]
[' ' 'n' 'i' ..., 'o' 'e' 'u']


In [9]:
# Turn every target character into an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(next_chars)
integer_encoded = label_encoder.transform(next_chars)

print('Integer encoded y:', integer_encoded)

Integer encoded X: [192992 181993 264233 ..., 266428 157152  73624]
Integer encoded y: [ 1 37 32 ..., 38 28 44]


In [None]:
# one-hot encode each categorical number
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version.
onehot_encoder = OneHotEncoder(sparse=False)

# OneHotEncoder works on 2-D (n_samples, n_features) input, so the 1-D
# vector of class ids is reshaped into a single column first.
# `integer_encoded` is the array produced by the LabelEncoder cell; the
# name `integer_encoded_y` is not defined anywhere in the notebook.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

print(onehot_encoded)

# NOTE: `y` is assigned again by the later `X, y = encode(...)` cell.
y = onehot_encoded

In [9]:
# one-hot encoding the input values wrapped in a separate function,
# since the dataset length causes the memory error (on my machine) when converting all at once
def encode(sentences, next_chars):
    """One-hot encode character windows and their following characters.

    Relies on the notebook-level names `maxlen`, `chars` and `char_to_ind`.

    Parameters
    ----------
    sentences : sequence of str
        Character windows; each may hold at most `maxlen` characters.
    next_chars : sequence of str
        `next_chars[i]` is the character that follows `sentences[i]`.

    Returns
    -------
    X : numpy.ndarray of bool, shape (len(sentences), maxlen, len(chars))
        `X[i, t, k]` is True when character `t` of window `i` is `chars[k]`.
    y : numpy.ndarray of bool, shape (len(sentences), len(chars))
        `y[i, k]` is True when the character after window `i` is `chars[k]`.

    Raises
    ------
    KeyError
        If a character is missing from `char_to_ind`.
    IndexError
        If a window is longer than `maxlen` or `next_chars` is too short.
    """
    # The builtin `bool` is used as dtype: the `np.bool` alias was deprecated
    # and later removed from NumPy, where it raises AttributeError.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_to_ind[char]] = True
        y[i, char_to_ind[next_chars[i]]] = True
    return X, y

# Build the one-hot feature tensor X and label matrix y for all windows.
X, y = encode(sentences, next_chars)

# Network Model

In [10]:
# One LSTM layer over the one-hot window, followed by a dense projection
# onto the vocabulary and a softmax over the next character.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [11]:
# training checkpoints
# The file name template embeds the epoch number and the training loss.
filepath = "weights{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [12]:
# training
batch_size = 128

trained = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=30,
    callbacks=callbacks_list,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Predictions

In [None]:
# OPTIONAL: run this if the model is trained on gpu but being tested on cpu
# Removes the "optimizer_weights" group from the saved model file.
# Safe to run more than once: the group is deleted only if it is still
# present, and the `with` block closes the file even if the deletion fails.
with h5py.File('weights29-1.1538.h5', 'r+') as f:
    if 'optimizer_weights' in f:
        del f['optimizer_weights']

In [23]:
# example test phrase, lower-cased like the training corpus
sentence = 'Gabriele loves dancing\nall day and night'.lower()
print('Sentence length:', len(sentence))

# one-hot encode the phrase as a single-sample batch
x = np.zeros((1, maxlen, len(chars)))
for position, symbol in enumerate(sentence):
    x[0, position, char_to_ind[symbol]] = 1.

# load the saved model from its checkpoint file
model = load_model('weights29-1.1538.h5')

predictions = model.predict(x, verbose=0)
print('Predictions per char in vocabulary', predictions)

Sentence length: 40
Predictions per char in vocabulary [[  7.85644632e-04   6.48357809e-01   2.39531408e-04   6.93314723e-05
    1.92984089e-03   3.50722723e-04   2.05814256e-03   5.79579026e-02
    2.30197984e-04   2.02430710e-02   3.32634727e-07   4.05892625e-10
    1.07458886e-08   9.59632089e-08   3.98085618e-08   6.00360606e-09
    2.51268867e-11   7.56856397e-14   6.72226077e-12   5.60327429e-08
    8.10267807e-07   3.99530778e-04   1.47456335e-11   1.24109029e-05
    1.28299434e-04   2.41462010e-04   3.04026980e-05   1.62235265e-05
    2.84606474e-04   2.40745176e-05   2.13956610e-05   5.40698133e-03
    4.33711568e-04   4.03412469e-06   5.74045345e-08   7.95448082e-04
    3.43134016e-04   5.62815367e-05   8.08755503e-05   7.52190954e-06
    1.85356603e-05   2.52930564e-04   2.58004218e-01   2.00749695e-04
    2.36398409e-04   3.09081059e-07   3.25245819e-05   5.33050120e-08
    7.44019926e-04   3.30167666e-07]]


In [24]:
# OPTIONAL: adding randomness for choosing predicted characters
def sample(preds, temperature=1.0):
    """Draw a class index from `preds` after temperature re-scaling.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the classes (e.g. one row of a softmax output).
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A value of 0 (or less) disables the randomness and returns the
        most probable index.

    Returns
    -------
    Index of the drawn class, as returned by `np.argmax`.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Dividing by zero is undefined; the limit of the re-scaled
        # distribution as temperature -> 0 is a greedy argmax.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf here (and back to 0 after
    # the exp), so the divide-by-zero warning from log(0) is suppressed.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift so the largest value is 0: without it every exp() can underflow
    # to 0 at low temperatures, making the normalisation produce NaNs.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [30]:
# temperature defines randomness rate for using predictions
temperature = 0.8
print('Temperature: ', temperature)

original = sentence
window = sentence
generated_chars = []

# predicting next 500 chars
for _ in range(500):
    # one-hot encode the current window as a single-sample batch
    x = np.zeros((1, maxlen, len(chars)))
    for position, symbol in enumerate(window):
        x[0, position, char_to_ind[symbol]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = ind_to_char[next_index]

    generated_chars.append(next_char)
    # slide the window one character forward
    window = window[1:] + next_char

generated = ''.join(generated_chars)
print(original + generated)

Temperature:  0.8
gabriele loves dancing
all day and night  
and when i feel the seess  
we love yoursed you will  
i have to things it will be a molurons day  
well i want you to let you  
you could tee  
listen you through the grows  
smiting the mornisy and she's so here around  
and fallin' alwight  
  
over slowing buy sewors  
your look why,  
i'm living in your eyes  
that i needder that we said  
gonna'd like inforce  
  
hey need a mine to from the butle of the corner  
it's tree  
but you dring a frovieve  
  
i'm living in the morning as it 
