In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.models import load_model
import random


%matplotlib inline

print('Modules imported.')

Modules imported.


# Reading in the dataset

The dataset consists of the lyrics of 57650 songs. The data was acquired from LyricsFreak through scraping. Some very basic cleaning was then done to remove inconvenient data: non-English lyrics, extremely short and extremely long lyrics, and lyrics with non-ASCII symbols. The dataset contains 4 columns:

* Artist
* Song Name
* Link to a webpage with the song (for reference). This is to be concatenated with http://www.lyricsfreak.com to form a real URL.
* Lyrics of the song, unmodified.

In [33]:
# Only the lyrics column ('text') is read from the CSV.
data = pd.read_csv(
    './dataset/songdata.csv',
    usecols=['text'],
)

In [34]:
# Join all lyrics into one string (songs separated by newlines), then lower-case it.
corpus = data['text'].str.cat(sep='\n')
corpus = corpus.lower()
print('Corpus length:', len(corpus))
print('Example text:', corpus[:300])

Corpus length: 68056106
Example text: look at her face, it's a wonderful face  
and it means something special to me  
look at the way that she smiles when she sees me  
how lucky can one fellow be?  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, with


# Preprocessing

In [35]:
# Keep only the first million characters:
# training on the full corpus would take too long.
corpus = corpus[:10 ** 6]
print('Truncated corpus length:', len(corpus))

Truncated corpus length: 1000000


In [36]:
# Character vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(corpus))
print('Total chars:', len(chars))
print(chars)

Total chars: 50
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [37]:
# Lookup tables between characters and their positions in the vocabulary.
char_to_ind = {c: i for i, c in enumerate(chars)}
ind_to_char = dict(enumerate(chars))

In [38]:
maxlen = 40  # number of characters in each input window
step = 3  # offset between the starts of consecutive windows
sentences, next_chars = [], []

In [39]:
# sentences as features, next chars as labels
# The lists are (re)created here so this cell can be re-run safely: the names
# are rebound to NumPy arrays below, and arrays have no .append().
sentences = []
next_chars = []
for i in range(0, len(corpus) - maxlen, step):
    sentences.append(corpus[i: i + maxlen])  # window of maxlen characters starting at i
    next_chars.append(corpus[i + maxlen])  # the character right after that window
sentences = np.array(sentences)
next_chars = np.array(next_chars)
print('Shape of sentences:', sentences.shape)
print(sentences)
print(next_chars)

Shape of sentences: (333320,)
["look at her face, it's a wonderful face "
 "k at her face, it's a wonderful face  \na"
 "t her face, it's a wonderful face  \nand " ...,
 "t that we're less worse  \n  \ntears are n"
 "hat we're less worse  \n  \ntears are not "
 " we're less worse  \n  \ntears are not eno"]
[' ' 'n' 'i' ..., 'o' 'e' 'u']


In [9]:
# Map every distinct value to an integer id.
# Note: each entry of `sentences` is a whole window, so a full window
# (not a single character) is treated as one category here.
label_encoder = LabelEncoder()
integer_encoded_X, integer_encoded_y = (
    label_encoder.fit_transform(values) for values in (sentences, next_chars)
)

print('Integer encoded X:', integer_encoded_X)
print('Integer encoded y:', integer_encoded_y)

Integer encoded X: [192992 181993 264233 ..., 266428 157152  73624]
Integer encoded y: [ 1 37 32 ..., 38 28 44]


In [11]:
# One-hot encode the integer ids as dense arrays.
# NOTE: the recorded run of this cell ended in a MemoryError.
onehot_encoder = OneHotEncoder(sparse=False)

# the encoder is given a single column: one sample per row
integer_encoded_X = integer_encoded_X.reshape(-1, 1)
onehot_encoded_X = onehot_encoder.fit_transform(integer_encoded_X)

integer_encoded_y = integer_encoded_y.reshape(-1, 1)
onehot_encoded_y = onehot_encoder.fit_transform(integer_encoded_y)


print(onehot_encoded_X, onehot_encoded_y)

X, y = onehot_encoded_X, onehot_encoded_y

MemoryError: 

In [40]:
# one-hot encoding the input values wrapped in a separate function,
# since the dataset length causes the memory error (on my machine) when converting all at once
def encode(sentences, next_chars):
    """One-hot encode character windows and the character following each one.

    Relies on the notebook-level names `maxlen`, `chars` and `char_to_ind`.

    Parameters
    ----------
    sentences : sequence of str
        Character windows; each character is looked up in `char_to_ind`.
    next_chars : sequence of str
        For every window, the character that follows it.

    Returns
    -------
    X : boolean array of shape (len(sentences), maxlen, len(chars))
        X[i, t, k] is True when character t of window i has vocabulary index k.
    y : boolean array of shape (len(sentences), len(chars))
        y[i, k] is True when the character following window i has index k.

    Raises
    ------
    KeyError
        If a character is missing from `char_to_ind`.
    """
    # Builtin `bool` instead of the `np.bool` alias, which newer NumPy releases removed.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_to_ind[char]] = True
        y[i, char_to_ind[next_chars[i]]] = True
    return X, y

# Build the one-hot training arrays: X from the windows, y from the characters that follow them.
X, y = encode(sentences, next_chars)

# Network Model

In [41]:
# Single LSTM layer followed by a softmax over the character vocabulary.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_4 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_4 (Activation)    (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Checkpoint callback: monitors the training loss and only keeps improvements;
# the file name records the epoch number and the loss value.
filepath = "weights{epoch:02d}-{loss:.4f}.h5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [12]:
# training
batch_size = 100

# The checkpoint callback must be handed to fit(), otherwise no weight files
# are written and there is nothing to load afterwards.
trained = model.fit(
    X, y,
    batch_size=batch_size,
    epochs=30,
    callbacks=callbacks_list,
)

Training...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [44]:
# Seed phrase used to try out the model
sentence = 'dance all night'

# One-hot encode the phrase into a single (1, maxlen, vocabulary) input;
# positions beyond the end of the phrase stay all-zero.
x = np.zeros((1, maxlen, len(chars)))
for i, char in enumerate(sentence):
    x[0, i, char_to_ind[char]] = 1.

# Replace the in-memory model with one restored from a saved file.
# NOTE(review): judging by the file name this is the epoch-01 checkpoint —
# confirm that this is the checkpoint you want to evaluate.
model = load_model('weights01-2.3384.h5')


print(
    'Predictions per char in vocabulary',
    model.predict(x, verbose=0),
)

Prediction per char in vocabulary [[  1.14192622e-09   2.61941582e-01   1.22791299e-09   1.11195997e-09
    9.89448745e-10   1.04919839e-09   1.12323861e-09   1.25236310e-09
    1.16939869e-09   1.19848964e-09   1.23993693e-09   1.14135135e-09
    1.10743481e-09   1.24895194e-09   1.09906406e-09   1.11621845e-09
    1.14694454e-09   1.25958333e-09   1.13789311e-09   1.07535858e-09
    1.28547040e-09   1.16846910e-09   1.20295551e-09   1.19953469e-09
    4.75156344e-02   2.43664589e-02   2.40089260e-02   1.07945619e-09
    1.66302472e-01   1.05106324e-09   1.18246357e-09   2.38009505e-02
    4.71939221e-02   1.08539600e-09   2.40857508e-02   4.77083512e-02
    7.14605674e-02   4.73378152e-02   7.16472492e-02   1.19791133e-09
    1.16055965e-09   1.05425957e-09   7.10829794e-02   7.15472549e-02
    1.10959775e-09   1.15319188e-09   1.19163091e-09   1.24365329e-09
    1.19769439e-09   1.13973075e-09]]


In [45]:
# OPTIONAL: draw the predicted character at random instead of always taking the most likely one
def sample(preds, temperature=1.0):
    """Draw one index at random from a temperature-adjusted distribution.

    `preds` is converted to float64, its logarithm is divided by
    `temperature`, and the result is exponentiated and renormalised to sum
    to 1. A single multinomial trial over that distribution picks the index
    that is returned.
    """
    scaled_log = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [46]:
# temperature defines randomness rate for using predictions
temperature = 1
print('Temperature: ', temperature)

generated = ''
original = sentence
# the model input holds at most maxlen characters, so keep only the tail of the seed
window = sentence[-maxlen:]

# predicting next 500 chars
for i in range(500):
    # one-hot encode the current context window
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(window):
        x[0, t, char_to_ind[char]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = ind_to_char[next_index]

    generated += next_char
    # let the context grow until it fills maxlen characters, then slide it;
    # dropping a character on every step would pin it at the seed's length
    window = (window + next_char)[-maxlen:]

print(original + generated)

Temperature:  1
dance all nights ho eneeakmecee  st  mteeb n me neonooe   it  albc mns oe amhce  eosom m  asce b  eeetnee sse henmel s   noembetlsineobamsiomelennt te oh l   ekeot  ea a    o io os eo  kb tioe e la ecm  elk elmi be  oo k n tbinste l ososnenetel ic am ts teo esi  iml oo isei st eebiaeheleitienosson  msn nent eoeee ae o tiktaem nlb lkekobeec cecb i tlemsom ne nl c o el  monce  m eten mtsalmo ktbi se ettso  bse  n  oem eebtsmnch  ecekoekiteol eoncec t   teeetbt tmtee tte ee teac   balcblka  snetooiam haotteeile  
