In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.models import load_model
import random

# to later delete "optimizer_weights" part of the model weights
# due to errors that occur when training on gpu and testing on cpu
import h5py


%matplotlib inline

print('Modules imported.')

Using TensorFlow backend.


Modules imported.


# Reading in the dataset

The dataset consists of the lyrics of 57,650 songs. The data was acquired by scraping LyricsFreak. Some very basic cleaning was then applied to remove inconvenient data: non-English lyrics, extremely short and extremely long lyrics, and lyrics with non-ASCII symbols. The dataset contains 4 columns:

* Artist
* Song Name
* Link to a webpage with the song (for reference). This is to be concatenated with http://www.lyricsfreak.com to form a real URL.
* Lyrics of the song, unmodified.

In [2]:
# only the 'text' column is read from the CSV
lyrics_csv = './dataset/songdata.csv'
data = pd.read_csv(lyrics_csv, usecols=['text'])

In [3]:
# glue every lyric into one lower-cased string, separated by newlines
lyrics = data['text']
corpus = lyrics.str.cat(sep='\n').lower()

print('Corpus length:', len(corpus))
print('Example text:', corpus[:300])

Corpus length: 68056106
Example text: look at her face, it's a wonderful face  
and it means something special to me  
look at the way that she smiles when she sees me  
how lucky can one fellow be?  
  
she's just my kind of girl, she makes me feel fine  
who could ever believe that she could be mine?  
she's just my kind of girl, with


# Preprocessing

In [4]:
# truncating the corpus
# since it is going to take too long to train
corpus_limit = 1000000  # number of leading characters kept for training
corpus = corpus[:corpus_limit]
print('Truncated corpus length:', len(corpus))

Truncated corpus length: 1000000


In [5]:
# creating a character vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(corpus))
print('Total chars:', len(chars))
print(chars)

Total chars: 50
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# creating lookup dictionaries (character -> index and index -> character)
char_to_ind = {symbol: position for position, symbol in enumerate(chars)}
ind_to_char = {position: symbol for position, symbol in enumerate(chars)}

In [7]:
# sliding-window settings
maxlen = 40  # the window size
step = 3  # step of the window

# empty containers for the windows and the character following each window
sentences, next_chars = [], []

In [8]:
# sentences as features, next chars as labels
# Both arrays are rebuilt from scratch from `corpus`, so this cell can be re-run safely
# (appending to the previous `sentences` would fail once it has become an ndarray).
window_starts = range(0, len(corpus) - maxlen, step)
# window of `maxlen` characters starting at each index
sentences = np.array([corpus[i: i + maxlen] for i in window_starts])
# the character that immediately follows each window
next_chars = np.array([corpus[i + maxlen] for i in window_starts])
print('Shape of sentences:', sentences.shape)
print(sentences)
print(next_chars)

Shape of sentences: (333320,)
["look at her face, it's a wonderful face "
 "k at her face, it's a wonderful face  \na"
 "t her face, it's a wonderful face  \nand " ...,
 "t that we're less worse  \n  \ntears are n"
 "hat we're less worse  \n  \ntears are not "
 " we're less worse  \n  \ntears are not eno"]
[' ' 'n' 'i' ..., 'o' 'e' 'u']


In [14]:
# convert each character to categorical numbers
# NOTE(review): `sentences` holds whole window strings, so the encoder below appears to
# assign one integer per distinct window rather than per character (the recorded output
# shows labels in the hundreds of thousands) — confirm whether that is intended.
label_encoder = LabelEncoder()
integer_encoded_X = label_encoder.fit_transform(sentences)
# the same encoder object is fitted again here, this time on the next-character labels
integer_encoded_y = label_encoder.fit_transform(next_chars)

print('Integer encoded X:', integer_encoded_X)
print('Integer encoded y:', integer_encoded_y)

Integer encoded X: [192992 181993 264233 ..., 266428 157152  73624]
Integer encoded y: [ 1 37 32 ..., 38 28 44]


In [15]:
# one-hot encode each categorical number
# (in the recorded run this cell stopped with a MemoryError)
onehot_encoder = OneHotEncoder(sparse=False)

# turn the flat label vectors into single-column 2-D arrays before encoding
integer_encoded_X = integer_encoded_X.reshape(-1, 1)
onehot_encoded_X = onehot_encoder.fit_transform(integer_encoded_X)

integer_encoded_y = integer_encoded_y.reshape(-1, 1)
onehot_encoded_y = onehot_encoder.fit_transform(integer_encoded_y)

print(onehot_encoded_X, onehot_encoded_y)

X, y = onehot_encoded_X, onehot_encoded_y

MemoryError: 

In [9]:
# one-hot encoding the input values wrapped in a separate function,
# since the dataset length causes the memory error (on my machine) when converting all at once
def encode(sentences, next_chars, window=None, vocab=None):
    """One-hot encode character windows and the character following each window.

    Parameters
    ----------
    sentences : sequence of str
        Input windows; each may hold at most `window` characters.
    next_chars : sequence of str
        The character following each window, indexed in step with `sentences`.
    window : int, optional
        Number of time steps per sample. Defaults to the notebook-level `maxlen`.
    vocab : sequence of str, optional
        Ordered vocabulary. Defaults to the notebook-level `chars`, in which case
        the notebook-level `char_to_ind` is used as the index lookup.

    Returns
    -------
    X : ndarray of bool, shape (len(sentences), window, len(vocab))
        X[i, t, k] is True when character t of window i is vocabulary entry k.
    y : ndarray of bool, shape (len(sentences), len(vocab))
        y[i, k] is True when the character after window i is vocabulary entry k.

    Raises
    ------
    KeyError
        If a character is not part of the vocabulary.
    IndexError
        If a window is longer than `window` or `next_chars` is shorter than `sentences`.
    """
    if window is None:
        window = maxlen
    if vocab is None:
        vocab, lookup = chars, char_to_ind
    else:
        lookup = {c: i for i, c in enumerate(vocab)}

    # builtin `bool` instead of the `np.bool` alias, which NumPy deprecated in 1.20
    # and removed in 1.24
    X = np.zeros((len(sentences), window, len(vocab)), dtype=bool)
    y = np.zeros((len(sentences), len(vocab)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, lookup[char]] = True
        y[i, lookup[next_chars[i]]] = True
    return X, y

# build the one-hot training inputs X and targets y from the windows and their next characters
X, y = encode(sentences, next_chars)

# Network Model

In [10]:
# model 1
# NOTE:TRAIN AND TEST 1 MODEL AT A TIME
# single LSTM layer followed by a softmax over the character vocabulary
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
Total params: 98,098
Trainable params: 98,098
Non-trainable params: 0
_________________________________________________________________


In [10]:
#model 2
# LSTM -> dense layer with dropout -> softmax over the character vocabulary
model2 = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(128),
    Dropout(0.5),
    Dense(len(chars)),
    Activation('softmax'),
])
model2.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                6450      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
Total params: 114,610
Trainable params: 114,610
Non-trainable params: 0
_________________________________________________________________


In [11]:
# training checkpoints
# the file name embeds the epoch number and the training loss of that epoch
filepath = "weights_model2{epoch:02d}-{loss:.4f}.h5"
# write a checkpoint only when the monitored training loss reaches a new minimum
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [13]:
# training
batch_size = 42

# fit model 2 on the one-hot data, checkpointing through the registered callbacks
trained = model2.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=30,
    callbacks=callbacks_list,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Predictions

In [None]:
# OPTIONAL: run this if the model is trained on gpu but being tested on cpu
# Removes the "optimizer_weights" entry from the saved checkpoint file in place.
# The membership check makes the cell safe to re-run on a file that was already
# stripped, and the `with` block closes the file even if the deletion fails.
with h5py.File('weights29-1.1538.h5', 'r+') as f:
    if 'optimizer_weights' in f:
        del f['optimizer_weights']

In [41]:
# example test phrase
sentence = 'wise men say only fools rush in but can '.lower()
print('Sentence length:', len(sentence))

# construct a one-sample one-hot batch from the example phrase:
# row t of the sample gets a 1 in the column of the phrase's t-th character
x = np.zeros((1, maxlen, len(chars)))
time_steps = np.arange(len(sentence))
char_columns = [char_to_ind[symbol] for symbol in sentence]
x[0, time_steps, char_columns] = 1.

# load the trained model from its checkpoint file
model = load_model('weights29-1.1538.h5')

print('Predictions per char in vocabulary', model.predict(x, verbose=0))

Sentence length: 40
Predictions per char in vocabulary [[  1.71119809e-05   3.55375111e-02   3.41021769e-06   3.78997298e-04
    3.98959179e-04   2.78970995e-03   1.24004885e-06   1.13178930e-05
    1.01340382e-04   6.87545935e-06   2.05232845e-06   1.37025680e-04
    1.05412648e-04   4.65848598e-06   5.19655487e-06   6.08608207e-06
    2.02593510e-04   1.84048508e-06   5.33222192e-06   1.71361007e-05
    3.25404312e-06   5.19605201e-05   5.64325564e-05   4.10753501e-07
    1.92759149e-02   9.45081562e-02   3.21600474e-02   2.19398439e-02
    1.06698321e-02   3.52355726e-02   4.42303494e-02   9.67846066e-02
    3.11897211e-02   9.77126230e-03   2.41910759e-02   1.64146617e-01
    1.20490817e-02   3.38718249e-03   6.52648415e-03   3.67834084e-02
    1.99042726e-03   1.12015735e-02   1.44536838e-01   7.38101080e-02
    7.27118109e-04   6.28432550e-04   3.59937400e-02   4.56165872e-05
    4.82902229e-02   8.08818659e-05]]


In [42]:
# OPTIONAL: adding randomness for choosing predicted characters
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature re-scaling.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the vocabulary (e.g. one softmax output row).
    temperature : float, optional
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature of exactly 0 selects the most probable index without
        any randomness. Defaults to 1.0 (sample from `preds` as given).

    Returns
    -------
    int-like
        Index of the sampled entry.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # limit case: avoid dividing by zero and pick the most likely entry
        return np.argmax(preds)

    # zero probabilities map to -inf logits, which is the intended result
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # shifting by the largest logit leaves the softmax unchanged but keeps the
    # exponentials from all underflowing to 0 at low temperatures
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [44]:
# temperature defines randomness rate for using predictions
temperature = 0.8
print('Temperature: ', temperature)

original = sentence
window = sentence
generated_chars = []

# predicting next 500 chars, one character per iteration
for _ in range(500):
    # one-hot encode the current window as a single-sample batch
    x = np.zeros((1, maxlen, len(chars)))
    for t, char in enumerate(window):
        x[0, t, char_to_ind[char]] = 1.

    # sample the next character from the model's predicted distribution
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = ind_to_char[next_index]

    generated_chars.append(next_char)
    # slide the window: drop the oldest character, append the new one
    window = window[1:] + next_char

generated = ''.join(generated_chars)
print(original + generated)

Temperature:  0.8
wise men say only fools rush in but can be sall  
i have the smappeat leed me  
  
shine chasing two the man and then you ever fidds  
home is so on the tears  
freemonts my show  
i want to midnide me and the strousing for a get a boy trought  
give me on the tile is just only money, jeen, more that spine mingin' rifer  
the mount, and the selt time we till in the couls sidenth  
there's no breath me to the bould say  
goodollin' at the even cake the morning  
it's slowd when prectory  
and but there's too find  
i stop myse the dest
