In [1]:
from __future__ import division, print_function, unicode_literals
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import pandas as pd
import re
import tensorflow.keras as krs

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    krs.backend.clear_session()
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

## Preprocess the data

In [2]:
# TODO (maybe) : split on songs

file_path = "data/taylor_swift_lyrics.csv"

# Read the dataset from csv
df = pd.read_csv(filepath_or_buffer=file_path,
                encoding = "ISO-8859-1",
                sep=";")

# Extract the lyrics column
lyrics = df['lyric']

# Remove all special characters and add the words to a list
all_chars = []
for line in lyrics:
    for char in list(line):
        c = re.sub('[^A-Za-z0-9\s]+', '', char)
        all_chars.append(c)
    all_chars.append("\n")

# Get all unique characters from the dataset
unique_chars = list(set(all_chars))
        
# Create translation tables from char -> ind and ind -> char for one-hot encoding
chars_to_ind = dict((c, i) for i, c in enumerate(unique_chars))
ind_to_chars = dict((i, c) for i, c in enumerate(unique_chars))

N = len(all_chars)
d = len(unique_chars)

print("All chars: ", N)
print("Unique chars: ", d)


All chars:  173542
Unique chars:  63


## One-Hot encoding of the lyrics data

In [116]:
X = np.zeros((N, d))
for i,char in enumerate(all_chars):
    X[i][chars_to_ind[char]] = 1

Let's train an RNN to classify MNIST images. We will treat each image as a sequence of 28 rows of 28 pixels each (since each MNIST image is 28×28 pixels). Assume we will use cells of 100 recurrent neurons, plus a fully connected layer containing 10 neurons connected to the output of the last time step, followed by a softmax layer.

In [None]:
# multi-layer RNN network for mnist
reset_graph()

seq_length = 100

model = krs.Sequential()
model.add(krs.layers.LSTM(d, input_shape=(seq_length, d)))
model.add(krs.layers.Dropout(0.2))
model.add(krs.layers.Dense(d, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

nb_epoch=10
batch_size=1000
nr_batches = int(N / batch_size)

for e in range(nb_epoch):
    print("epoch %d" % e)
    for b in range(nr_batches):
        # print("batch: ", b, "/", nr_batches)
        X_batch=np.zeros((batch_size, seq_length, d))
        Y_batch=np.zeros((batch_size, d))
        for i in range(batch_size):
            X_batch[i] = X[(b*batch_size) + i:(b*batch_size) + i + seq_length]
            Y_batch[i] = X[(b*batch_size) + i + seq_length]
        #model.fit(X_batch, Y_batch, batch_size=batch_size, epochs=1, verbose=0)
        model.train_on_batch(X_batch, Y_batch)
    score = model.evaluate(X_batch, Y_batch)
    print("loss: ", score)

print("Done training")
########################################
# executing the model
########################################


epoch 0
accuracy:  3.0202081337
epoch 1
accuracy:  2.75914653015
epoch 2


## Synthesize data from the model

In [114]:
gen_seq_length = 100
#generated_sequence = None
#n_mod = model
X = np.reshape(X, (len(X), 1, d))
input_pattern = X[:100]
#print("input shape", input_pattern.shape)
print(''.join([ind_to_chars[ind[0]] for ind in np.argmax(input_pattern, axis=2)]))

for i in range(gen_seq_length):
    c = len(input_pattern)
    x = np.reshape(input_pattern, (1, c, d))

    prediction = model.predict(x, verbose=0)
    #new_ind = np.argmax(prediction)
    #print(prediction[0].shape)
    new_ind = np.random.choice(d, p=prediction[0])
    #print("new_ind ", new_ind)
    #print(new_ind)
    print(ind_to_chars[new_ind], end="")
    new_pattern = np.zeros((1, 1, d))
    new_pattern[0][0][new_ind] = 1
    #print("new ", new_pattern.shape)
    #print("in ", input_pattern.shape)
    input_pattern = np.concatenate((input_pattern, new_pattern))
    #print("last elem ", np.argmax(input_pattern[-1]))
    #print("in after ", input_pattern.shape)
    input_pattern = input_pattern[1:len(input_pattern)]
    #print(np.argmax(input_pattern, axis=2))
    #print("last elem after", np.argmax(input_pattern[-1]))
print("--------------")

indices = np.argmax(input_pattern, axis=2)
sequence = [ind_to_chars[ind[0]] for ind in indices]
#gen_sequence = [ind_to_chars[ind] for ind in indices]

print("".join(sequence))

He said the way my blue eyes shined
Put those Georgia stars to shame that night
I said Thats a li
ma wtordiik tor al tiss me mate yow I tloikir
Ous winsss axve go llo ah Wo cand
The oe alt bey--------------
ma wtordiik tor al tiss me mate yow I tloikir
Ous winsss axve go llo ah Wo cand
The oe alt bey
