# Pink Floyd Lyrics Generator Using ANN

## Data import

In [41]:
import pandas as pd
dataset = pd.read_csv('pink_floyd_lyrics.csv')
dataset = dataset['lyrics']
dataset.head(10)

0    "Moon in both [houses]..."...Scorpio, [Arabian...
1    Lucifer Sam, siam cat\nAlways sitting by your ...
2    There was a king who ruled the land\nHis Majes...
3    Alone in the clouds all blue\nLying on an eide...
4    TCH TCH\nAHH (AHH)\nTCH TCH\nAHH AHH\nDoi doi\...
5    Doctor, doctor!\nI’m in bed (Doctor, doctor)\n...
6                                                  NaN
7    I want to tell you a story\nBout’ a little man...
8    All movement is accomplished in six stages\nAn...
9    The black and green scarecrow as everyone know...
Name: lyrics, dtype: object

### Regular Expression to clean the text

In [32]:
import re
for i in range(len(dataset)):
    dataset[i] = re.sub(r'[ÁÀ]', 'A', str(dataset[i]))
    dataset[i] = re.sub(r'[áà]', 'a', str(dataset[i]))
    dataset[i] = re.sub(r'[ÉÈËЕ]', 'E', str(dataset[i]))
    dataset[i] = re.sub(r'[éèëе]', 'e', str(dataset[i]))
    dataset[i] = re.sub(r'[ÍÌ]', 'I', str(dataset[i]))
    dataset[i] = re.sub(r'[íì]', 'i', str(dataset[i]))
    dataset[i] = re.sub(r'[ÓÒŌ]', 'O', str(dataset[i]))
    dataset[i] = re.sub(r'[óòō]', 'o', str(dataset[i]))
    dataset[i] = re.sub(r'[ÚÙÜ]', 'U', str(dataset[i]))
    dataset[i] = re.sub(r'[úùü]', 'u', str(dataset[i]))
    dataset[i] = re.sub(r'[ćč]', 'c', str(dataset[i]))
    dataset[i] = re.sub(r'[ĆČ]', 'c', str(dataset[i]))
    dataset[i] = re.sub(r'[^a-zA-Z0-9ñÑ,.:;?[\]()!"\'‘’“”…¡¿\n ]', '', str(dataset[i]))
dataset.to_csv('lyrics.txt', index=False, header=False) 

In [43]:
with open('lyrics.txt', 'r', encoding='utf-8') as file:
    data = file.read()
print(data[0:300])

"""Moon in both [houses]...""...Scorpio, [Arabian Skies], Libra...""...Pluto was not discovered until 1930...""
Lime and limpid green, a second scene
A fight between the blue you once knew
Floating down, the sound resounds
Around the icy waters underground
Jupiter and Saturn, Oberon, Miranda and Tit


## Tokenizer

In [34]:
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(data)
max_id = len(tokenizer.word_index)
dataset_size = tokenizer.document_count

In [45]:
import numpy as np
[encoded] = np.array(tokenizer.texts_to_sequences([data])) - 1

## Pre process the data

In [46]:
import tensorflow as tf
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [47]:
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [48]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [49]:
np.random.seed(15)
tf.random.set_seed(15)

In [50]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [51]:
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [52]:
dataset = dataset.prefetch(1)
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 54) (32, 100)


## Fit the Model

In [20]:
from tensorflow import keras
model = tf.keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                        dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                        dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Predict

In [21]:
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

In [24]:
def next_char(text, temperature=1):
    X_new = preprocess([text])
    Y_pred = model.predict(X_new, verbose=0)[0, -1:, :]
    rescaled_logits = tf.math.log(Y_pred) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [25]:
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

## Output

In [40]:
print(complete_text("Money is ", n_chars=300, temperature=0.4))

Money is gone that it's always been?
could be the hoarts and haggles what have been and haggle
for you?
and did you know staining away?"
"one sould, maggie, what have we gonna me trons
in the window this sound of the night
and if i can treat in the dream
how can you ever wanted to be to rime

fow you
must he


## Saving the model

In [36]:
# Guardamos el modelo
model.save('model.h5')

In [37]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)