### Lyrics Generator using a Taylor Swift Lyrics Dataset

In [20]:
# Import the dependencies
import numpy as np
import pandas as pd
import sys
from keras.models import Sequential
from keras.layers import (
    LSTM,
    Activation,
    Flatten,
    Dropout,
    Dense,
    Embedding,
    TimeDistributed,
)
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [21]:
# Loading Dataset
dataset = pd.read_csv("data/taylor_swift_lyrics.csv", encoding="latin1")

dataset.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


Concatenate the lines of each song to get each song by its own string

In [22]:
def process_first_line(
    lyrics: list, song_id: list, song_name: list, row: int
) -> tuple[list, list, list]:
    lyrics.append(row["lyric"] + "\n")
    song_id.append(row["year"] * 100 + row["track_n"])
    song_name.append(row["track_title"])

    return lyrics, song_id, song_name


# define empty lists for the lyrics, song_id, song_name
lyrics, song_id, song_name = [], [], []

# song_number indicates the song number in the dataset
song_number = 1

# i indicates the song number
i = 0
is_first_line = True

# Iterate through every lyrics line and join them together for each song independently
for index, row in dataset.iterrows():
    if song_number == row["track_n"]:
        if is_first_line:
            lyrics, song_id, song_name = process_first_line(
                lyrics, song_id, song_name, row
            )
            is_first_line = False
        else:
            # if we still in the same song, keep joining the lyrics lines
            lyrics[i] += row["lyric"] + "\n"
    # When it's done joining a song's lyrics lines , go to the next song :
    else:
        lyrics, song_id, song_name = process_first_line(lyrics, song_id, song_name, row)
        songNumber = row["track_n"]
        i += 1

Define a new pandas dataframe to save song_id, song_name, lyric

In [23]:
lyrics_data = pd.DataFrame(
    {"song_id": song_id, "song_name": song_name, "lyrics": lyrics}
)

Saving lyrics to a text file

In [24]:
with open("data/lyrics_text.txt", "w", encoding="UTF-8") as file:
    for list_item in lyrics:
        file.write("%s\n" % list_item)

### Preprocessing the Lyrics

Convert lyrics to lowercase

In [25]:
text_file_name = "data/lyrics_text.txt"

raw_rext = open(text_file_name, encoding="UTF-8").read()
raw_text = raw_rext.lower()

Mapping characters


In [26]:
# Make 2 dictionaries, one to convert chars to ints, the other to convert ints back to chars
chars = sorted(list(set(raw_text)))

int_chars = dict((i, c) for i, c in enumerate(chars))
chars_int = dict((i, c) for c, i in enumerate(chars))

In [27]:
# Get number of chars and vocab in our text
n_chars = len(raw_text)
n_vocab = len(chars)

print("Total Characters : ", n_chars)  # number of all the characters in lyrics_text.txt
print("Total Vocab : ", n_vocab)

Total Characters :  178312
Total Vocab :  60


Make samples and labels

In [28]:
# process the dataset:
seq_len = 100
data_X = []
data_Y = []

for i in range(0, n_chars - seq_len, 1):
    # Input Sequence(will be used as samples)
    seq_in = raw_text[i : i + seq_len]

    # Output Sequence(will be used as target)
    seq_out = raw_text[i + seq_len]

    # Store samples in data_X
    data_X.append([chars_int[char] for char in seq_in])

    # Store targets in data_y
    data_Y.append(chars_int[seq_out])

n_patterns = len(data_X)
print("Total Patterns : ", n_patterns)

Total Patterns :  178212


Prepare the samples and labels

In [29]:
# Reshape the X to be suitable to go into LSTM RNN:
X = np.reshape(data_X, (n_patterns, seq_len, 1))

# Normalizing Input Data
X = X / float(n_vocab)

# one hot encode the output targets
Y = np_utils.to_categorical(data_Y)

### Building the Model

In [30]:
LSTM_layer_num = 4  # number of LSTM layers
layer_size = [256, 256, 256, 256]  # number of nodes in each layer

In [31]:
model = Sequential()

In [32]:
# Adding an Input Layer
model.add(
    LSTM(
        layer_size[0], input_shape=(X.shape[1], X.shape[2]), return_sequences=True
    )
)

In [33]:
# Add some hidden layers
for i in range(1, LSTM_layer_num):
    model.add(LSTM(layer_size[i], return_sequences=True))

In [34]:
# Flatten the Data
model.add(Flatten())

In [35]:
model.add(Dense(Y.shape[1]))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [36]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 256)          264192    
                                                                 
 lstm_1 (LSTM)               (None, 100, 256)          525312    
                                                                 
 lstm_2 (LSTM)               (None, 100, 256)          525312    
                                                                 
 lstm_3 (LSTM)               (None, 100, 256)          525312    
                                                                 
 flatten_1 (Flatten)         (None, 25600)             0         
                                                                 
 dense_1 (Dense)             (None, 60)                1536060   
                                                                 
 activation_1 (Activation)   (None, 60)               

In [37]:
# Configure the Checkpoint:
checkpoint_name = "Weights-LSTM-improvement-{epoch:03d}-{loss:.5f}-bigger.hdf5"
checkpoint = ModelCheckpoint(
    checkpoint_name, monitor="loss", verbose=1, save_best_only=True, mode="min"
)
callbacks_list = [checkpoint]

### Training

In [38]:
# Fit the model :
model_params = {
    "epochs": 30,
    "batch_size": 128,
    "callbacks": callbacks_list,
    "verbose": 1,
    "validation_split": 0.2,
    "validation_data": None,
    "shuffle": True,
    "initial_epoch": 0,
    "steps_per_epoch": None,
    "validation_steps": None,
}
model.fit(
    X,
    Y,
    epochs=model_params["epochs"],
    batch_size=model_params["batch_size"],
    callbacks=model_params["callbacks"],
    verbose=model_params["verbose"],
    validation_split=model_params["validation_split"],
    validation_data=model_params["validation_data"],
    shuffle=model_params["shuffle"],
    initial_epoch=model_params["initial_epoch"],
    steps_per_epoch=model_params["steps_per_epoch"],
    validation_steps=model_params["validation_steps"],
)

Epoch 1/30
  13/1114 [..............................] - ETA: 47:20 - loss: 3.3025