### Setup

In [None]:
import pandas as pd
import numpy as np
import re
import random
from tensorflow import keras
from keras import layers
from keras import optimizers

#### Importing and Cleaning Data

In [None]:
# Load the song dataset; its "lyrics" column is the text corpus used below.
songs = pd.read_csv("doj_songs.csv")

In [None]:
# Tokenize Data

# Build two corpus strings, one song at a time:
#   text  - the raw, lower-cased lyrics
#   clean - only runs of a-z and apostrophes, separated by single spaces
# Each song is cleaned on its own and everything is joined once at the end.
# Cleaning a running accumulator inside the loop would re-append every earlier
# song on each iteration, duplicating lyrics and growing the corpus
# quadratically.
# Rows with missing lyrics are skipped; str(NaN) would otherwise inject the
# literal word "nan" into the corpus.
raw_lyrics = [str(line).lower() for line in songs["lyrics"].dropna()]
cleaned_lyrics = [" ".join(re.findall(r"[a-z']+", lyric)) for lyric in raw_lyrics]

# Joining with a space keeps the last word of one song from fusing with the
# first word of the next. `text` is at least as long as `clean`, so any index
# arithmetic on the model's input must be based on `clean`, not `text`.
text = " ".join(raw_lyrics)
clean = " ".join(lyric for lyric in cleaned_lyrics if lyric)

# every character of the cleaned corpus (the alphabet is derived from these)
tokens = re.findall(r"[a-z'\s]", clean)

In [None]:
# Define the Alphabet

# sorted vocabulary of the distinct characters found in the corpus
characters = sorted(set(tokens))

# lookup table: character -> integer id
char_to_index = {char: index for index, char in enumerate(characters)}

# lookup table: integer id -> character
index_to_char = dict(enumerate(characters))

In [None]:
# vocabulary size; this is also the width of each one-hot vector and of the model's output layer
len(characters)

28

In [None]:
# Create Training Sequences

# slide a fixed-width window over the cleaned text
maxlen = 20  # window length n (characters per input sequence)
step = 1  # how far the window advances at each iteration

# start offset of every window that still has a character following it
window_starts = range(0, len(clean) - maxlen, step)

# list of sequences: the maxlen-character window beginning at each offset
sequences = [clean[start : start + maxlen] for start in window_starts]

# list of next characters model should predict: the one right after each window
next_characters = [clean[start + maxlen] for start in window_starts]

In [None]:
# Label Encode Training Sequences (one-hot encoding)

# create empty matrices for input and output sets
# input: each n-length sequence in sequences list
# output: next character after each n-length sequence
# i.e.: sentence = "hello there"
#       sequence = "hel"
#       next char = "l"
#
# x has shape (number of sequences, maxlen, alphabet size)
# y has shape (number of sequences, alphabet size)
#
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sequences), maxlen, len(characters)), dtype=bool)  # input
y = np.zeros((len(sequences), len(characters)), dtype=bool)  # output

for i, chunk in enumerate(sequences):
    # mark the character present at each position of the window
    for j, c in enumerate(chunk):
        x[i, j, char_to_index[c]] = 1
    # mark the character that follows the window
    y[i, char_to_index[next_characters[i]]] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sequences), maxlen, len(characters)), dtype=np.bool)  # input
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sequences), len(characters)), dtype=np.bool)  # output


### Build the Model
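A single LSTM layer followed by a dense softmax layer that outputs a probability for each character in the alphabet.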

In [None]:
# Character-level model: a one-hot encoded window of maxlen characters goes in,
# a probability distribution over the next character comes out.
model = keras.Sequential()
model.add(keras.Input(shape=(maxlen, len(characters))))
model.add(layers.LSTM(128))  # single recurrent layer with 128 units
model.add(layers.Dense(len(characters), activation="softmax"))  # one probability per character

2021-12-11 00:40:00.544635: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# RMSprop optimizer with a learning rate of 0.01
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)

In [None]:
# categorical cross-entropy compares the softmax output with the one-hot next-character target
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [None]:
# print each layer's output shape and parameter count
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               80384     
                                                                 
 dense (Dense)               (None, 28)                3612      
                                                                 
Total params: 83,996
Trainable params: 83,996
Non-trainable params: 0
_________________________________________________________________


### Prepare the Text Sampling Function

In [None]:
# Function to sample an index from a probability array

def sample(predictions, temperature=1.0):
    """Draw one index from `predictions` after temperature re-scaling.

    The probabilities are re-weighted as p ** (1 / temperature) and
    re-normalised, then a single index is drawn at random from the result.
    Temperatures below 1 sharpen the distribution towards its most likely
    entries; temperatures above 1 flatten it.

    Parameters
    ----------
    predictions : array-like of float
        Non-negative weights, one per index (e.g. a softmax output).
    temperature : float, default 1.0
        Re-scaling factor. A temperature of exactly 0 is treated as the
        greedy limit and returns the index of the largest prediction.

    Returns
    -------
    numpy integer
        The sampled index into `predictions`.
    """
    predictions = np.asarray(predictions).astype("float64")

    # Dividing by a zero temperature would produce NaN probabilities; the
    # limit of the re-scaled distribution is a point mass on the argmax.
    if temperature == 0:
        return np.argmax(predictions)

    # log(0) is -inf, which correctly becomes probability 0 after exp();
    # silence the divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide="ignore"):
        scaled = np.log(predictions) / temperature

    # Shifting by the maximum leaves the normalised result unchanged but
    # keeps exp() from overflowing or underflowing at small temperatures.
    scaled = scaled - np.max(scaled)
    exp_predictions = np.exp(scaled)
    probabilities = exp_predictions / np.sum(exp_predictions)

    # one multinomial trial yields a one-hot row; argmax recovers its index
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)

### Train the Model

In [None]:
epochs = 40  # number of training epochs; text is generated after each one
size = 128  # batch size passed to model.fit

In [None]:
# Train one epoch at a time and, after each epoch, print text generated at
# several sampling temperatures so progress can be judged by eye.
generated_length = 400  # number of characters to generate per sample

for epoch in range(epochs):

    model.fit(x, y, batch_size = size, epochs=1)
    print()
    print("Generating text after epoch: %d" %epoch)

    # The seed is sliced from `clean`, so the start index has to be bounded by
    # len(clean). Bounding it by another string either restricts the seeds to
    # a prefix of the corpus or lets the slice run past the end and come back
    # shorter than maxlen.
    start_index = random.randint(0, len(clean) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print("Diversity: ", diversity)

        generated = ""
        # restart from the same seed for every diversity so outputs are comparable
        sentence = clean[start_index:start_index + maxlen]
        print("Generating with seed: '" + sentence + "'")

        for i in range(generated_length):
            # one-hot encode the current window as a batch of one
            x_predict = np.zeros((1, maxlen, len(characters)))

            for t, char in enumerate(sentence):
                x_predict[0, t, char_to_index[char]] = 1.0
            predictions = model.predict(x_predict, verbose = 0)[0]
            next_index = sample(predictions, diversity)
            next_char = index_to_char[next_index]
            # slide the window forward by one character
            sentence = sentence[1:] + next_char
            generated += next_char

        print("Generated: ", generated)
        print()

  603/19275 [..............................] - ETA: 22:32 - loss: 1.2906