<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scope" data-toc-modified-id="Scope-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Scope</a></span></li><li><span><a href="#Loading-libraries" data-toc-modified-id="Loading-libraries-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading libraries</a></span></li></ul></div>

### Scope
Create and train the RNN, and use it to build a recommender system that suggests the next words of a lyric from a starting phrase.

### Loading libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku

from tensorflow.keras.models import load_model
# import string, os
# import warnings
# warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category='FutureWarning')

In [3]:
# Load the cleaned lyrics table from a CSV in the working directory and display it.
clean_lyrical_df = pd.read_csv("clean_lyrical_df.csv")
clean_lyrical_df

Unnamed: 0.1,Unnamed: 0,lyrics,song,artist
0,2,yeah when i grow up you know what i wanna be t...,"""When I Grow Up"" by NF:\n Yeah, when I grow...",nf
1,5,hey leave me alone panicstricken handle busin...,"""Leave Me Alone"" by NF:\n (Yeah)\n Hey!\...",nf
2,9,i just need some time im tryna think straight ...,"""If You Want Love"" by NF:\n I just need som...",nf
3,11,how could you leave so unexpected we waited we...,"""How Could You Leave Us"" by NF:\n How could...",nf
4,12,insidious is blind inception whats reality wit...,"""Mansion"" by NF:\n Insidious is blind incep...",nf
...,...,...,...,...
2254,4020,who do you believe in who do you believe in i...,"""Who Do U Believe In?"" by Token:\n [Intro:]...",token
2255,4022,productive successful im taught like that girl...,"""Humble (Remix)"" by Token:\n Productive, su...",token
2256,4023,token goin in you can call the beat a slot ma...,"""Left Somethin’"" by Token:\n [Verse 1]\n ...",token
2257,4026,uh its summer break yeah its summer break the...,"""Summer Break"" by Token:\n [Intro]\n Uh\...",token


In [4]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV by an
# earlier export -- TODO confirm).
# Reassigning the result instead of using inplace=True, together with
# errors="ignore", makes this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
clean_lyrical_df = clean_lyrical_df.drop(columns="Unnamed: 0", errors="ignore")

# Check
clean_lyrical_df.head(2)

Unnamed: 0,lyrics,song,artist
0,yeah when i grow up you know what i wanna be t...,"""When I Grow Up"" by NF:\n Yeah, when I grow...",nf
1,hey leave me alone panicstricken handle busin...,"""Leave Me Alone"" by NF:\n (Yeah)\n Hey!\...",nf


In [5]:
# Force every lyric to a Python string so the tokenizer only ever receives text.
# NOTE(review): if any lyric is missing (NaN), astype(str) turns it into the
# literal text "nan", which would then be tokenized as a word -- confirm the
# column has no missing values.
clean_lyrical_df["lyrics"] = clean_lyrical_df["lyrics"].astype(str)

In [6]:
# Tokenize by word rather than by character (char_level=False); text is lower-cased first.
lyrical_rap_tokenizer = Tokenizer(char_level=False, lower=True)
# Build the word index from the full lyrics column.
lyrical_rap_tokenizer.fit_on_texts(clean_lyrical_df["lyrics"])

Save tokenizer

In [7]:
import pickle
# saving
# Persist the fitted tokenizer so the same word <-> index mapping can be reused
# later without refitting. The context manager closes the file handle.
with open('lyrical_rap_tokenizer.pkl', 'wb') as handle:
    pickle.dump(lyrical_rap_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Lookup tables exposed by the fitted tokenizer: index -> word and word -> index.
number_to_word = lyrical_rap_tokenizer.index_word
word_to_number = lyrical_rap_tokenizer.word_index

# Iterating a dict yields its keys, so this collects every known word.
all_words = list(word_to_number)

print(f"Vocabulary size: {len(all_words)}")

Vocabulary size: 30861


Sliding window length

In [9]:
# Encode every song's lyrics as a list of integer word indices (one list per row).
dataset = lyrical_rap_tokenizer.texts_to_sequences(clean_lyrical_df["lyrics"])

In [10]:
# Sanity check: show the encoded word indices of the first song.
print(dataset[0])

[51, 38, 2, 742, 19, 3, 27, 34, 2, 64, 29, 82, 4, 841, 75, 10, 84, 3, 7, 2557, 462, 2, 64, 155, 51, 2, 27, 33, 167, 6, 266, 5, 2, 43, 84, 117, 501, 498, 2, 143, 55, 8205, 25, 11, 321, 16, 8, 2, 1507, 1, 958, 108, 1069, 178, 76, 1656, 13, 68, 106, 29, 122, 2627, 2, 396, 3, 100, 4144, 174, 3, 24, 6, 17, 565, 25, 2, 76, 558, 22, 4, 184, 30, 35, 289, 22, 10, 1603, 2, 27, 21, 1735, 826, 142, 7370, 5, 1604, 2, 203, 57, 151, 54, 4, 1289, 25, 12, 20, 54, 4, 1289, 169, 80, 6, 1684, 8, 178, 76, 13283, 1562, 3, 35, 100, 29, 4, 17768, 6, 46, 1, 2558, 35, 13284, 22, 10, 2, 81, 436, 222, 33, 628, 59, 2, 14, 8, 6, 29, 1, 5355, 51, 3, 1735, 102, 3, 27, 34, 2, 209, 25, 34, 11, 402, 18, 30, 151, 659, 10, 11, 145, 1409, 51, 2, 113, 47, 6, 1138, 24, 9, 2497, 14, 229, 549, 3387, 5, 1735, 24, 4, 518, 12, 823, 339, 1, 1290, 12, 20, 54, 4, 223, 15, 1043, 6, 10, 547, 1, 717, 2850, 2, 20, 14, 1, 508, 8206, 2, 144, 584, 54, 4, 2177, 334, 54, 4, 1260, 510, 2, 118, 54, 4, 2177, 17769, 24, 1, 2261, 59, 3, 83, 8, 54

In [11]:
# Sliding-window training pairs: each run of SEQUENCE_LENGTH consecutive word
# indices is an input, and the word that immediately follows it is the target.
SEQUENCE_LENGTH = 5

X, y = [], []

for song_tokens in dataset:
    # Number of complete windows that still have a following word to predict.
    # Zero or negative for songs that are too short: range() is then empty and
    # the slice below is empty too, so such songs contribute nothing.
    window_count = len(song_tokens) - SEQUENCE_LENGTH
    X.extend(
        song_tokens[start:start + SEQUENCE_LENGTH]
        for start in range(window_count)
    )
    # The targets are exactly the tokens from position SEQUENCE_LENGTH onward.
    y.extend(song_tokens[SEQUENCE_LENGTH:])

X = np.array(X)
y = np.array(y)

# Shapes of the inputs and of the targets
print(X.shape)
print(y.shape)

(1091123, 5)
(1091123,)


In [12]:
# One output class per vocabulary word, plus one extra index.
# NOTE(review): the extra slot is presumably for index 0, which the Keras
# Tokenizer does not assign to any word -- confirm.
number_of_classes = len(all_words) + 1
embedding_dim = 8

# Two stacked LSTMs followed by a dense head. Each stage is followed by batch
# normalization and 20% dropout. The final softmax gives one probability per
# class, i.e. the distribution over the next word.
lyrical_rap_model = Sequential([
    Embedding(number_of_classes, embedding_dim),

    # First recurrent stage keeps the whole sequence so the next LSTM can consume it.
    LSTM(700, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(0.2),

    # Second recurrent stage returns only its final output.
    LSTM(350, activation='tanh', return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),

    Dense(175, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(number_of_classes, activation='softmax'),
])

In [13]:
# Display the size of the softmax output layer (vocabulary size + 1).
number_of_classes

30862

In [14]:
# Compile model
# The targets in y are integer word indices rather than one-hot vectors, hence
# the sparse variant of categorical cross-entropy.
lyrical_rap_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Print the layer-by-layer architecture and parameter counts.
lyrical_rap_model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 8)           246896    
_________________________________________________________________
lstm_31 (LSTM)               (None, None, 700)         1985200   
_________________________________________________________________
batch_normalization_30 (Batc (None, None, 700)         2800      
_________________________________________________________________
dropout_30 (Dropout)         (None, None, 700)         0         
_________________________________________________________________
lstm_32 (LSTM)               (None, 350)               1471400   
_________________________________________________________________
batch_normalization_31 (Batc (None, 350)               1400      
_________________________________________________________________
dropout_31 (Dropout)         (None, 350)             

In [54]:
# Train on every (window, next word) pair; no validation data is passed here.
history = lyrical_rap_model.fit(
    x=X,
    y=y,
    epochs=50,
    batch_size=1024,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


Training is limited to 50 epochs to save time, because it is computationally intensive.

In [56]:
# saving the model
# Left commented out, presumably so that re-running the notebook does not
# overwrite the previously saved model file -- TODO confirm.
# lyrical_rap_model.save("lyrical_rap_model.h5")

In [16]:
# loading the model
# Replaces the in-memory model with the one stored in "lyrical_rap_model.h5".
# NOTE(review): the save call in the previous cell is commented out, so this
# file must already exist on disk, and it must have been trained with the same
# tokenizer/vocabulary as the one fitted above -- confirm.
lyrical_rap_model = load_model("lyrical_rap_model.h5")

In [18]:
def generate_text(input_phrase, next_words, model, tokenizer=None, sequence_length=None):
    """Extend ``input_phrase`` with ``next_words`` words sampled from ``model``.

    The phrase is encoded with the tokenizer. Then, one word at a time, the
    model is asked for the probability of every class being the next word and
    one index is drawn at random according to those probabilities.

    Parameters
    ----------
    input_phrase : str
        Seed text to continue.
    next_words : int
        Number of words to sample. ``0`` returns the tokenized-and-decoded
        seed phrase unchanged.
    model : object
        Anything with ``predict(batch)`` that returns, for a batch of shape
        ``(1, n_tokens)``, an array whose first row is a probability vector.
    tokenizer : object, optional
        Object providing ``texts_to_sequences`` and ``sequences_to_texts``.
        Defaults to the notebook-level ``lyrical_rap_tokenizer``.
    sequence_length : int, optional
        How many of the most recent tokens are fed to the model at each step.
        Defaults to the notebook-level ``SEQUENCE_LENGTH`` so that inference
        uses the same window size as the training pairs.

    Returns
    -------
    str
        The decoded phrase: seed words followed by the sampled words.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1, or if words are requested
        but no word of ``input_phrase`` is known to the tokenizer.

    Notes
    -----
    Sampling uses NumPy's global random state, so repeated calls give
    different results unless that state is seeded beforehand.
    """
    if tokenizer is None:
        tokenizer = lyrical_rap_tokenizer
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        # A window of 0 would make the [-0:] slice below select the whole phrase.
        raise ValueError("sequence_length must be at least 1")

    # process for the model
    processed_phrase = tokenizer.texts_to_sequences([input_phrase])[0]
    if next_words > 0 and not processed_phrase:
        raise ValueError(
            "input_phrase contains no word known to the tokenizer, "
            "so there is nothing to condition the model on"
        )

    for _ in range(next_words):
        # Feed only the most recent tokens; shorter prompts are passed as they are.
        context = processed_phrase[-sequence_length:]
        # float32 is kept from the original implementation.
        # NOTE(review): the Embedding layer is expected to cast these values
        # to integer indices -- confirm.
        network_input = np.array(context, dtype=np.float32).reshape((1, len(context)))

        # the RNN gives the probability of each word as the next one
        predict_proba = model.predict(network_input)[0]

        # sample one word using these chances; the number of classes is taken
        # from the prediction itself so it always matches the model in use
        predicted_index = np.random.choice(len(predict_proba), 1, p=predict_proba)[0]

        # add new index at the end of our list
        processed_phrase.append(int(predicted_index))

    # indices mapped to words - the method expects a list of lists so we need
    # the extra bracket. Decoding happens once, after the loop, so the result
    # is defined even when next_words is 0.
    return tokenizer.sequences_to_texts([processed_phrase])[0]

In [64]:
# Continue the prompt with 15 sampled words. Sampling is random and no seed is
# set in the visible cells, so re-running gives a different continuation.
generate_text('calls for pots and pans that I cook', 15, lyrical_rap_model)

'calls for pots and pans that i cook highway and i know you takin ova cause dont got my hopes plus then we'

In [40]:
# Continue the prompt with 10 sampled words.
generate_text('barking like a dog to make', 10, lyrical_rap_model)

'barking like a dog to make your crews need five flows you better get in they'

In [38]:
# Continue the prompt with 10 sampled words.
generate_text('the mountains ive seen but havent travelled', 10, lyrical_rap_model)

'the mountains ive seen but havent travelled on your mind soon on everything look yourself theyll get'

In [63]:
# Continue the prompt with 10 sampled words.
generate_text('delivered or seen but nothing can be', 10, lyrical_rap_model)

'delivered or seen but nothing can be plan for what i is business the only the kid'

In [62]:
# Longer generation: 25 sampled words appended to the prompt.
generate_text('to be furious is to be', 25, lyrical_rap_model)

'to be furious is to be lonely the perfect rappin terror and and now i part of my other believe some and the story for the rude before what these have'

In [61]:
# Short prompt (four words), continued with 10 sampled words.
generate_text('tshirt on the floor', 10, lyrical_rap_model)

'tshirt on the floor right there is two six nine nines to ya skin'