**Lyric generation using LSTMs**

Data set: Scraping lyrics of all songs from the internet.

In [0]:
from bs4 import BeautifulSoup
import requests
import pandas as pd 

BAND_NAME = 'Pink Floyd'

In [0]:
def get_song_links():
  url = 'https://www.allthelyrics.com/lyrics/pink_floyd'
  html = requests.get(url)
  soup = BeautifulSoup(html.text, 'html.parser')
  hrefs = []
  names = []
  songs = soup.find_all('a')

  for song in songs:
    if '/lyrics/pink_floyd/' in song['href'] and song['href'] not in hrefs:
      names.append(song.text)
      hrefs.append(song['href'])
      
  return hrefs, names

In [0]:
def get_song_lyrics(hrefs, names):
  lyrics = ""
  for href, name in zip(hrefs, names):
    url = 'https://www.allthelyrics.com' + href
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    div = soup.find('div', {'class': 'content-text-inner'})
    lyrics += " " + div.get_text()
  return lyrics

In [0]:
hrefs, names = get_song_links()
lyrics = get_song_lyrics(hrefs, names)

print(lyrics)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
export_csv = df.to_csv ('/content/drive/My Drive/Lyrics_generator/dataset_floyd.csv', index = None, header=True)

**Data cleaning**

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from array import array

from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import numpy as np
import string, os 
import re

In [12]:
all_lyrics = []
df = pd.read_csv("/content/drive/My Drive/Lyrics_generator/dataset_floyd.csv")
all_lyrics.extend(list(df.Lyrics.values))

all_lyrics = [l for l in all_lyrics if l != "Unknown"]
print(all_lyrics)



In [14]:
import string

def clean_text(doc):
	doc = doc.replace('--', ' ')
	tokens = doc.split()
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	tokens = [word for word in tokens if word.isalpha()]
	tokens = [word.lower() for word in tokens]
	return tokens

string_lyrics = ""
tokens = clean_text(string_lyrics.join(all_lyrics))
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

Total Tokens: 23028
Unique Tokens: 3098


In [28]:
length = 50 + 1
lines = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	line = ' '.join(seq)
	lines.append(line)
print('Total Sequences: %d' % len(lines))

Total Sequences: 22977


In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
text_seq = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1  

In [0]:
text_seq = np.array(text_seq)
X, y = text_seq[:,:-1], text_seq[:,-1]
y = ku.to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [29]:
checkpoint_path = "/content/drive/My Drive/Lyrics_generator/cp-{epoch:04d}.ckpt" 

def create_model(vocab_size, seq_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 50, input_length=seq_length))
  model.add(LSTM(100, return_sequences=True))
  model.add(LSTM(100))
  model.add(Dropout(0.5))
  model.add(Dense(100, activation='relu'))
  model.add(Dense(vocab_size, activation='softmax'))
  model.summary()

model = create_model(vocab_size, seq_length)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            154950    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3099)              312999    
Total params: 618,849
Trainable params: 618,849
Non-trainable params: 0
________________________________________________

In [149]:
import os
from keras.callbacks import ModelCheckpoint
cp_callback = ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1, period=5)
model.save_weights(checkpoint_path.format(epoch=0))
# model.load_weights("/content/drive/My Drive/Lyrics_generator/cp-0070.ckpt")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=100, callbacks = [cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: saving model to /content/drive/My Drive/Lyrics_generator/cp-0005.ckpt
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to /content/drive/My Drive/Lyrics_generator/cp-0010.ckpt
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: saving model to /content/drive/My Drive/Lyrics_generator/cp-0015.ckpt
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to /content/drive/My Drive/Lyrics_generator/cp-0020.ckpt
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: saving model to /content/drive/My Drive/Lyrics_generator/cp-0025.ckpt
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to /content/drive/My Drive/Lyrics_generator/cp-0030.ckpt
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: saving model to /content/drive/My Dr

<keras.callbacks.History at 0x7ff976a48908>

In [0]:
model.save('/content/drive/My Drive/Lyrics_generator/PinkFloydLyricsGenerator.h5')

In [9]:
model = load_model('/content/drive/My Drive/Lyrics_generator/PinkFloydLyricsGenerator.h5')
print("Loaded model")
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Loaded model
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            154950    
_________________________________________________________________
lstm_7 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               10100     
________________

In [0]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [27]:
# generate new text
from random import randint
# seed_text = lines[randint(0,len(lines))]
seed_text = "hello is there anybody out there just nod if you can hear me is there anyone at home come on now i hear youre feeling down i can ease your pain and get you on your feet again relax ill need some information first just the basic facts can you show where it hurts"
print(seed_text)

generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

hello is there anybody out there just nod if you can hear me is there anyone at home come on now i hear youre feeling down i can ease your pain and get you on your feet again relax ill need some information first just the basic facts can you show where it hurts
the same is no matter vegetable man now musk bang motionless upon the crowd of judging i have withdrawn and the generals gave thanks as the other ranks held pouring the next meal try to stay in the attic to you pleasednah nah nah nah nah nah nah nah nah
