Import the libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Read the lyrics dataset and keep only its first 500 rows.
data = pd.read_csv('data.csv').head(500)

Count the number of words in each song's lyrics and summarize the word-count distribution (including the average)

In [4]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(str(text).split())


# Store each song's word count alongside its lyrics, then preview the frame.
data['Word_Count'] = data['lyrics'].map(_count_words)
data.head()

Unnamed: 0,lyrics,Number_of_words
0,it s all right it s all right yeah it s all ri...,98
1,maybe she s just too old for me living in a te...,457
2,challenge me i will stay by your side draw you...,283
3,i got strung up from our loving i wish you car...,532
4,i came to town the other night i heard the noi...,520


In [5]:
# Pull out the per-song word counts and show their summary statistics.
lyric_lengths = data.loc[:, 'Word_Count']
lyric_lengths.describe()

count    500.000000
mean     270.166000
std      161.845464
min        1.000000
25%      158.000000
50%      235.000000
75%      342.250000
max      933.000000
Name: Number_of_words, dtype: float64

Preprocessing

In [6]:
# Cast every lyric to a string, then lower-case it.
lyrics_as_text = data['lyrics'].astype(str)
lyrics = lyrics_as_text.str.lower()
# Preview the first 100 cleaned lyrics.
lyrics.head(100)

0     it s all right it s all right yeah it s all ri...
1     maybe she s just too old for me living in a te...
2     challenge me i will stay by your side draw you...
3     i got strung up from our loving i wish you car...
4     i came to town the other night i heard the noi...
                            ...                        
95    can t stop this money marathon but they gonna ...
96    been a long time since you touched me and made...
97    lover one lovely day love came planning to sta...
98    would you go to war for me baby would you cry ...
99    there s something rules our destiny right from...
Name: lyrics, Length: 100, dtype: object

Tokenization

In [7]:
# Learn the word -> integer id vocabulary from the whole lyrics corpus.
token_maker = Tokenizer()
token_maker.fit_on_texts(lyrics)

# Keras never assigns id 0 to a word (it is reserved), so the number of
# distinct ids the model must handle is one more than the vocabulary size.
total_words = 1 + len(token_maker.word_index)

In [9]:
# Convert every lyric into its list of integer word ids using the fitted
# tokenizer, then display the ids of the first song as a sanity check.
tokenized_lyrics = token_maker.texts_to_sequences(lyrics)
tokenized_lyrics[0]

[7,
 11,
 19,
 75,
 7,
 11,
 19,
 75,
 57,
 7,
 11,
 19,
 75,
 7,
 11,
 19,
 75,
 57,
 7,
 1018,
 19,
 98,
 1,
 27,
 6,
 867,
 44,
 11,
 6,
 906,
 867,
 42,
 7,
 252,
 48,
 5,
 907,
 9,
 44,
 450,
 35,
 5,
 40,
 3,
 33,
 9,
 440,
 12,
 2,
 1501,
 33,
 9,
 386,
 72,
 98,
 42,
 7,
 252,
 5,
 907,
 9,
 57,
 44,
 11,
 266,
 19,
 266,
 8,
 332,
 2505,
 74,
 332,
 2505,
 74,
 332,
 2505,
 74,
 1,
 27,
 6,
 867,
 44,
 11,
 6,
 906,
 867,
 42,
 7,
 252,
 48,
 5,
 907,
 9,
 44,
 20,
 266,
 19,
 266]

Build n-gram sequences (every prefix of at least two words from each song)

In [10]:
# For each song, collect every prefix containing at least two tokens:
# the last token of a prefix is the word to predict from the ones before it.
input_sequences = [
    song_tokens[:prefix_end]
    for song_tokens in tokenized_lyrics
    for prefix_end in range(2, len(song_tokens) + 1)
]

Pre-pad the sequences to a common length

In [None]:
# Every n-gram is left-padded with zeros up to the length of the longest one,
# so the final column of the matrix always holds the word to predict.
max_sequence_length = max(len(seq) for seq in input_sequences)

# pad_sequences already returns a 2-D NumPy array; wrapping it in np.array()
# would only create a second full copy of this very large matrix.
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

Create predictors and labels

In [20]:
# Everything except the last column is the model input ...
X = padded_sequences[:, :-1]
# ... and the last column is the id of the word that follows it.
labels = padded_sequences[:, -1]

# One-hot encode the target ids across the whole vocabulary.
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print(X.shape, max_sequence_length, sep='\n')

(134583, 932)
933


Create the bidirectional LSTM model and train it for up to 10 epochs (early stopping may end training sooner)

In [None]:
# Next-word predictor: embedding -> bidirectional LSTM -> dropout -> softmax
# over the vocabulary. The input length is one less than the padded length
# because the last token of each sequence was split off as the label.
model = Sequential([
    Embedding(total_words, 40, input_length=max_sequence_length-1),
    Bidirectional(LSTM(250)),
    Dropout(0.1),
    Dense(total_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop early if the training loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3, verbose=0, mode='auto')
history = model.fit(X, y, epochs=10, verbose=1, callbacks=[earlystop])

Save the model

In [None]:
# Write the trained model to an HDF5 file in the working directory.
model.save('song_lyrics_generator.h5')

Function to generate lyrics

In [11]:
def complete_this_song(seed_text, num_words):
    """Extend ``seed_text`` with ``num_words`` words predicted by the model.

    At each step the text generated so far is tokenized, left-padded (or
    left-truncated) to the model's input length, and the single most probable
    next word is appended (greedy arg-max decoding).

    Relies on the notebook-level ``token_maker``, ``model`` and
    ``max_sequence_length`` being defined before the function is called.

    Args:
        seed_text: Starting phrase to continue.
        num_words: Number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
    """
    for _ in range(num_words):
        token_list = token_maker.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        # verbose=0 keeps predict() from printing a progress bar for every
        # generated word.
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))

        # index_word is the tokenizer's own id -> word map, so no scan over
        # the vocabulary is needed. Id 0 (padding) has no word and falls back
        # to the empty string, as before.
        output_word = token_maker.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text

Load the model

In [12]:
# Reload the model from 'song_lyrics_generator.h5'; from here on `model`
# refers to the loaded copy.
model = load_model('song_lyrics_generator.h5')

Test case

In [13]:
# Named seed_text rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
seed_text = 'party all night'
generated_lyrics = complete_this_song(seed_text, 50)
print(generated_lyrics)

party all night to find you here i wonder i m a fool to play i m a live to spill you i m gonna make it to you i m using one way no i won t let go of my friend i m a live wire i m a live wire
