<a href="https://colab.research.google.com/github/czengnn/lana-del-rey-lyrics-generator/blob/main/LDR_by_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install matplotlib
! pip install keras

Collecting matplotlib
  Using cached matplotlib-3.4.2-cp37-cp37m-manylinux1_x86_64.whl (10.3 MB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.3.1-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
Collecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.4.2
Collecting keras
  Using cached Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [3]:
import pandas as pd 
import numpy as np 
import re 
import os
import time
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import sparse_categorical_crossentropy

from keras.models import Sequential
from keras.layers import LSTM, Activation, Flatten, Dropout, Dense, Embedding, TimeDistributed, Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [4]:
# load songs
songs1 = pd.read_csv('data/lana_lyrics_83.csv').drop('Unnamed: 0', axis=1)
songs2 = pd.read_csv('data/lana_lyrics_15.csv').drop('Unnamed: 0', axis=1)
songs = pd.concat([songs1, songs2], axis=0)
songs.shape

(98, 2)

In [5]:
# put lyrics into 1 string
text = ''
for song in songs['lyrics']:
    text = text + song.lower()
    
# remove the text with brackets around them, such as [Verse 1]
text = re.sub(r'\[[^][]*\]', '', text)

In [6]:
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Length of text: 161722 characters
61 unique characters


In [7]:
corpus = [line for line in text.split('\n') if line != '']
corpus[:10]

['why? ("got that?")',
 'who, me? ("louder!")',
 'why? ("got that?")',
 "feet don't fail me now",
 'take me to the finish line',
 'oh, my heart, it breaks every step that i take',
 "but i'm hoping at the gates, they'll tell me that you're mine",
 'walking through the city streets, is it by mistake or design?',
 'i feel so alone on a friday night',
 "can you make it feel like home, if i tell you you're mine?"]

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(set(corpus))
total_words = len(tokenizer.word_index) + 1
print(total_words)

2575


In [9]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [10]:
input_sequences[20:30]

[[42, 7, 161, 10, 2145, 141, 528],
 [42, 7, 161, 10, 2145, 141, 528, 17],
 [42, 7, 161, 10, 2145, 141, 528, 17, 2],
 [42, 7, 161, 10, 2145, 141, 528, 17, 2, 60],
 [20, 12],
 [20, 12, 838],
 [20, 12, 838, 53],
 [20, 12, 838, 53, 3],
 [20, 12, 838, 53, 3, 2096],
 [20, 12, 838, 53, 3, 2096, 803]]

In [11]:
# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,
                                         maxlen = max_sequence_len, padding='pre'))

In [12]:
input_sequences[:2]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 111,  32],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0, 111,  32,  17]], dtype=int32)

In [13]:
predictors, label = input_sequences[:,:-1], input_sequences[:,-1]

### Building The Model

In [14]:
def sparse_cat_loss(y_true,y_pred):
  return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

vocab_size = total_words
embed_dim = 50
def create_model(vocab_size, embed_dim):
    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=max_sequence_len-1))
    # Add an LSTM Layer
    model.add(Bidirectional(LSTM(150, return_sequences=True)))
    # A dropout layer for regularisation
    model.add(Dropout(0.2))
    # Add another LSTM Layer
    model.add(LSTM(100,return_sequences=False)) 
    model.add(Dense(vocab_size//2, activation='relu'))
    # In the last layer, the shape should be equal to the total number of words present in our corpus
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss=sparse_cat_loss, optimizer='adam', metrics='accuracy')  #(# Pick a loss function and an optimizer)
    return model

model = create_model(vocab_size, embed_dim)
print(model.summary())

[2021-05-12 04:13:43.188 tensorflow-2-3-cpu-py-ml-m5-xlarge-d436dc423c77fbec5bea3834b1e4:25 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-05-12 04:13:43.329 tensorflow-2-3-cpu-py-ml-m5-xlarge-d436dc423c77fbec5bea3834b1e4:25 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17, 50)            128750    
_________________________________________________________________
bidirectional (Bidirectional (None, 17, 300)           241200    
_________________________________________________________________
dropout (Dropout)            (None, 17, 300)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_______________________

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
epochs = 100
history = model.fit(predictors, label, epochs=epochs, verbose=1, callbacks=[early_stopping])

# Save the weights
model.save_weights('models/ldr_by_word/ldr_by_word')

Epoch 1/100
137/865 [===>..........................] - ETA: 43s - loss: 7.8321 - accuracy: 0.0365

In [None]:
# list all data in history
print(history.history.keys())

In [None]:
# summarize history for accuracy
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['accuracy'], label='accuracy')
plt.title('loss & accuracy')
plt.ylabel('value')
plt.xlabel('epoch')
plt.legend()
plt.show()

#### Recreate the model and load saved weights

In [21]:
# Create a new model instance
model_loaded = create_model(vocab_size, embed_dim)
model_loaded.load_weights('models/ldr_by_word/ldr_by_word')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f93aa7e2050>

In [22]:
def make_lyrics(model, seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                     maxlen=max_sequence_len-1,padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [23]:
predicted_lyrics = make_lyrics(model, 'lolita', 100)
print(predicted_lyrics)

lolita seems diddy sweatshirt sweatshirt pain age age seems mustang mustang chaos dining crowns anywhere crowns crowns clear there's thrills national upper snow smiling smiling for… till drag baking baking believe believe he's he's he's he's he's he's he's he's headshot headshot headshot headshot headshot headshot headshot headshot headshot headshot tryna tryna honestly crowns doll doll doll doll headshot threshold butterflies blurring dear truck line line commitment god free till began livin's teeth began teeth began teeth charge teeth a charge stopped stopped clear backseat clear backseat glow glow ciao ciao baking baking baking clear paved are are sweatshirt understand understand


In [24]:
predicted_lyrics = make_lyrics(model_loaded, 'lolita', 100)
print(predicted_lyrics)

lolita to it's coast but do all it thing better what the walls at doop you take you da huh off he my ride god 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause 'cause


In [25]:
predicted_lyrics = make_lyrics(model_loaded, 'baby blue', 100)
print(predicted_lyrics)

baby blue tired lived time and i'm my jam out girl the cry now be top make lookin' summer's soundin' tropic up dream get and to the have you the stronger da anywhere they'll on of nothin' all a soft powerful in a friend in lot gunnin' girl to scene we and then to see you have back mine is love when do up his you song i want is so me wha soul run get i just— patio light follow or to scene is stay it pearls them 'til do do do do do do do do the die me it's
