<a href="https://colab.research.google.com/github/czengnn/lana-del-rey-lyrics-generator/blob/main/LDR_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd 
import numpy as np 
import re 
import os
import time

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

from keras.models import Sequential
from keras.layers import LSTM, Activation, Flatten, Dropout, Dense, Embedding, TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [2]:
# load songs
songs = pd.read_csv('/content/drive/MyDrive/data/lana_lyrics_83.csv').drop('Unnamed: 0', axis=1)

# put lyrics into 1 string
text = ''
for song in songs['lyrics']:
  text = text + song.lower()

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Length of text: 143660 characters
65 unique characters


In [3]:
# taking a look at the first 300 characters
print(text[:300])

[intro: lana del rey + sample]
why? ("got that?")
who, me? ("louder!")
why? ("got that?")

[verse 1]
feet don't fail me now
take me to the finish line
oh, my heart, it breaks every step that i take
but i'm hoping at the gates, they'll tell me that you're mine
walking through the city streets, is it 


### 2- Mapping characters :
 Make two dictionaries , one to convert chars to ints , the other to convert ints back to chars : 


In [4]:
# Mapping chars to ints
chars = sorted(list(set(text)))
int_chars = dict((i, c) for i, c in enumerate(chars))
chars_int = dict((i, c) for c, i in enumerate(chars))

In [5]:
# Get number of chars and vocab in our text
n_chars = len(text)
n_vocab = len(chars)

print('Total Characters : ' , n_chars) # number of all the characters in lyricsText.txt
print('Total Vocab : ', n_vocab) # number of unique characters

Total Characters :  143660
Total Vocab :  65


### Make samples and labels to feed the RNN

In [6]:
# process the dataset
seq_len = 100
data_X = []
data_y = []

for i in range(0, n_chars - seq_len, 1):
    # Input Sequeance(will be used as samples)
    seq_in  = text[i:i+seq_len]
    # Output sequence (will be used as target)
    seq_out = text[i + seq_len]
    # Store samples in data_X
    data_X.append([chars_int[char] for char in seq_in])
    # Store targets in data_y
    data_y.append(chars_int[seq_out])
n_patterns = len(data_X)
print( 'Total Patterns : ', n_patterns)

Total Patterns :  143560


In [8]:
text[i + seq_len]

'y'

### 4- Prepare the samples and labels :
prepare the samples and labels to be ready to go into our model.
* Reshape the samples
* Normalize them
* One hot encode the output targets 

In [7]:
# Reshape X to be suitable to go into LSTM RNN :
# X = np.reshape(data_X , (n_patterns, seq_len, 1))
# Normalizing input data :
# X = X/ float(n_vocab)
X = np.array(data_X)/ float(n_vocab)
# One hot encode the output targets :
y = np_utils.to_categorical(data_y)

In [10]:
X

array([[0.4       , 0.55384615, 0.63076923, ..., 0.01538462, 0.23076923,
        0.41538462],
       [0.55384615, 0.63076923, 0.72307692, ..., 0.23076923, 0.41538462,
        0.        ],
       [0.63076923, 0.72307692, 0.69230769, ..., 0.41538462, 0.        ,
        0.50769231],
       ...,
       [0.69230769, 0.49230769, 0.63076923, ..., 0.15384615, 0.01538462,
        0.44615385],
       [0.49230769, 0.63076923, 0.43076923, ..., 0.01538462, 0.44615385,
        0.43076923],
       [0.63076923, 0.43076923, 0.47692308, ..., 0.44615385, 0.43076923,
        0.44615385]])

After we finished processing the dataset , we will start building our LSTM RNN model .

## Building The Model 

In [8]:
LSTM_layer_num = 4 # number of LSTM layers
layer_size = [256,256,256,256] # number of nodes in each layer
batch_size = 128

In [9]:
model = Sequential()
model.add(Embedding(n_vocab, layer_size[0], batch_input_shape=[batch_size, None]))
for i in range(LSTM_layer_num) :
    model.add(LSTM(layer_size[i], return_sequences=True, stateful=True))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 256)          16640     
_________________________________________________________________
lstm (LSTM)                  (128, None, 256)          525312    
_________________________________________________________________
lstm_1 (LSTM)                (128, None, 256)          525312    
_________________________________________________________________
lstm_2 (LSTM)                (128, None, 256)          525312    
_________________________________________________________________
lstm_3 (LSTM)                (128, None, 256)          525312    
_________________________________________________________________
dense (Dense)                (128, None, 65)           16705     
_________________________________________________________________
activation (Activation)      (128, None, 65)           0

In [11]:
# # Configure the checkpoint :
# checkpoint_name = 'Weights-LSTM-improvement-{epoch:03d}-{loss:.5f}-bigger.hdf5'
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='loss', verbose = 1, save_best_only = True, mode ='min')
# callbacks_list = [checkpoint]

### Training

In [62]:
# Fit the model :
model_params = {'epochs':30,
                'batch_size':128,
                # 'callbacks':callbacks_list,
                'verbose':1,
                'validation_split':0.2,
                'validation_data':None,
                'shuffle': True,
                'initial_epoch':0,
                'steps_per_epoch':None,
                'validation_steps':None}

model.fit(X,
          y,
          epochs = model_params['epochs'],
           batch_size = model_params['batch_size'],
        #    callbacks= model_params['callbacks'],
           verbose = model_params['verbose'],
           validation_split = model_params['validation_split'],
           validation_data = model_params['validation_data'],
           shuffle = model_params['shuffle'],
           initial_epoch = model_params['initial_epoch'],
           steps_per_epoch = model_params['steps_per_epoch'],
           validation_steps = model_params['validation_steps'])

Epoch 1/30


ValueError: ignored

### Lyrics Generation

In [26]:
len(data_X)

143560

In [33]:
[chars_int[char] for char in 'james dean']

[37, 28, 40, 32, 46, 1, 31, 32, 28, 41]

In [36]:
start = np.random.randint(0, len(data_X)-1)
pattern = data_X[start]
print('Seed : ')
print("\"",''.join([int_chars[value] for value in pattern]), "\"\n")


Seed : 
" hydrangea, cold cash divine
cashmere, cologne, and white sunshine
red racing cars, sunset and vine
t "



In [38]:
type(pattern)

list

In [39]:
# set a random seed :
start = np.random.randint(0, len(data_X)-1)
pattern = data_X[start]
print('Seed : ')
print("\"",''.join([int_chars[value] for value in pattern]), "\"\n")


# How many characters you want to generate
generated_characters = 300

# Generate Charachters :
new_lyrics = ''
for i in range(generated_characters):
    x = np.reshape(pattern, ( 1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x,verbose = 0)
    index = np.argmax(prediction)
    result = int_chars[index]
    #seq_in = [int_chars[value] for value in pattern]
    new_lyrics = new_lyrics + result
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print('\nDone')


Seed : 
" , honey
be the first who ever did
ah, ah, ah, ah, ah, ah
hold me, love me, touch me, honey
be the fi "


Done


In [40]:
print(new_lyrics)

rst who ever did

[verse 2]
kerosene in my hands
you make me mad, on fire again
all the pills that you take
violet, blue, green, red to keep me at arm's length don't work
[chorus]
there's things i wanna say to you, but i'll just let you live
like if you hold me without hurting me
you'll be the first


In [46]:
def generate_text(model, start_seed,gen_size=100,temp=1.0):
    '''
    model: Trained Model to Generate Text
    start_seed: Intial Seed text in string form
    gen_size: Number of characters to generate
    Basic idea behind this function is to take in some seed text, format it so
    that it is in the correct shape for our network, then loop the sequence as
    we keep adding our own predicted characters. Similar to our work in the RNN
    time series problems.
    '''
    # Number of characters to generate
    num_generate = gen_size
    # Vecotrizing starting seed text
    input_eval = [chars_int[s] for s in start_seed]
    # Expand to match batch format shape
    input_eval = tf.expand_dims(input_eval, 0)
    # Empty list to hold resulting generated text
    text_generated = []
    # Temperature effects randomness in our resulting text
    # The term is derived from entropy/thermodynamics.
    # The temperature is used to effect probability of next characters.
    # Higher probability == lesss surprising/ more expected
    # Lower temperature == more surprising / less expected
    temperature = temp
    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        # Generate Predictions
        predictions = model(input_eval)
        # Remove the batch shape dimension
        predictions = tf.squeeze(predictions, 0)
        # Use a cateogircal disitribution to select the next character
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        # Pass the predicted charracter for the next input
        input_eval = tf.expand_dims([predicted_id], 0)
        # Transform back to character letter
        text_generated.append(ind_to_char[predicted_id])
    return (start_seed + ''.join(text_generated))


print(generate_text(model,"flower",gen_size=1000))

ValueError: ignored