<a href="https://colab.research.google.com/github/czengnn/lana-del-rey-lyrics-generator/blob/main/LDR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import pandas as pd 
import numpy as np 
import re 

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [49]:
# Load the lyrics table; the 'Unnamed: 0' column (presumably a saved index — confirm) is dropped.
# NOTE(review): this path only resolves when Google Drive is mounted at /content/drive;
# no mount step is visible in this notebook — confirm it happens before this cell.
songs = pd.read_csv('/content/drive/MyDrive/data/lana_lyrics_83.csv').drop('Unnamed: 0', axis=1)

# Strip bracketed section markers such as "[Intro: Lana Del Rey + sample]" or "[Pre-Chorus]".
# A raw string keeps `\[` and `\]` as regex escapes instead of invalid Python string escapes.
bracket_tag = re.compile(r'\[[^][]*\]')
songs['lyrics'] = songs['lyrics'].apply(lambda s: bracket_tag.sub('', s))

# Lower-case every song once and reuse the result for both the corpus string and the file.
lyrics_lower = [song.lower() for song in songs['lyrics']]

# Put all lyrics into one string; str.join avoids re-copying a growing string per song.
text = ''.join(lyrics_lower)

# Save lyrics in a .txt file, writing a newline after each song.
with open('lyricsText.txt', 'w', encoding="utf-8") as filehandle:
    for song in lyrics_lower:
        filehandle.write('%s\n' % song)

In [50]:
# Character vocabulary: every distinct character in the corpus, in sorted order
vocab = sorted(set(text))

# Report the corpus length (in characters) followed by the vocabulary size
print(f'Length of text: {len(text)} characters')
print(f'{len(vocab)} unique characters')

Length of text: 135928 characters
58 unique characters


In [51]:
# Peek at the first 300 characters of the lower-cased corpus to sanity-check the cleaning
print(text[:300])


why? ("got that?")
who, me? ("louder!")
why? ("got that?")


feet don't fail me now
take me to the finish line
oh, my heart, it breaks every step that i take
but i'm hoping at the gates, they'll tell me that you're mine
walking through the city streets, is it by mistake or design?
i feel so alone o


In [67]:
# Split a few sample words into their individual UTF-8 characters
example_texts = ['fail', 'me', 'now']
chars = tf.strings.unicode_split(
    example_texts,
    input_encoding='UTF-8',
)
chars

<tf.RaggedTensor [[b'f', b'a', b'i', b'l'], [b'm', b'e'], [b'n', b'o', b'w']]>

In [53]:
# Lookup layer that maps each character string to an integer id, built from the sorted `vocab`
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))

In [68]:
# Convert the example characters to ids; the per-word ragged structure is kept
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[28, 23, 31, 34], [35, 27], [36, 37, 45]]>

In [59]:
# Inverse lookup (id -> character) that shares the forward layer's vocabulary
lookup_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = preprocessing.StringLookup(vocabulary=lookup_vocabulary, invert=True)

In [69]:
# Round trip: map the example ids back to characters with the inverted lookup.
# Note: this rebinds `chars`, which previously held the output of unicode_split.
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'f', b'a', b'i', b'l'], [b'm', b'e'], [b'n', b'o', b'w']]>

In [71]:
def text_from_ids(ids):
  """Decode ids to characters and join them along the last axis into strings."""
  decoded_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

In [72]:
# Join the example ids back into one string per word
text_from_ids(ids)

<tf.Tensor: shape=(3,), dtype=string, numpy=array([b'fail', b'me', b'now'], dtype=object)>

In [73]:
# Encode the whole corpus: split it into characters, then map each character to its id
corpus_chars = tf.strings.unicode_split(text, 'UTF-8')
all_ids = ids_from_chars(corpus_chars)
all_ids

<tf.Tensor: shape=(135928,), dtype=int64, numpy=array([ 2, 45, 30, ..., 23, 24, 47])>

In [74]:
# Wrap the id tensor as a tf.data.Dataset that yields one character id per element
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [75]:
# Show the first 10 characters of the corpus, decoded one id at a time.
# The loop variable is deliberately not named `ids`: that name holds the example
# RaggedTensor used by earlier cells, and rebinding it to a scalar here would change
# what those cells produce when they are re-run.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))



w
h
y
?
 
(
"
g
o


In [76]:
# Number of input characters per training example
seq_length = 100
# Each example consumes seq_length+1 characters (the input plus its one-step-shifted target)
examples_per_epoch = len(text)//(seq_length+1)

In [78]:
# Group the character ids into chunks of seq_length+1; a short trailing chunk is dropped
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first 5 chunks back to text to check the chunking
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'\nwhy? ("got that?")\nwho, me? ("louder!")\nwhy? ("got that?")\n\n\nfeet don\'t fail me now\ntake me to the f'
b"inish line\noh, my heart, it breaks every step that i take\nbut i'm hoping at the gates, they'll tell m"
b"e that you're mine\nwalking through the city streets, is it by mistake or design?\ni feel so alone on a"
b" friday night\ncan you make it feel like home, if i tell you you're mine?\nit's like i told you, honey "
b'("louder!")\n\n\ndon\'t make me sad, don\'t make me cry\nsometimes love is not enough and the road gets tou'


In [79]:
def split_input_target(sequence):
    """Build an (input, target) pair from one sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the element that
    follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [81]:
# Turn every (seq_length+1)-id chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

In [82]:
# Decode the first (input, target) pair; the target is the input shifted by one character
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'\nwhy? ("got that?")\nwho, me? ("louder!")\nwhy? ("got that?")\n\n\nfeet don\'t fail me now\ntake me to the '
Target: b'why? ("got that?")\nwho, me? ("louder!")\nwhy? ("got that?")\n\n\nfeet don\'t fail me now\ntake me to the f'


### Create Training Batches

In [83]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of from `dataset` itself, so that
# re-running this cell does not shuffle and batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

### Building The Model

In [84]:
# Number of unique characters in the corpus.
# NOTE(review): the model below is built with len(ids_from_chars.get_vocabulary()) rather
# than this value; the recorded outputs show 58 unique characters but 60 model outputs,
# so the two sizes differ — confirm nothing downstream relies on `vocab_size` matching the model.
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [85]:
class MyModel(tf.keras.Model):
  """Character-level sequence model: Embedding -> GRU -> Dense.

  The Dense layer has `vocab_size` units and is created without an activation,
  so the model outputs one score per vocabulary entry for every timestep.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of embedding rows and number of Dense output units.
      embedding_dim: size of each embedding vector.
      rnn_units: number of GRU units.
    """
    # No positional arguments: the instance is already bound by super().
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output per timestep; return_state also returns the
    # final GRU state so a caller can feed it back in on a later call.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the forward pass.

    Args:
      inputs: ids passed to the embedding layer (int64 batches of shape
        (batch, sequence) in this notebook's dataset).
      states: optional initial GRU state; when None, the GRU's own initial
        state for the embedded input is used.
      return_state: when True, also return the final GRU state.
      training: forwarded to each layer.

    Returns:
      The Dense outputs, or a `(outputs, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [86]:
# Instantiate the model with the hyperparameters defined above
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    # (This is the lookup layer's own vocabulary length, not len(vocab).)
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

### Trying the model

In [87]:
# Run one batch through the untrained model to check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 60) # (batch_size, sequence_length, vocab_size)


In [88]:
# Print the layer list and parameter counts (the model has been called once above)
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  15360     
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
_________________________________________________________________
dense (Dense)                multiple                  61500     
Total params: 4,015,164
Trainable params: 4,015,164
Non-trainable params: 0
_________________________________________________________________


In [89]:
# Draw one id per timestep from the first example's predictions, then drop the
# trailing num_samples axis and convert to a NumPy array.
# Sampling is unseeded, so the ids differ from run to run.
first_example_predictions = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_predictions, num_samples=1),
    axis=-1,
).numpy()

In [90]:
# Display the sampled ids (one per timestep of the first example)
sampled_indices

array([ 2, 13, 29, 19, 48, 47,  8, 21, 48, 42, 42, 36, 40, 37, 56, 25, 34,
       41, 56, 54, 17, 55, 43,  8, 19, 16, 19, 11, 48, 19, 36, 16, 14, 41,
       51, 50, 32, 46, 11, 47, 43, 45, 49, 13,  5, 40, 15, 58, 47, 53, 45,
       20, 26, 55, 47, 36, 43, 19,  3, 53, 32, 59, 46, 30, 28, 45,  8,  8,
        3, 47, 30,  9, 27, 56, 33, 29,  0, 20, 36, 34, 40, 50, 25, 53, 22,
       19, 46, 17, 18,  8, 48, 12, 54,  9, 38,  5, 11, 40, 34, 34])

In [91]:
# Decode the first input example and the ids sampled from the untrained model side by side
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'gold\nand i was like...\n\n\ntake off, take off, take off all your clothes\ntake off, take off, take off '

Next Char Predictions:
 b'\n0g7zy);zttnro\xe2\x80\x99cls\xe2\x80\x99\xe2\x80\x8b5\xe2\x80\x94u)747.z7n41s\xc3\xb3\xc3\xb1jx.yuw\xc3\xad0"r2\xe2\x80\xa6y\xe2\x80\x8aw9d\xe2\x80\x94ynu7 \xe2\x80\x8aj\xef\xbb\xbfxhfw)) yh,e\xe2\x80\x99kg9nlr\xc3\xb1c\xe2\x80\x8a?7x56)z/\xe2\x80\x8b,p".rll'
