In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import re
import string
from unidecode import unidecode
from dataclasses import dataclass

### Text Preprocessing

In [2]:
# Path to the lyrics CSV
file = r'./data/drake_data.csv'

# The file is organised in columns, so it is loaded into a pandas DataFrame
data = pd.read_csv(
    file,
    sep=",",
)

# Preview the first three rows
display(data.head(3))

Unnamed: 0,album,lyrics_title,lyrics_url,lyrics,track_views
0,Certified Lover Boy,Certified Lover Boy* Lyrics,https://genius.com/Drake-certified-lover-boy-l...,[Verse]\nPut my feelings on ice\nAlways been a...,8.7K
1,Certified Lover Boy,Like I’m Supposed To/Do Things Lyrics,https://genius.com/Drake-like-im-supposed-to-d...,[Verse]\nHands are tied\nSomeone's in my ear f...,38.8K
2,Certified Lover Boy,Not Around Lyrics,https://genius.com/Drake-not-around-lyrics,"[Intro]\nYeah, we back\nWassup ladies?\nSwisha...",129.8K


In [3]:
# keep only the rows whose lyrics are present; rows without lyrics are of no interest
data = data[data['lyrics'].notna()]

In [4]:
# collect the lyrics column as a plain Python list, one entry per song
drake_songs = data['lyrics'].tolist()

In [5]:
# function to clean text
def _is_drake_header(header: str) -> bool:
    '''
    Tell whether a section header such as "[Verse 1: Drake]" credits Drake alone.

    Args:
        header (str): a header line, square brackets included

    Returns:
        bool: True when no artist is listed or when the only artist token is "Drake"
    '''

    # position of the colon that separates the section name from the artists
    colon_at = header.find(':')

    # when the artist is not specified its drake
    if colon_at == -1:
        return True

    # everything after the colon is the artist list; split() with no argument
    # tolerates a missing or repeated space after the colon
    singers = header[colon_at + 1:].replace(']', '').split()

    # any extra token (a second artist, "&", a multi-word name, ...) means the
    # section is not a pure drake one
    return singers == ['Drake']


def clean_text(song: str, line_breaks_replacement: str = ' '):
    '''
    Cleans a specific drake song

    The lyrics are transliterated to ASCII with `unidecode`, split into lines,
    and only the lines that belong to sections sung by drake alone are kept.
    Kept lines are lower-cased and stripped of punctuation.

    Args:
        song (str): text with the lyrics of a specific song
        line_breaks_replacement (str): string used to join the kept lines,
            i.e. what replaces the original line breaks

    Returns:
        drake_verses_joined (str): kept lines joined with `line_breaks_replacement`
    '''

    # transliterate non-ASCII characters (curly quotes, accents, ...) to ASCII
    normalized_song = unidecode(song)

    # lines kept so far
    drake_verses = []

    # lines that appear before the first header are attributed to drake
    drake = True

    for verse in normalized_song.split('\n'):

        # skip empty lines
        if not verse:
            continue

        # a section header is a line that *starts* with a square bracket; a bracket
        # in the middle of a lyric line (e.g. "[?]") must not reset the singer flag
        if verse.lstrip().startswith('['):
            drake = _is_drake_header(verse)
            continue

        if drake:

            # remove punctuation from the verse and lower-case it
            clean_verse = ''.join(ch.lower() for ch in verse if ch not in string.punctuation)

            # append to the list
            drake_verses.append(clean_verse)

    # join all of drake verses with the requested separator
    drake_verses_joined = line_breaks_replacement.join(drake_verses)

    return drake_verses_joined

In [6]:
# clean every song and concatenate the results into one long text
only_drake = ' '.join(map(clean_text, drake_songs))

### Character vectorization

In [7]:
# collect the distinct characters of the cleaned lyrics, in sorted order
drake_chars = list(set(only_drake))
drake_chars.sort()
print(f'{len(drake_chars)} unique characters')

37 unique characters


In [8]:
# build a lookup layer that maps each character of the vocabulary to an integer id
# (mask_token=None: no mask token is reserved)
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(drake_chars), mask_token=None)

In [9]:
# split the whole text into a tensor of individual UTF-8 characters
# (despite its name, `drake_ids` holds characters, not ids)
drake_ids = tf.strings.unicode_split(only_drake, 'UTF-8')

# with the StringLookup layer, turn every character into its integer id
ids = ids_from_chars(drake_ids)

In [10]:
# wrap the tensor of character ids in a tf.data.Dataset, sliced along its first axis
ids_dataset = tf.data.Dataset.from_tensor_slices(ids)

In [11]:
# number of characters in each training input
sequence_length = 50

# group the ids into chunks of sequence_length + 1 so that an input and a target of
# sequence_length each can be sliced out of every chunk; a shorter final chunk is dropped
sequences = ids_dataset.batch(sequence_length + 1, drop_remainder=True)

In [12]:
# split input and output
def split_input_target(sequence):
    '''
    Build an (input, target) pair from one sequence by shifting it one step.

    Args:
        sequence: any sliceable sequence (e.g. a tensor of character ids)

    Returns:
        tuple: (sequence without its last element, sequence without its first element)
    '''

    # target - the same sequence moved one position forward
    shifted_by_one = sequence[1:]

    # input - everything except the final element
    all_but_last = sequence[:-1]

    return all_but_last, shifted_by_one

In [13]:
# turn every chunk into an (input, target) pair with split_input_target
dataset = sequences.map(split_input_target)

In [14]:
# Number of (input, target) pairs per training batch
BATCH_SIZE = 100

# Size of the buffer used to shuffle the dataset
BUFFER_SIZE = 10000

# shuffle the pairs, group them into fixed-size batches (an incomplete last batch
# is dropped) and prefetch; tf.data.AUTOTUNE replaces the deprecated
# tf.data.experimental.AUTOTUNE alias
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(100, 50), dtype=tf.int64, name=None), TensorSpec(shape=(100, 50), dtype=tf.int64, name=None))>

In [15]:
# write the shuffled, batched dataset to disk under ./data/character_dataset
dataset.save('./data/character_dataset')

### Create Model

In [17]:
from modules.models import CharModel

In [16]:
# define model parameters
# vocabulary size is taken from the lookup layer itself
# NOTE(review): presumably this also counts any special token the layer adds (e.g. OOV) - confirm
VOCAB_SIZE = len(ids_from_chars.get_vocabulary())
# values passed to CharModel as `embedding_dim` and `rnn_units`
EMBEDDING_DIM = 100
LSTM_UNITS = 200

In [None]:
# instantiate the character-level model imported from modules.models
# NOTE(review): CharModel's architecture and whether it compiles itself are not visible here - confirm
model = CharModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=LSTM_UNITS)

In [None]:
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):

#     model = tf.keras.Sequential([
#         tf.keras.layers.Embedding(vocab_size, embedding_dim,\
#              batch_input_shape=[batch_size, None]),
#         tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_units,\
#             return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform')),
#         tf.keras.layers.Dropout(),  # FIXME: Dropout needs a `rate` argument
#         tf.keras.layers.Dense(vocab_size)
#         ]
#     )

#     # compile model
#     loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
#     model.compile(optimizer='adam', loss=loss)

#     # print model summary
#     model.summary()

#     return model

In [None]:
# create instance of model
# `build_model` only exists as commented-out code in this notebook, so calling it
# unconditionally raises NameError on a fresh "Restart & Run All". Rebuild the model
# only when the function has actually been defined; otherwise the `model` created
# from CharModel is left untouched.
if 'build_model' in globals():
    model = build_model(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        rnn_units=LSTM_UNITS,
        batch_size=BATCH_SIZE,
    )

### Train the model

In [None]:
# number of epochs passed to model.fit
EPOCHS = 20

In [None]:
# train on the batched dataset for EPOCHS epochs and keep the object returned by fit
history = model.fit(dataset, epochs=EPOCHS)

In [None]:
import tensorflow as tf