In [1]:
import numpy as np
from keras import callbacks
from keras.callbacks import History
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils
import string
from typing import List, Tuple
import re

In [2]:
import pandas as pd
from pandas import DataFrame


def load_lyrics(csv_path: str) -> DataFrame:
    df = pd.read_csv(csv_path, sep='\n', header=None)
    res = df.iloc[:, 0].str.rstrip(r'&, ').str.extract(r'([^,]+),([^,]+),(.+)')
    res.columns = ['artist', 'title', 'lyrics']
    return res


In [3]:
def create_x_y_train(songs, tokenize, total_words) -> Tuple[np.ndarray, np.ndarray]:
    input_sequences = []
    for line in songs:
        token_list = tokenize.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    print(input_sequences[:10])
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    x_train = input_sequences[:, :-1]
    y_train = input_sequences[:, -1]
    y_train = utils.to_categorical(y_train, num_classes=total_words)
    return x_train, y_train


def load_songs(path) -> List[str]:
    df = load_lyrics(path)
    songs = []
    for song in list(df['lyrics']):
        modified_song = re.sub(r"\([^()]*\)", "", song)
        # modified_song = re.sub(r"\[[^()]*\]", "", modified_song)
        # for ch in ["[", "]", "chorus"]:
        #     modified_song = modified_song.replace(ch, "")
        modified_song = modified_song.replace("chorus", "").replace("&", "silencio").lower()
        # modified_song = modified_song.replace("&", "silencio").lower()
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        modified_song = regex.sub('', modified_song)
        songs.append(modified_song)

In [4]:
class LyricsGenerator(object):

    def __init__(self, embedding_dim: int, vocab_size: int, input_size: int, embedding_matrix: np.ndarray):
        embedding_layer = Embedding(
            vocab_size,
            embedding_dim,
            input_length=input_size,
            weights=[embedding_matrix],
            # embeddings_initializer=initializers.Constant(embedding_matrix),
            trainable=False,
        )

        self.model = Sequential()
        self.model.add(embedding_layer)
        self.model.add(LSTM(units=embedding_dim))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(units=vocab_size, activation='softmax'))
        self.model.compile(optimizer='adam', loss='categorical_crossentropy')
        self.model.summary()

    def fit(self, x, y, hyper_parameters):
        callback = callbacks.EarlyStopping(monitor='loss', patience=3)
        callback2 = callbacks.LearningRateScheduler(self._lr_scheduler)
        callback3 = callbacks.ModelCheckpoint('model.h5', save_best_only=True, monitor='val_loss', mode='min')
        # callback = callbacks.EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1)
        history = self.model.fit(x, y, epochs=100, verbose=1)
        # history = self.model.fit(x, y, batch_size=hyper_parameters['batch_size'], epochs=hyper_parameters['epochs'],
        #                          callbacks=[callback, callback2, callback3],
        #                          verbose=2, validation_split=hyper_parameters['validation_split'],
        #                          validation_data=None)
        
        return history

    def _lr_scheduler(self, epoch, lr):
        return 0.99 * lr

    # to get a picture of loss progress.
    def plot_metric(self, history: History, metric: str = 'loss') -> None:
        import matplotlib.pyplot as plt
        train_metrics = history.history[metric]
        val_metrics = history.history['val_'+metric]
        epochs = range(1, len(train_metrics) + 1)
        plt.plot(epochs, train_metrics)
        plt.plot(epochs, val_metrics)
        plt.title('Training and validation '+ metric)
        plt.xlabel("Epochs")
        plt.ylabel(metric)
        plt.legend(["train_"+metric, 'val_'+metric])
        plt.show()

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
embedding_matrixs =  np.load("/content/drive/MyDrive/DL/Assignment_3/Lyrics/embedding_matrix.npy")

In [7]:
df = pd.read_csv('/content/drive/MyDrive/DL/Assignment_3/Lyrics/lyrics_train_set.csv', sep='\n', header=None)
res = df.iloc[:, 0].str.rstrip(r'&, ').str.extract(r'([^,]+),([^,]+),(.+)')
print(len(res))
res.columns = ['artist', 'title', 'lyrics']
print(len(res['lyrics']))

songs = []
for song in list(res['lyrics']):
    modified_song = re.sub(r"\([^()]*\)", "", song)
    modified_song = modified_song.replace("chorus", "").replace("&", "silencio").lower()
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    modified_song = regex.sub('', modified_song)
    songs.append(modified_song)

len(songs)

600
600


600

In [8]:
# Data preprocessing
tokenize = Tokenizer()
tokenize.fit_on_texts(songs)
total_words = len(tokenize.word_index) + 1

embedding_dim = 300
x_train, y_train = create_x_y_train(songs, tokenize, total_words)

[[221, 2451], [221, 2451, 2452], [221, 2451, 2452, 1], [221, 2451, 2452, 1, 314], [221, 2451, 2452, 1, 314, 4], [221, 2451, 2452, 1, 314, 4, 47], [221, 2451, 2452, 1, 314, 4, 47, 212], [221, 2451, 2452, 1, 314, 4, 47, 212, 2], [221, 2451, 2452, 1, 314, 4, 47, 212, 2, 67], [221, 2451, 2452, 1, 314, 4, 47, 212, 2, 67, 19]]


In [9]:
# Train model

parameters = {
    'batch_size' : 8 ,
    'validation_split' : 0.2 ,
    'epochs' : 5 ,
    'val_data' : None
}
lyrics_generator = LyricsGenerator(embedding_dim, total_words, x_train.shape[1], embedding_matrixs)
h = lyrics_generator.fit(x_train, y_train, parameters)
lyrics_generator.plot_metric(h)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1479, 300)         2262000   
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 7540)              2269540   
Total params: 5,252,740
Trainable params: 2,990,740
Non-trainable params: 2,262,000
_________________________________________________________________
Epoch 1/100

KeyboardInterrupt: ignored