In [None]:
import os
import numpy as np
import pandas as pd
import torch
import absl.logging
from ast import literal_eval
from nlp_embeddings_no_nlu import DistilBERT, SentenceTransformerMPNET
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
# Name of the dataset; used to locate the input CSV files and to name the embedding files
dataset_name = 'dataset2_proc'

# Maximum number of words kept from song lyrics; words beyond this limit are cut off
max_words = 400

# Device used for embedding: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def save_embedding(data_x, nlp_embedding, batch_size=5000, start_idx=0, dir_path='data', prefix='', dataset=None):
    '''
    Create embeddings of the lyrics batch by batch and append them to a CSV file.

    The output file is `<dir_path>/<prefix>_<nlp_embedding.name>_<dataset>.csv`. It is written
    without header and without index, each batch being appended as soon as it is embedded,
    so an interrupted run can be resumed with `start_idx`.

    Parameters:
        data_x (Series): Lyrics to embed.
        nlp_embedding (NLPEmbedding): Instance of the embedding method class. It has to expose
            a `name` attribute and an `embed_lyrics(batch)` method whose result can be wrapped
            in a `pd.DataFrame`.
        batch_size (int): Number of observations taken for a single embedding call. Must be positive.
        start_idx (int): Position (not index label) of the first observation to embed. With 0 an
            already existing output file is removed first; with a positive value new rows are
            appended to it, which allows restarting after an interruption. Must not be negative.
        dir_path (str): Path to the directory where embeddings should be saved. Created if missing.
        prefix (str): Prefix of the name of the file in which embeddings will be saved.
        dataset (str): Dataset name used in the file name. When None (default), the notebook-level
            variable `dataset_name` is used.
    Raises:
        ValueError: If `batch_size` is not positive or `start_idx` is negative.
    '''
    # A negative step would make the loop below run zero times and still report success.
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')
    if start_idx < 0:
        raise ValueError(f'start_idx must not be negative, got {start_idx}')

    if dataset is None:
        dataset = dataset_name

    fname = os.path.join(dir_path, f'{prefix}_{nlp_embedding.name}_{dataset}.csv')

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(dir_path, exist_ok=True)

    # A fresh run must not append to the results of a previous one.
    if start_idx == 0 and os.path.exists(fname):
        os.remove(fname)

    n_rows = data_x.shape[0]

    for i in range(start_idx, n_rows, batch_size):
        # The last batch may be shorter than batch_size.
        j = min(i + batch_size, n_rows)

        print(f'Processing rows: {i} - {j - 1}')

        # Slice by position: the index labels are not contiguous once rows have been dropped.
        batch = data_x.iloc[i:j] if hasattr(data_x, 'iloc') else data_x[i:j]
        embeddings = nlp_embedding.embed_lyrics(batch)
        pd.DataFrame(embeddings).to_csv(fname, mode='a', index=False, header=False)

    print('Success!')

In [None]:
def add_normalized_lyrics(data):
    '''
    Extend the data frame in place with a `normalized_lyrics` column.

    Every entry of the `tokens` column is parsed with `literal_eval` and the
    resulting tokens are joined with single spaces.
    Parameters:
        data (DataFrame): Data frame containing a `tokens` column with tokenized lyrics
            stored as Python literals. The frame is modified in place; nothing is returned.
    '''
    data['normalized_lyrics'] = [
        ' '.join(literal_eval(raw_tokens)) for raw_tokens in data['tokens']
    ]

In [None]:
# Load the train and test splits of the dataset selected by `dataset_name`.
# The CSV files are read from `data/train/` and `data/test/`, relative to the working directory.

train_data = pd.read_csv(f'data/train/{dataset_name}.csv')
test_data = pd.read_csv(f'data/test/{dataset_name}.csv')

In [None]:
# Build the `normalized_lyrics` column of both splits from their `tokens` column.
# NOTE(review): this runs before rows without lyrics are dropped, so it presumably relies on
# `tokens` being populated for every row - confirm against the dataset.

for split_frame in (train_data, test_data):
    add_normalized_lyrics(split_frame)
del split_frame

# Keep only the rows that actually have lyrics

train_data = train_data[train_data['lyrics'].notna()]
test_data = test_data[test_data['lyrics'].notna()]

In [None]:
# Create the output directories for the embeddings.
# `exist_ok=True` makes the cell safe to re-run and avoids the race between a separate
# existence check and the creation of the directory.

embedded_train_data_path = 'data/train/embeddings'
os.makedirs(embedded_train_data_path, exist_ok=True)

embedded_test_data_path = 'data/test/embeddings'
os.makedirs(embedded_test_data_path, exist_ok=True)

In [None]:
# File-name prefixes of the saved embeddings:
#   prefix            - embeddings of the original lyrics
#   prefix_normalized - embeddings of the normalized lyrics

prefix, prefix_normalized = 'embedded', 'embedded_norm'

## DistilBERT

In [None]:
# Instantiate the DistilBERT embedding method with the configured word limit and device.
# The instance is reused by the cells below, which create and save the embeddings.

emb_dbert = DistilBERT(max_words, device)

In [None]:
# DistilBERT embeddings of the original test lyrics
save_embedding(
    data_x=test_data['lyrics'],
    nlp_embedding=emb_dbert,
    dir_path=embedded_test_data_path,
    prefix=prefix,
)

In [None]:
# DistilBERT embeddings of the original train lyrics
save_embedding(
    data_x=train_data['lyrics'],
    nlp_embedding=emb_dbert,
    dir_path=embedded_train_data_path,
    prefix=prefix,
)

### Normalized data

In [None]:
# DistilBERT embeddings of the normalized test lyrics
save_embedding(
    data_x=test_data['normalized_lyrics'],
    nlp_embedding=emb_dbert,
    dir_path=embedded_test_data_path,
    prefix=prefix_normalized,
)

In [None]:
# DistilBERT embeddings of the normalized train lyrics
save_embedding(
    data_x=train_data['normalized_lyrics'],
    nlp_embedding=emb_dbert,
    dir_path=embedded_train_data_path,
    prefix=prefix_normalized,
)

## SentenceTransformerMPNET

In [None]:
# Instantiate the SentenceTransformerMPNET embedding method (constructed without arguments).
# The instance is reused by the cells below, which create and save the embeddings.

emb_mpnet = SentenceTransformerMPNET()

In [None]:
# SentenceTransformerMPNET embeddings of the original test lyrics
save_embedding(
    data_x=test_data['lyrics'],
    nlp_embedding=emb_mpnet,
    dir_path=embedded_test_data_path,
    prefix=prefix,
)

In [None]:
# SentenceTransformerMPNET embeddings of the original train lyrics
save_embedding(
    data_x=train_data['lyrics'],
    nlp_embedding=emb_mpnet,
    dir_path=embedded_train_data_path,
    prefix=prefix,
)

### Normalized data

In [None]:
# SentenceTransformerMPNET embeddings of the normalized test lyrics
save_embedding(
    data_x=test_data['normalized_lyrics'],
    nlp_embedding=emb_mpnet,
    dir_path=embedded_test_data_path,
    prefix=prefix_normalized,
)

In [None]:
# SentenceTransformerMPNET embeddings of the normalized train lyrics
save_embedding(
    data_x=train_data['normalized_lyrics'],
    nlp_embedding=emb_mpnet,
    dir_path=embedded_train_data_path,
    prefix=prefix_normalized,
)