In [None]:
import os
import numpy as np
import pandas as pd
import absl.logging
from ast import literal_eval
import torch
from nlp_embeddings_no_nlu import DistilBERT, SentenceTransformerMPNET

# Only surface absl log messages at ERROR severity or above.
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
# Dataset identifier: selects data/train/<dataset_name>.csv and data/test/<dataset_name>.csv
# when loading, and is embedded in the output file name built by save_embedding.
dataset_name = 'sample'

In [None]:
def save_embedding(data_x, nlp_embedding, batch_size=5000, start_idx=0, dir_name='', pred_dir='data', dataset=None):
    """Embed `data_x` batch by batch and append the results to a CSV file.

    The output path is
    ``<pred_dir>/<dir_name>/embedded_<nlp_embedding.name>_<dataset>.csv``.
    Rows are written without header or index, one batch per append.

    Parameters
    ----------
    data_x : object exposing ``.shape[0]`` and integer slicing (e.g. a pandas
        Series or numpy array) holding the texts to embed.
    nlp_embedding : object with a ``name`` attribute and an
        ``embed_lyrics(batch)`` method whose result can be wrapped in a
        ``pd.DataFrame``.
    batch_size : int
        Number of rows embedded per call; must be positive.
    start_idx : int
        Row to start from. With ``0`` any existing output file is deleted
        first; with a non-zero value new rows are appended to the existing
        file, which allows resuming an interrupted run.
    dir_name : str
        Sub-directory of `pred_dir` that receives the file.
    pred_dir : str
        Root output directory.
    dataset : str or None
        Dataset label used in the file name. When ``None`` the notebook-level
        ``dataset_name`` variable is used, preserving the previous behaviour.

    Raises
    ------
    ValueError
        If `batch_size` is not positive.
    """
    # A negative step would make the loop silently do nothing and still report
    # success, so reject it up front (zero already failed inside range()).
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    # Only fall back to the notebook global when no explicit label is supplied.
    if dataset is None:
        dataset = dataset_name

    dir_path = os.path.join(pred_dir, dir_name)
    fname = os.path.join(dir_path, f'embedded_{nlp_embedding.name}_{dataset}.csv')

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_path, exist_ok=True)

    # A fresh run must not append to the output of a previous one.
    if start_idx == 0 and os.path.exists(fname):
        os.remove(fname)

    n_rows = data_x.shape[0]
    for i in range(start_idx, n_rows, batch_size):
        # Clamp the last batch to the end of the data.
        j = min(i + batch_size, n_rows)

        print(f'Processing rows: {i} - {j - 1}')

        embeddings = nlp_embedding.embed_lyrics(data_x[i:j])
        pd.DataFrame(embeddings).to_csv(fname, mode='a', index=False, header=False)

    print('Success!')

In [None]:
def add_normalized_lyrics(data):
    """Attach a `normalized_lyrics` column derived from the `tokens` column.

    Every `tokens` entry is parsed with `ast.literal_eval` and its items are
    joined with single spaces. The frame is modified in place and nothing is
    returned.
    """
    parsed_tokens = data['tokens'].apply(literal_eval)
    data['normalized_lyrics'] = list(map(' '.join, parsed_tokens))

In [None]:
# Read the train and test splits of the dataset selected by `dataset_name`.
train_csv = f'data/train/{dataset_name}.csv'
test_csv = f'data/test/{dataset_name}.csv'
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [None]:
# Add the space-joined `normalized_lyrics` column to both splits (the frames are mutated in place).
for split_frame in (train_data, test_data):
    add_normalized_lyrics(split_frame)

In [None]:
# NOTE(review): `Bert` is not imported in any visible cell and `max_words` is first assigned in a
# later cell, so this cell appears to fail on a fresh kernel — TODO confirm, then fix or remove.
emb_bert = Bert(max_words)

In [None]:
# Embed the test split with emb_bert in batches of 1000 rows; the CSV is written under data/test.
save_embedding(test_data['normalized_lyrics'], emb_bert, batch_size=1000, dir_name='test')

In [None]:
# Embed the train split with emb_bert in batches of 1000 rows; the CSV is written under data/train.
save_embedding(train_data['normalized_lyrics'], emb_bert, batch_size=1000, dir_name='train')

In [None]:
# NOTE(review): `GloVe` is not imported in any visible cell and `max_words` is first assigned in a
# later cell, so this cell appears to fail on a fresh kernel — TODO confirm, then fix or remove.
emb_glove = GloVe(max_words)

In [None]:
# Embed the test split with emb_glove using the default batch size; the CSV is written under data/test.
save_embedding(test_data['normalized_lyrics'], emb_glove, dir_name='test')

In [None]:
# Embed the train split with emb_glove using the default batch size; the CSV is written under data/train.
save_embedding(train_data['normalized_lyrics'], emb_glove, dir_name='train')

In [None]:
# Run configuration for the embedding cells.
max_words = 400  # passed to the embedding constructors (e.g. DistilBERT(max_words))
# NOTE(review): reassigning dataset_name here does not reload train_data/test_data, which were read
# with the earlier value; only the output file name built in save_embedding changes — confirm intent.
dataset_name = 'small_balanced'
batch_size = 16  # NOTE(review): not referenced by any visible cell — confirm whether it is still needed

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Drop rows whose `lyrics` value is missing from both splits.
has_train_lyrics = train_data['lyrics'].notna()
has_test_lyrics = test_data['lyrics'].notna()
train_data = train_data.loc[has_train_lyrics]
test_data = test_data.loc[has_test_lyrics]

## DistilBERT

In [None]:
# Instantiate DistilBERT (imported from nlp_embeddings_no_nlu) with the configured max_words.
model = DistilBERT(max_words)

In [None]:
# Embed the test split with the DistilBERT `model` created in the previous cell (not the stale
# `emb_glove` object) and write the CSV under data/test/embeddings.
save_embedding(test_data.normalized_lyrics, model, pred_dir=os.path.join('data', 'test', 'embeddings'))

In [None]:
# Embed the train split with the DistilBERT `model` created above (not the stale `emb_glove` object).
# The target directory is passed as `pred_dir`, matching the test cell; passing it as `dir_name`
# would join it onto the default pred_dir='data' and write to data/data/train/embeddings.
save_embedding(train_data.normalized_lyrics, model, pred_dir=os.path.join('data', 'train', 'embeddings'))

## SentenceTransformerMPNET

In [None]:
# Instantiate SentenceTransformerMPNET (imported from nlp_embeddings_no_nlu); rebinding `model`
# replaces the DistilBERT instance created earlier.
model = SentenceTransformerMPNET()

In [None]:
# Embed the test split with the SentenceTransformerMPNET `model` created in the previous cell
# (not the stale `emb_glove` object) and write the CSV under data/test/embeddings.
save_embedding(test_data.normalized_lyrics, model, pred_dir=os.path.join('data', 'test', 'embeddings'))

In [None]:
# Embed the train split with the SentenceTransformerMPNET `model` created above (not the stale
# `emb_glove` object). The target directory is passed as `pred_dir`, matching the test cell;
# passing it as `dir_name` would join it onto the default pred_dir='data' and write to
# data/data/train/embeddings.
save_embedding(train_data.normalized_lyrics, model, pred_dir=os.path.join('data', 'train', 'embeddings'))