# Processing lyrics 
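Create lyric embeddings for every `.txt` file in `processed_lyrics` using two models (mean-pooled `roberta-base` and `all-MiniLM-L6-v2`), and save each result as a Parquet file.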

In [None]:
# NOTE(review): `!` runs the command in a temporary subshell, so a `cd` issued here cannot
# change the kernel's working directory, and no target path is given. If a directory change
# was intended, `%cd <path>` is the persistent form -- confirm the intent or drop this cell.
!cd 

In [17]:
import os

import nltk
import numpy as np
import pandas as pd
import torch
import tqdm
import tqdm.notebook
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import RobertaTokenizer, RobertaModel

In [2]:
# Ask PyTorch to release any cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [25]:
# Setting up directories.
# Root of the project checkout; the lyrics directory below is built from it.
# The location can be overridden with the CS224_PROJECT_HOME environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
project_home_dir = os.environ.get(
    'CS224_PROJECT_HOME',
    'D:/Projects/cs224-multimodal-recommender-system',
)

In [4]:
# One-time setup: uncomment to extract processed_lyrics.tar.gz into datasets/m4a-onion-kaggle/.
# !tar -xvzf {os.path.join(project_home_dir, 'datasets/m4a-onion-kaggle/processed_lyrics.tar.gz')} -C {os.path.join(project_home_dir, 'datasets/m4a-onion-kaggle/')}

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pre-trained `roberta-base` tokenizer and encoder; the encoder is moved onto `device`.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model = model.to(device)

# Directory holding the lyrics files that the cells below read.
lyrics_dir = os.path.join(project_home_dir, 'datasets/m4a-onion-kaggle/processed_lyrics/')

# Function to generate embeddings using mean pooling
def generate_roberta_embedding(text):
    """Embed a text as the mean of RoBERTa's last hidden state.

    The text is tokenized (truncated to at most 512 tokens), passed through the
    global `model` on the global `device` without gradient tracking, and the
    last hidden state is averaged over dim 1.

    Args:
        text: The text to embed. Anything that is not a `str` is rejected.

    Returns:
        A NumPy array holding the pooled embedding, or None when `text` is
        not a string.
    """
    # Guard clause: non-string input produces no embedding.
    if not isinstance(text, str):
        return None

    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
        padding=True,
    ).to(device)

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded)

    # Mean over dim 1, then drop the remaining size-1 dimensions.
    pooled = model_output.last_hidden_state.mean(dim=1).squeeze()
    return pooled.cpu().numpy()
# Embed every `.txt` lyrics file with RoBERTa and collect one record per track.
# Files are visited in sorted order so the resulting row order is reproducible
# (os.listdir returns entries in arbitrary order).
data = []
for filename in tqdm.tqdm(sorted(os.listdir(lyrics_dir))):
    # Split off only the extension: str.replace(".txt", "") would also delete any
    # ".txt" that happens to occur earlier in the file name.
    track_id, extension = os.path.splitext(filename)
    if extension != ".txt":
        continue
    # Read the lyrics from the file.
    with open(os.path.join(lyrics_dir, filename), 'r', encoding='utf-8') as file:
        lyrics = file.read()
    # Store the track ID together with its embedding.
    data.append({
        'track_id': track_id,
        'lyrics_embedding': generate_roberta_embedding(lyrics),
    })

# One row per track. The columns are named explicitly so the frame keeps its schema
# even when no lyrics files were found.
roberta_embedding_df = pd.DataFrame(data, columns=['track_id', 'lyrics_embedding'])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 109269/109269 [12:11<00:00, 149.32it/s]


In [15]:
# Persist the RoBERTa lyric embeddings under <project_home_dir>/processed_data/m4a-onion-kaggle/.
# The path is derived from `project_home_dir` instead of repeating the absolute path, and the
# folder is created on demand so the write does not fail when it does not exist yet.
roberta_output_dir = os.path.join(project_home_dir, 'processed_data', 'm4a-onion-kaggle')
os.makedirs(roberta_output_dir, exist_ok=True)
roberta_embedding_df.to_parquet(os.path.join(roberta_output_dir, 'roberta_embedding_lyrics.parquet'))

In [19]:

# Sentence-Transformers encoder ('all-MiniLM-L6-v2'), placed on the GPU when one is available.
# NOTE(review): this rebinds the global `model`, which generate_roberta_embedding also reads;
# re-run the RoBERTa setup cell before calling that function again.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

# Function to generate embeddings for the entire lyrics
def generate_sentence_transformer_embedding(text):
    """Encode `text` in a single call to the global Sentence-Transformers `model`.

    Args:
        text: Input passed unchanged to `model.encode`.

    Returns:
        The encoding, moved to the CPU and converted to a NumPy array.
    """
    encoded = model.encode(text, convert_to_tensor=True, device=device)
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

# Embed every `.txt` lyrics file with the Sentence-Transformers model, one record per track.
# `tqdm.notebook` is a submodule that a bare `import tqdm` does not load; it is imported
# explicitly in the notebook's import cell so this lookup does not depend on another
# library having loaded it as a side effect.
# Files are visited in sorted order so the resulting row order is reproducible
# (os.listdir returns entries in arbitrary order).
data = []
for filename in tqdm.notebook.tqdm(sorted(os.listdir(lyrics_dir))):
    # Split off only the extension: str.replace(".txt", "") would also delete any
    # ".txt" that happens to occur earlier in the file name.
    track_id, extension = os.path.splitext(filename)
    if extension != ".txt":
        continue
    # Read the lyrics from the file.
    with open(os.path.join(lyrics_dir, filename), 'r', encoding='utf-8') as file:
        lyrics = file.read()
    # Store the track ID together with its embedding.
    data.append({
        'track_id': track_id,
        'lyrics_embedding': generate_sentence_transformer_embedding(lyrics),
    })

# One row per track. The columns are named explicitly so the frame keeps its schema
# even when no lyrics files were found.
minilm_embedding_df = pd.DataFrame(data, columns=['track_id', 'lyrics_embedding'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm.tqdm_notebook(os.listdir(lyrics_dir)):


  0%|          | 0/109269 [00:00<?, ?it/s]

In [20]:
# Ask PyTorch to release cached GPU memory it is no longer using after the embedding run.
torch.cuda.empty_cache()

In [22]:
# Persist the MiniLM lyric embeddings under <project_home_dir>/processed_data/m4a-onion-kaggle/.
# The path is derived from `project_home_dir` instead of repeating the absolute path, and the
# folder is created on demand so the write does not fail when it does not exist yet.
minilm_output_dir = os.path.join(project_home_dir, 'processed_data', 'm4a-onion-kaggle')
os.makedirs(minilm_output_dir, exist_ok=True)
minilm_embedding_df.to_parquet(os.path.join(minilm_output_dir, 'minilm_embedding_lyrics.parquet'))