# Work with lyrics 

## Make lyrics inverted index

In [None]:
import pickle

# LOAD DICTS
# Input files: the test and train splits of the lyrics dataset; both are
# parsed with read_lyrics() below.
lyrics_test_path = 'data/mxm_dataset_test.txt'
lyrics_train_path = 'data/mxm_dataset_train.txt'

# Output
# NOTE(review): nothing in the visible code writes to lyrics_all_words_path.
# lyrics_inverted_index_path is the same location as the default `path` of
# save_lyrics_inverted_index().
lyrics_all_words_path = 'data/lyrics_all_words.txt'
lyrics_inverted_index_path = 'data/lyrics_inverted_idx.pkl'


def read_lyrics(lyrics_path):
    '''
    Parse a bag-of-words lyrics file.

    Recognised line formats:
      - lines starting with '#': comments, skipped
      - a line starting with '%': the comma-separated vocabulary
      - lines starting with 'TR': one track, as
        "<track_id>,<ignored field>,<word_id>:<count>,<word_id>:<count>,..."
    Any other line is ignored.

    Returns a tuple (lyrics_dict, all_words) where lyrics_dict maps
    track_id -> {word_id (int): count (int)} and all_words is the vocabulary
    list (empty if the file has no '%' line).
    '''
    lyrics_dict = {}
    # Default so a file without a vocabulary line does not raise
    # UnboundLocalError at the return statement
    all_words = []
    # NOTE(review): the file is assumed to be UTF-8 encoded -- confirm
    with open(lyrics_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Drop the line terminator; otherwise the last vocabulary word
            # would be stored with a trailing newline
            line = raw_line.rstrip('\r\n')
            if line.startswith('#'):
                continue  # It is a comment
            if line.startswith('%'):
                # List of all words
                all_words = line[1:].split(',')
            elif line.startswith('TR'):
                fields = line.split(',')
                track_id = fields[0]
                # fields[1] is not used; the remaining fields are "id:count" pairs
                word_dict = {}
                for id_freq in fields[2:]:
                    word_id, freq = id_freq.split(':')
                    word_dict[int(word_id)] = int(freq)
                lyrics_dict[track_id] = word_dict
    return lyrics_dict, all_words

# Parse both dataset splits
train_lyrics_dict, all_words_train = read_lyrics(lyrics_train_path)
test_lyrics_dict, all_words_test = read_lyrics(lyrics_test_path)

# Both splits must share one vocabulary, otherwise word ids are not comparable
assert all_words_train == all_words_test
all_words = all_words_train

# Lookup tables between words and their 1-based ids
word_to_idx = {word: index for index, word in enumerate(all_words, start=1)}
index_to_word = dict(enumerate(all_words, start=1))

# Merge the two splits; on a duplicated track id the test entry wins
lyrics_dict_idx = dict(train_lyrics_dict)
lyrics_dict_idx.update(test_lyrics_dict)

# Same data, keyed by the word itself instead of its id
# (ids and counts are already ints as returned by read_lyrics)
lyrics_dict_word = {
    track_id: {index_to_word[word_id]: freq for word_id, freq in word_dict.items()}
    for track_id, word_dict in lyrics_dict_idx.items()
}

# Make inverted index
def make_inveresed_index(lyrics_dict, vocabulary=None, idx_to_word=None):
    '''
    Build an inverted index word -> {track_id: count}.

    lyrics_dict: mapping track_id -> {word_id: count}.
    vocabulary:  iterable of all words; every word gets an entry, even if no
                 track contains it. Defaults to the module-level `all_words`.
    idx_to_word: mapping word_id -> word. Defaults to the module-level
                 `index_to_word`.

    Raises KeyError if a word id is missing from idx_to_word, or maps to a
    word that is not part of the vocabulary.
    '''
    # Fall back to the notebook globals so existing one-argument calls keep working
    if vocabulary is None:
        vocabulary = all_words
    if idx_to_word is None:
        idx_to_word = index_to_word

    # Start with an empty {track_id: count} dict for each word in the vocabulary
    inverted_index = {word: {} for word in vocabulary}

    # Record, for every word of every track, how often the track uses it
    for track_id, word_dict in lyrics_dict.items():
        for word_id, count in word_dict.items():
            inverted_index[idx_to_word[word_id]][track_id] = count

    return inverted_index

# Build the word -> {track_id: count} index over the merged train + test lyrics
inverted_index = make_inveresed_index(lyrics_dict_idx)

def save_lyrics_inverted_index(lyrics_inverted_index, path='data/lyrics_inverted_idx.pkl'):
    '''Pickle `lyrics_inverted_index` into the file at `path` and print a confirmation.'''
    with open(path, mode='wb') as out_file:
        pickle.Pickler(out_file).dump(lyrics_inverted_index)
    print(f"lyrics_inverted_index saved as pickle at {path}")

# Save the inverted index to the output path configured at the top of this cell
# (instead of relying on the duplicated default literal in the function signature)
save_lyrics_inverted_index(inverted_index, lyrics_inverted_index_path)

In [2]:
# SHOW DATA
print('Number of words:', len(all_words))
print('Number of tracks:', len(lyrics_dict_idx))
print('First 5 words:', all_words[:5])

# Print only the first entry of each per-track dictionary
for title, per_track in (
    ('One example of the index dictionary', lyrics_dict_idx),
    ('One example of the word dictionary', lyrics_dict_word),
):
    print(title)
    for track_id, word_dict in per_track.items():
        print(f"\t{track_id}: {word_dict}")
        break

# Posting list of a single word, truncated to its first five tracks
print('One example of the inversed index')
print(f"love ({len(inverted_index['love'])} tracks):", dict(list(inverted_index['love'].items())[:5]))

Number of words: 5000
Number of tracks: 237662
First 5 words: ['i', 'the', 'you', 'to', 'and']
One example of the index dictionary
	TRAAAAV128F421A322: {1: 6, 2: 4, 3: 2, 4: 2, 5: 5, 6: 3, 7: 1, 8: 1, 11: 1, 12: 2, 13: 3, 14: 1, 15: 1, 18: 2, 19: 2, 20: 2, 21: 2, 23: 4, 25: 1, 26: 2, 28: 1, 30: 1, 36: 2, 42: 1, 45: 1, 54: 2, 56: 1, 57: 1, 68: 1, 99: 1, 192: 2, 249: 1, 264: 1, 356: 1, 389: 1, 561: 1, 639: 1, 656: 1, 687: 1, 761: 1, 773: 1, 804: 1, 869: 2, 914: 1, 1035: 1, 1156: 1, 1221: 1, 1287: 1, 1364: 1, 1407: 1, 1533: 2, 1857: 1, 2096: 1, 2117: 1, 2482: 2, 2548: 1, 2705: 1, 2723: 1, 2868: 2, 2992: 2, 3455: 1, 3717: 1, 3851: 1, 4322: 1, 4382: 1, 4613: 1, 4713: 1, 4906: 1}
One example of the word dictionary
	TRAAAAV128F421A322: {'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5, 'a': 3, 'me': 1, 'it': 1, 'my': 1, 'is': 2, 'of': 3, 'your': 1, 'that': 1, 'are': 2, 'we': 2, 'am': 2, 'will': 2, 'for': 4, 'be': 1, 'have': 2, 'so': 1, 'this': 1, 'like': 2, 'de': 1, 'up': 1, 'was': 2, 'if': 1, '

For word embeddings, we will use fastText pretrained models, which handle out-of-vocabulary (OOV) words by using subword information.

We will use the English fastText embeddings, downloaded from [here](https://fasttext.cc/docs/en/crawl-vectors.html).

In [3]:
# LOAD MODEL
# Info: https://fasttext.cc/docs/en/crawl-vectors.html
# !pip install fasttext
# fasttext.util.download_model('en', if_exists='ignore')  # English
# !pip install PyStemmer

# Load the pretrained English fastText model (a fastText binary, not a word2vec model)
import fasttext
import fasttext.util
# 'cc.en.300.bin' is resolved relative to the working directory; the
# commented-out download_model call above is one way to obtain it
ft = fasttext.load_model('cc.en.300.bin')
# Print the dimensionality of the embedding vectors
print(ft.get_dimension())

# Use same word preprocessing as match-api
import sys
sys.path.append('../match-api/')
from utils.text_processing import process_text, normalize, tokenize_text

300


In [None]:
# SAVE A DICTIONARY WITH SIMILAR WORDS FOR ALL 5000 WORDS IN LYRICS
import json
import os
from tqdm import tqdm

def _save_similarity_dict(similarity_dict, save_path):
    '''Write similarity_dict as JSON to save_path without ever leaving a half-written file.'''
    # Write to a sibling temp file first, then swap it in: an interruption
    # during the dump can no longer corrupt the checkpoint used for resuming
    tmp_path = f"{save_path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(similarity_dict, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, save_path)


def create_lyrics_similarity_dict(all_words, embeddings, save_path='lyrics_similarity_dict.json', batch_size=100,
                                  k=20, min_score=0.7):
    '''
    Create a dictionary with similar tokens to those in the lyrics.

    For every word in all_words, the k nearest neighbours are requested from
    `embeddings`; neighbours with score > min_score are lower-cased, passed
    through normalize() and kept only if they are themselves in all_words
    (the word itself is excluded).

    Progress is saved every batch_size words (batch_size=0 disables the
    intermediate saves). If interrupted, resumes from the saved file.

    all_words:  iterable of vocabulary words.
    embeddings: object exposing get_nearest_neighbors(word, k=...) that yields
                (score, word) pairs, e.g. a loaded fastText model.
    save_path:  JSON checkpoint / output file.
    k:          number of neighbours requested per word.
    min_score:  similarity threshold (exclusive).

    Returns the dictionary word -> sorted list of similar vocabulary tokens.
    '''
    # NOTE(review): normalize() comes from the project's text-processing module;
    # presumably it maps raw words to the token form used by the lyrics
    # vocabulary -- verify against utils.text_processing.

    # Load existing dictionary if the file exists
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            lyrics_similarity_dict = json.load(f)
        print(f"Loaded {len(lyrics_similarity_dict)} processed words from {save_path}")
    else:
        lyrics_similarity_dict = {}

    # Built once: used for membership tests on every iteration
    vocabulary = set(all_words)

    # Remaining words, de-duplicated and in vocabulary order (deterministic across runs)
    pending_words = [word for word in dict.fromkeys(all_words) if word not in lyrics_similarity_dict]

    # Iterate over remaining words with a progress bar
    progress = tqdm(pending_words, desc="Processing words",
                    initial=len(vocabulary) - len(pending_words), total=len(vocabulary))
    for i, word in enumerate(progress):
        # Get all neighbours with a similarity score above the threshold
        neighbours = embeddings.get_nearest_neighbors(word, k=k)
        similar_words = {neighbour.lower() for score, neighbour in neighbours if score > min_score}
        extra_tokens = (set(normalize(similar_words)) & vocabulary) - {word}

        # Sorted list: JSON-compatible and reproducible between runs
        lyrics_similarity_dict[word] = sorted(extra_tokens)

        # Save the dictionary to file every batch_size iterations
        if batch_size and (i + 1) % batch_size == 0:
            _save_similarity_dict(lyrics_similarity_dict, save_path)

    # Final save to ensure all data is written
    _save_similarity_dict(lyrics_similarity_dict, save_path)
    print(f"Final save complete. {len(lyrics_similarity_dict)} words processed.")

    return lyrics_similarity_dict

# Build (or resume building) the similarity dictionary for the whole vocabulary,
# checkpointing to data/lyrics_similarity_dict.json every 100 words
lyrics_similarity_dict = create_lyrics_similarity_dict(all_words, ft, save_path='data/lyrics_similarity_dict.json', batch_size=100)

# Function to save the lyrics_similarity_dict as a pickle file
def save_lyrics_similarity_dict(lyrics_similarity_dict, path='data/lyrics_similarity_dict.pkl'):
    '''Pickle `lyrics_similarity_dict` into the file at `path` and print a confirmation.'''
    with open(path, mode='wb') as out_file:
        pickle.Pickler(out_file).dump(lyrics_similarity_dict)
    print(f"lyrics_similarity_dict saved as pickle at {path}")

# Save the lyrics_similarity_dict as a pickle for faster loading.
# It is written to the function's default path, 'data/lyrics_similarity_dict.pkl',
# which is also the default path of load_lyrics_similarity_dict() further below.
save_lyrics_similarity_dict(lyrics_similarity_dict)

Processing words: 100%|██████████| 5000/5000 [39:56<00:00,  2.09it/s]

Final save complete. 5000 words processed.





## Standalone code for lyrics expansion

In [None]:
# STANDALONE CODE
import pickle
import fasttext
import fasttext.util

# Use same word preprocessing as match-api
import sys
sys.path.append('../match-api/')
from utils.text_processing import normalize, process_text

def load_lyrics_inverted_index(path='data/lyrics_inverted_idx.pkl'):
    '''
    Read the pickled lyrics inverted index stored at `path` and return it.

    Unpickling can execute arbitrary code, so only point this at files you trust.
    '''
    with open(path, mode='rb') as in_file:
        loaded_index = pickle.Unpickler(in_file).load()
    print(f"lyrics_inverted_index loaded from pickle at {path}")
    return loaded_index

def load_lyrics_similarity_dict(path='data/lyrics_similarity_dict.pkl'):
    '''
    Read the pickled lyrics similarity dictionary stored at `path` and return it.

    Unpickling can execute arbitrary code, so only point this at files you trust.
    '''
    with open(path, mode='rb') as in_file:
        loaded_dict = pickle.Unpickler(in_file).load()
    print(f"lyrics_similarity_dict loaded from pickle at {path}")
    return loaded_dict

def load_expansion_model_dicts(ft_path, lyrics_similarity_dict_path, lyrics_inverted_index_path):
    '''
    Load everything query expansion needs.

    Returns a tuple (fastText model, lyrics similarity dict, lyrics inverted
    index), loaded in that order from the three given paths.
    '''
    # The fastText model is loaded first, so its confirmation is printed first
    embeddings_model = fasttext.load_model(ft_path)
    print("fasttext model loaded")

    # Both dictionaries are pickles read by the dedicated loaders
    similarity_dict = load_lyrics_similarity_dict(lyrics_similarity_dict_path)
    inverted_idx = load_lyrics_inverted_index(lyrics_inverted_index_path)

    return embeddings_model, similarity_dict, inverted_idx

def expand_query(query, lyrics_similarity_dict: dict, embeddings, verbose=False, k=20, min_score=0.7):
    '''
    Given a query, expand it with similar tokens from the lyrics BOW.

    Query tokens that are keys of lyrics_similarity_dict are kept and extended
    with their precomputed similar tokens. For every other token, the k nearest
    neighbours are requested from `embeddings`; neighbours with
    score > min_score are lower-cased, passed through normalize() and added
    only if they are keys of lyrics_similarity_dict. The unseen tokens
    themselves are not part of the result.

    query:                  raw query string.
    lyrics_similarity_dict: mapping token -> iterable of similar tokens.
    embeddings:             object exposing get_nearest_neighbors(word, k=...)
                            yielding (score, word) pairs, e.g. a fastText model.
    verbose:                if True, print the original and the unseen tokens.
    k:                      number of neighbours requested per unseen token.
    min_score:              similarity threshold (exclusive).

    Returns the set of expanded tokens.
    '''

    # Convert query into a set of tokens
    # NOTE(review): process_text() is assumed to return a whitespace-separated
    # string of normalized (stemmed) tokens -- confirm in utils.text_processing
    tokens = set(process_text(query).split())

    # Split the tokens by whether they are known to the lyrics vocabulary;
    # membership is tested directly on the dict instead of copying its keys
    tokens_in_lyrics = {token for token in tokens if token in lyrics_similarity_dict}
    unseen_tokens = tokens - tokens_in_lyrics

    # Expand with seen tokens (work on a copy so tokens_in_lyrics stays untouched)
    expanded_tokens = set(tokens_in_lyrics)
    for token in tokens_in_lyrics:
        expanded_tokens.update(lyrics_similarity_dict[token])

    # Expand with unseen tokens (no-op when there are none)
    for token in unseen_tokens:
        # Get similar words for the unseen token
        neighbours = embeddings.get_nearest_neighbors(token, k=k)
        similar_words = {neighbour.lower() for score, neighbour in neighbours if score > min_score}

        # Normalize the words and keep only those present in the lyrics
        expanded_tokens.update(
            candidate for candidate in normalize(similar_words) if candidate in lyrics_similarity_dict
        )

    if verbose:
        print('Original tokens:', tokens)
        print('Tokens not in lyrics:', unseen_tokens)

    # Return the expanded query
    return expanded_tokens


# Load the fastText model plus the two pickled dictionaries
ft, lyrics_similarity_dict, lyrics_inverted_index = load_expansion_model_dicts(
    ft_path='cc.en.300.bin',
    lyrics_similarity_dict_path='data/lyrics_similarity_dict.pkl',
    lyrics_inverted_index_path='data/lyrics_inverted_idx.pkl',
)

# Example query mixing two made-up words with one real word
query = 'reew ewre love'
expanded_tokens = expand_query(query, lyrics_similarity_dict, ft, verbose=True)
# Bare expression so the notebook displays the result
expanded_tokens

fasttext model loaded
lyrics_similarity_dict loaded from pickle at data/lyrics_similarity_dict.pkl
lyrics_inverted_index loaded from pickle at data/lyrics_inverted_idx.pkl
Original tokens: {'reew', 'ewr', 'love'}
Tokens not in lyrics: {'reew', 'ewr'}


{'ador', 'love'}