In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information")

Downloading from https://www.kaggle.com/api/v1/datasets/download/carlosgdcj/genius-song-lyrics-with-language-information?dataset_version_number=1...


100%|██████████| 3.04G/3.04G [00:24<00:00, 133MB/s] 

Extracting files...





In [2]:
import pandas as pd

data = pd.read_csv(path+"/song_lyrics.csv", skiprows=lambda x: x % 20 != 0)

In [3]:
# Filter Data by English Songs only and make sure the content-type is a song.

filtered_lang_data = data[data['language'] == 'en']

filtered_data = filtered_lang_data[filtered_lang_data['tag'] != 'misc']

In [4]:
import re
import string
import numpy as np
from nltk.stem import SnowballStemmer


# ------------Exact match search and ranking-----------------#

stemmer = SnowballStemmer("english")

_PUNCT_TO_REMOVE = string.punctuation.replace("'", "")
_RE_BRACKET_TAGS = re.compile(r"\[.*?\]")


# ------------Removing tags from lyrics---------------------#
def preprocess_lyrics(lyric: str) -> str:
    if not isinstance(lyric, str):
        return ""

    text = lyric.lower()
    text = _RE_BRACKET_TAGS.sub("", text)
    text = text.translate(str.maketrans("", "", _PUNCT_TO_REMOVE))
    text = re.sub(r"\s+", " ", text).strip()
    return text

def add_processed_column(df, source_col: str = "lyrics"):
    if "processed" not in df.columns:
        df = df.copy()
        df["processed"] = df[source_col].apply(preprocess_lyrics)
    return df

#-----------------Using Stemmer for additional search----------------#
def stem_query(query):
    return " ".join(stemmer.stem(tok) for tok in query.split())


#------Simple Search for Exact matching with stemming as well---------#
def simple_search(query,
                  data,
                  top_k: int = 10):
    raw_pattern  = re.escape(preprocess_lyrics(query))
    stem_pattern = re.escape(stem_query(query))

    exact_hits = data[data['processed'].str.contains(raw_pattern,  regex=True)]
    stem_hits  = data[data['processed'].str.contains(stem_pattern, regex=True)]

    parts = [hits for hits in (exact_hits, stem_hits) if not hits.empty]
    if not parts: # no match at all
        return pd.DataFrame(columns=data.columns)

    merged = pd.concat(parts, ignore_index=True)

    # Normalize the popularity into a score
    if merged['views'].nunique() > 1:
        range = merged['views'].max() - merged['views'].min()
        merged['norm_views'] = (merged['views'] - merged['views'].min()) / range
    else:
        merged['norm_views'] = 1.0 # Full score

    merged = (merged
              .drop_duplicates('id')
              .sort_values('norm_views', ascending=False)
              .head(top_k))

    return merged.reset_index(drop=True)


Steps:

1. lowercase everything
2. remove punctuation
3. remove anything in brackets ([Verse 1], [Chorus], etc)
4. not sure if lemmatization/stemming is useful since we'd often want exact matches on lyrics

In [5]:
#------------------Add Processed Column------------------#
processed_data = add_processed_column(filtered_data)

In [6]:
from collections import Counter
import math


#---------------Adding Bigrams Search------------------#
def make_ngrams(tokens, n):
    ng = [tuple(tokens[i: i + n]) for i in range(0, len(tokens) - n + 1)]
    return ng

def ngram_dict(df, n):
    res = {}
    for index, row in df.iterrows():
        id_val = row['id']

        processed_text = row['processed']
        tokens = processed_text.split()

        ngrams = make_ngrams(tokens, n)

        res[id_val] = Counter(ngrams)
    return res

In [7]:
NGRAM_SIZE = 2

# id -> ngram count
id_nmap = ngram_dict(processed_data, NGRAM_SIZE)

# corpus ngram counts
corpus_counter = Counter()

for doc_ngrams in id_nmap.values():
    corpus_counter.update(doc_ngrams)

total_ngrams = sum(corpus_counter.values())
unique_ngrams = len(corpus_counter)

id_ncount = {doc_id: sum(doc_counter.values()) for doc_id, doc_counter in id_nmap.items()}

id_nunique = {doc_id: len(doc_counter) for doc_id, doc_counter in id_nmap.items()}

In [8]:
#-------------------BM25 BIGRAM SEARCH MODEL------------#

def get_top_matches(query, top_k=10):
    query = preprocess_lyrics(query)
    tokens = query.split()
    query_ngrams = make_ngrams(tokens, NGRAM_SIZE)
    V = unique_ngrams

    results = []

    for doc_id, doc_counter in id_nmap.items():
        total_ngrams_in_doc = id_ncount[doc_id]
        log_score = 0.0
        match_count = 0

        for ngram in query_ngrams:
            count = doc_counter.get(ngram, 0)
            if count > 0:
                match_count += 1
            prob = (count + 1) / (total_ngrams_in_doc + V)
            log_score += math.log(prob)

        if match_count > 0:
            results.append((doc_id, match_count, log_score))

    sorted_results = sorted(results, key=lambda x: (x[1], x[2]), reverse=True)

    return sorted_results[:top_k]

In [26]:
from google.colab import drive

drive.mount('/content/drive')
data = pd.read_csv('/content/drive/My Drive/data.csv')

# Fix malformed embeddings from CSV
data['embedded'] = data['embedded'].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=' ') if isinstance(x, str) else x
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  lambda x: np.fromstring(x.strip("[]"), sep=' ') if isinstance(x, str) else x


In [13]:
pip install -U FlagEmbedding

Collecting FlagEmbedding
  Using cached FlagEmbedding-1.3.4.tar.gz (163 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets>=2.19.0 (from FlagEmbedding)
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting ir-datasets (from FlagEmbedding)
  Using cached ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.19.0->FlagEmbedding)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.19.0->FlagEmbedding)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.19.0->FlagEmbedding)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=2.19.0->FlagEmbedding)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu

In [27]:
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel('BAAI/bge-m3',
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [31]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast

def find_most_similar(lyrics, top_k=10):
    # Embed the query and reshape to (1, 1024)
    lyric_embedded = model.encode(lyrics)['dense_vecs']
    lyric_embedded = np.array(lyric_embedded).reshape(1, -1)

    # Safe cosine similarity scoring
    similarities = []
    indices = []

    for idx, x in enumerate(data['embedded']):
        # Parse if necessary
        if isinstance(x, str):
            try:
                x = np.fromstring(x.strip("[]"), sep=' ')
            except Exception:
                continue

        x = np.array(x)
        if x.shape[0] != 1024:
            continue  # skip malformed

        sim = cosine_similarity(lyric_embedded, x.reshape(1, -1))[0][0]
        similarities.append(sim)
        indices.append(idx)

    if not similarities:
        return data.iloc[[]]  # return empty DataFrame if no valid matches

    sim_series = pd.Series(similarities, index=indices)
    top_indices = sim_series.nlargest(top_k).index

    return data.loc[top_indices].assign(score=sim_series.loc[top_indices])


In [32]:
from collections import defaultdict

"""
Now that we have three different searches.
We need to take each of their rankings for the top 10
and cross reference these in order to get the best results.
"""

def normalize_scores(results):
    if not results:
        return []
    scores = np.array([r["score"] for r in results])
    min_score, max_score = scores.min(), scores.max()
    norm_scores = (scores - min_score) / (max_score - min_score + 1e-12)
    for i, r in enumerate(results):
        r["score"] = norm_scores[i]
    return results


#------------------------Hybrid Search--------------------------#
def hybrid_ranked_results(query, data, top_k=10, alpha=0.33, beta=0.33, gamma=0.33):
    # Run all search models
    simple = simple_search(query, data, top_k=top_k)
    bm25 = get_top_matches(query, top_k=top_k)
    embed = find_most_similar(query)[:11]

    def wrap(results, model):
      if model == "simple":
          return [{"id": row["id"], "score": row["norm_views"], "source": model} for _, row in results.iterrows()]
      elif model == "bm25":
          return [{"id": r[0], "score": r[2], "source": model} for r in results if len(r) == 3]
      elif model == "embed":
          return [{"id": row["id"], "score": row["score"], "source": model} for _, row in results.iterrows()]


    all_results = (
        normalize_scores(wrap(simple, "simple"))
        + normalize_scores(wrap(pd.DataFrame(bm25, columns=["id", "count", "score"]), "bm25"))
        + normalize_scores(wrap(embed, "embed"))
    )

    combined_scores = defaultdict(float)

    for r in all_results:
        if r["source"] == "simple":
            combined_scores[r["id"]] += alpha * r["score"]
        elif r["source"] == "bm25":
            combined_scores[r["id"]] += beta * r["score"]
        elif r["source"] == "embed":
            combined_scores[r["id"]] += gamma * r["score"]

    sorted_ids = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    top_ids = [doc_id for doc_id, _ in sorted_ids[:top_k]]

    return data[data["id"].isin(top_ids)]




In [38]:
hybrid_ranked_results("love", data, alpha=0.50, beta=0.35, gamma=0.15)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,processed,embedded
2754,Over My Dead Body,rap,Drake,2011,2510453,{},"[Intro: Chantal Kreviazuk]\nHow I'm feeling, i...",58661,en,en,en,intro chantal kreviazuk\nhow im feeling it doe...,"[0.00283, -0.003355, -0.06824]"
8317,Part II On the Run,rap,JAY-Z,2013,2421972,{Beyoncé},[Intro: Beyoncé]\nWho wants that perfect love ...,180511,en,en,en,intro beyoncé\nwho wants that perfect love sto...,"[-0.003658, 0.05222, -0.04535]"
84685,​i​ hate u i love u,pop,Garrett Nash,2016,6154593,"{""Olivia O\\'Brien""}","[Verse 1: Olivia O'Brien]\nFeeling used, but I...",2263909,en,en,en,verse 1 olivia obrien\nfeeling used but im\nst...,"[0.00801, 0.05405, -0.0303]"
85857,Burn,rap,Burn Lyrics - Phillipa Soo,2015,2272737,"{""Phillipa Soo""}",[ELIZA]\nI saved every letter you wrote me\nFr...,2315135,en,en,en,eliza\ni saved every letter you wrote me\nfrom...,"[-0.00475, 0.0767, -0.002022]"
86727,Star Shopping,rap,Lil Peep,2015,4177837,{},[Verse]\nWait right here\nI'll be back in the ...,2353006,en,en,en,verse\nwait right here\nill be back in the mor...,"[-0.003742, 0.04068, -0.02107]"
97747,Perfect,pop,Ed Sheeran,2017,6201020,{},"[Verse 1]\nI found a love for me\nOh, darling,...",2953761,en,en,en,verse 1\ni found a love for me\noh darling jus...,"[0.01115, 0.04593, -0.03952]"
107198,Everybody Dies in Their Nightmares,rap,XXXTENTACION,2017,2844291,{},"[Intro: Shiloh Dynasty]\nOoh, ahh, ahh, ahh\nD...",3206881,en,en,en,intro shiloh dynasty\nooh ahh ahh ahh\ndont go...,"[-0.0004396, -0.01715, -0.006165]"
107465,Gucci Gang,rap,Lil Pump,2017,4579502,{},"[Intro]\nYuh, ooh\nBrr, brr\nGucci gang, ooh\n...",3214267,en,en,en,intro\nyuh ooh\nbrr brr\ngucci gang ooh\nthats...,"[-0.03766, 0.02202, -0.04028]"
130669,Venom,rap,Eminem,2018,3146013,{},[Intro]\nI got a song filled with shit for the...,3930594,en,en,en,intro\ni got a song filled with shit for the s...,"[0.04584, 0.03168, -0.0206]"
131651,Killshot,rap,Eminem,2018,6893540,{},"[Intro]\nYou sound like a bitch, bitch\nShut t...",3958196,en,en,en,intro\nyou sound like a bitch bitch\nshut the ...,"[-0.0106, 0.0231, -0.0415]"


In [None]:
#-----------------Evaluation-----------------------#

# MAP@10
def average_precision(retrieved_docs, relevant_docs):
    score = 0.0
    hits = 0
    for i, doc_id in enumerate(retrieved_docs, 1):
        if doc_id in relevant_docs:
            hits += 1
            score += hits / i
    return score / max(1, len(relevant_docs))

def mean_average_precision(queries, relevance_dict, retriever_fn, top_k=10):
    total_ap = 0.0
    for query in queries:
        retrieved_docs = retriever_fn(query)["id"].tolist()
        relevant_docs = relevance_dict.get(query, set())
        total_ap += average_precision(retrieved_docs[:top_k], relevant_docs)
    return total_ap / len(queries)


# Recall@10
def recall_at_k(retrieved_docs, relevant_docs, k=10):
    return len(set(retrieved_docs[:k]) & relevant_docs) / len(relevant_docs)

def mean_recall(queries, relevance_dict, retriever_fn, top_k=10):
    total_recall = 0.0
    for query in queries:
        retrieved_docs = retriever_fn(query)["id"].tolist()
        relevant_docs = relevance_dict.get(query, set())
        total_recall += recall_at_k(retrieved_docs, relevant_docs, top_k)
    return total_recall / len(queries)


# TODO: Create a dataset for evaluation. Perhaps using manual annotation on a small set of queries and seeing how it compares.