<a href="https://colab.research.google.com/github/danoonan2021/IS4200_final_project/blob/main/word_stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install nltk
!pip install kagglehub



In [13]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\andre\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1


In [21]:
import pandas as pd

data = pd.read_csv(path+"/song_lyrics.csv", skiprows=lambda x: x % 5 != 0)

In [22]:
# Only english songs in the data
filtered_data = data[data['language'] == 'en']

In [23]:
data.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en
1,Lord You Know,rap,Cam'ron,2004,11882,"{""Cam\\'ron"",""Juelz Santana"",Jaheim}","[Chorus: Jaheim]\nNow Lord you know, just how ...",11,en,en,en
2,98 Freestyle,rap,Big L,2000,297788,{},"[Verse 1]\nYo, fuck all the glamours and glitz...",16,en,en,en
3,More Gangsta Music,rap,Cam'ron,2004,20419,"{""Cam\\'ron"",""Juelz Santana""}","[Intro: Juelz Santana]\nGangsta Music, part tw...",124,en,en,en
4,Love Is Love,rap,AZ,1998,8430,"{Nature,""Half A Mill""}","[Interlude: AZ]\nAZ Ha, ha, ha, new drink, Bai...",8715,en,en,en


In [24]:
import re
import string
import numpy as np
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

_PUNCT_TO_REMOVE = string.punctuation.replace("'", "")
_RE_BRACKET_TAGS = re.compile(r"\[.*?\]")

def preprocess_lyrics(lyric: str) -> str:
    if not isinstance(lyric, str):
        return ""

    text = lyric.lower()
    text = _RE_BRACKET_TAGS.sub("", text)
    text = text.translate(str.maketrans("", "", _PUNCT_TO_REMOVE))
    text = re.sub(r"\s+", " ", text).strip()
    return text

def add_processed_column(df, source_col: str = "lyrics"):
    if "processed" not in df.columns:
        df = df.copy()
        df["processed"] = df[source_col].apply(preprocess_lyrics)
    return df

def stem_query(query):
    return " ".join(stemmer.stem(tok) for tok in query.split())

def simple_search(query,
                  data,
                  top_k: int = 100):
    raw_pattern  = re.escape(preprocess_lyrics(query))
    stem_pattern = re.escape(stem_query(query))

    exact_hits = data[data['processed'].str.contains(raw_pattern,  regex=True)]
    stem_hits  = data[data['processed'].str.contains(stem_pattern, regex=True)]

    parts = [hits for hits in (exact_hits, stem_hits) if not hits.empty]
    if not parts:                       # no match at all
        return pd.DataFrame(columns=data.columns)

    merged = pd.concat(parts, ignore_index=True)

    # Normalize the popularity into a score
    if merged['views'].nunique() > 1:
        range = merged['views'].max() - merged['views'].min()
        merged['norm_views'] = (merged['views'] - merged['views'].min()) / range
    else:
        merged['norm_views'] = 1.0 # Full score

    merged = (merged
              .drop_duplicates('id')
              .sort_values('norm_views', ascending=False)
              .head(top_k))

    return merged.reset_index(drop=True)


In [25]:
processed_data = add_processed_column(filtered_data)

In [28]:
# Test query
simple_search("take your time", processed_data)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,processed,norm_views
0,Kiss Land,rb,The Weeknd,2013,1190219,{},"[Part I]\n\n[Verse 1]\nWhen I got on stage, sh...",152385,en,en,en,when i got on stage she swore i was six feet t...,1.000000
1,Congratulations,rap,Mac Miller,2016,410290,{Bilal},"[Intro: Kilo Kish, Paige Montgomery, Ariana Gr...",2843734,en,en,en,where are you ohoh hehehe oh the divine femini...,0.344718
2,Father and Son,rock,Cat Stevens,1970,353793,{},[Verse 1: Father]\nIt's not time to make a cha...,342826,en,en,en,it's not time to make a change just relax take...,0.297250
3,Teenage Fantasy,rb,Jorja Smith,2017,249431,{},[Verse 1]\nYou weren't the boy I thought I kne...,3113293,en,en,en,you weren't the boy i thought i knew maybe i w...,0.209567
4,Heart Attack,rap,Dave,2021,189466,{},[Intro]\nKnife crimes is at a near-record high...,7006575,en,en,en,knife crimes is at a nearrecord high with more...,0.159185
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Sweet Premium Wine,rap,KMD,1998,8944,{},"[Intro]\n""Forget it, here let's have a drink""\...",26531,en,en,en,forget it here let's have a drink you're drink...,0.007514
96,With You,pop,Marin Hoxha & Chris Linton,2018,8901,{},Verse 1 [Chris Linton]\n\nWhen I'm with you\nI...,3834096,en,en,en,verse 1 when i'm with you i am able to keep pu...,0.007478
97,Life Goes On,rock,The Damned,1982,8813,{},[Verse 1]\nLife goes on and on and on\nIf you ...,1384847,en,en,en,life goes on and on and on if you think it's a...,0.007404
98,Alexander Hamilton ErinEl,rap,ErinEl,2016,8813,{},(Lights up on Aaron Burr & the company.)\n\n[A...,2879951,en,en,en,lights up on aaron burr the company how does a...,0.007404


In [97]:
print(processed_data.loc[processed_data['id'] == 152385, 'processed'].values[0])

print(preprocess_lyrics("nothing is going to change my love for you"))

when i got on stage she swore i was six feet tall but when she put it in her mouth she can't seem to reach my ballin' ain't an issue for me i'll make a hundred stacks right back next week do it all again i'm faded off the wrong thing the wrong thing and i'll admit baby i'm a little camerashy but exceptions can be made baby ‘cause you're too damn fly for what it's worth i hope you enjoy the show 'cause if you're back here only takin' pictures you gon' have to take your ass home 'cause the only thing you're takin' is your clothes off go 'head girl strip it down close your mouth i just wanna hear your body talk nothing is going to change my love for you nothing is going to change my love for you oh girl don’t hold back let it out nothing is going to change my love for you nothing is going to change my love for you oh girl don’t hold back let it out nothing is going to change my love for you nothing is going to change my love for you don’t hold back let it out nothing is going to change my

In [46]:
from collections import Counter
import math

def make_ngrams(tokens, n):
    ng = [tuple(tokens[i: i + n]) for i in range(0, len(tokens) - n + 1)]
    return ng

def ngram_dict(df, n):
    res = {}
    for index, row in df.iterrows():
        id_val = row['id']
        
        processed_text = row['processed']
        tokens = processed_text.split()
        
        ngrams = make_ngrams(tokens, n)
        
        res[id_val] = Counter(ngrams)
    return res

In [43]:
NGRAM_SIZE = 2

# id -> ngram count
id_nmap = ngram_dict(processed_data, NGRAM_SIZE)

# corpus ngram counts
corpus_counter = Counter()

for doc_ngrams in id_nmap.values():
    corpus_counter.update(doc_ngrams)
    
total_ngrams = sum(corpus_counter.values())
unique_ngrams = len(corpus_counter)

In [45]:
id_ncount = {doc_id: sum(doc_counter.values()) for doc_id, doc_counter in id_nmap.items()}

id_nunique = {doc_id: len(doc_counter) for doc_id, doc_counter in id_nmap.items()}

In [None]:
def get_top_matches(query, top_k=10):
    """
    Scores documents using Query Likelihood with Laplace smoothing,
    sorted first by number of exact n-gram matches, then by log-likelihood score.

    Returns:
        List of tuples: (doc_id, match_count, log_score)
    """
    print("Query", query)
    query = preprocess_lyrics(query)
    tokens = query.split()
    query_ngrams = make_ngrams(tokens, NGRAM_SIZE)
    print("Ngrams", query_ngrams)
    V = unique_ngrams

    results = []

    for doc_id, doc_counter in id_nmap.items():
        total_ngrams_in_doc = id_ncount[doc_id]
        log_score = 0.0
        match_count = 0

        for ngram in query_ngrams:
            count = doc_counter.get(ngram, 0)
            if count > 0:
                match_count += 1
            prob = (count + 1) / (total_ngrams_in_doc + V)
            log_score += math.log(prob)

        if match_count > 0:
            results.append((doc_id, match_count, log_score))

    sorted_results = sorted(results, key=lambda x: (x[1], x[2]), reverse=True)

    return sorted_results[:top_k]

In [120]:
def query_results(query):
    results = get_top_matches(query)

    print("\nTop 5 documents by likelihood:")
    for i, (doc_id, matches, log_prob) in enumerate(results, 1):
        doc_title = processed_data.loc[processed_data['id'] == doc_id, 'title'].iloc[0]
        print(f"{i}. Document ID: {doc_id}, Title: '{doc_title}', Matches: '{matches}', Log Probability: {log_prob:.4f}")

In [121]:
query_results("nothing is going to change my love for you")

Query nothing is going to change my love for you
Ngrams [('nothing', 'is'), ('is', 'going'), ('going', 'to'), ('to', 'change'), ('change', 'my'), ('my', 'love'), ('love', 'for'), ('for', 'you')]

Top 5 documents by likelihood:
1. Document ID: 6919821, Title: 'Baby O My Love', Matches: '151', Log Probability: -128.4591
2. Document ID: 152385, Title: 'Kiss Land', Matches: '128', Log Probability: -110.8174
3. Document ID: 528973, Title: 'Lecture 12: Sales and Marketing', Matches: '105', Log Probability: -123.5499
4. Document ID: 7079260, Title: 'For You', Matches: '96', Log Probability: -123.5591
5. Document ID: 1963041, Title: 'All This Love That Im Givin', Matches: '75', Log Probability: -128.0804
6. Document ID: 5640456, Title: 'All My Love Part 1', Matches: '75', Log Probability: -129.1522
7. Document ID: 338974, Title: 'Across the Universe', Matches: '72', Log Probability: -123.8753
8. Document ID: 4430534, Title: 'I Express My Love Live', Matches: '69', Log Probability: -124.0927
9.

In [118]:
query_results("take your time")

Query take your time
Ngrams [('take', 'your'), ('your', 'time')]

Top 5 documents by likelihood:
1. Document ID: 7351152, Title: 'Time', Matches: '2', Log Probability: -26.5694
2. Document ID: 642995, Title: 'Take Your Time', Matches: '2', Log Probability: -26.7790
3. Document ID: 3242418, Title: 'Take Your Time', Matches: '2', Log Probability: -26.9330
4. Document ID: 4214762, Title: 'Breathe Your Breath', Matches: '2', Log Probability: -27.2817
5. Document ID: 3304817, Title: 'Timeless', Matches: '2', Log Probability: -27.3793
6. Document ID: 1425025, Title: 'King Of Ska 93', Matches: '2', Log Probability: -27.4818
7. Document ID: 1537357, Title: 'Whales of Tadoussac', Matches: '2', Log Probability: -27.5900
8. Document ID: 954756, Title: 'Take Your Time', Matches: '2', Log Probability: -27.7043
9. Document ID: 4910180, Title: 'Take Your Time', Matches: '2', Log Probability: -27.7043
10. Document ID: 7296138, Title: 'Killer Love', Matches: '2', Log Probability: -27.7043
