<a href="https://colab.research.google.com/github/danoonan2021/IS4200_final_project/blob/main/word_stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk



In [2]:
import kagglehub

# Fetch the latest version of the Genius lyrics dataset through kagglehub and
# remember where the files were placed.
DATASET_HANDLE = "carlosgdcj/genius-song-lyrics-with-language-information"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/carlosgdcj/genius-song-lyrics-with-language-information?dataset_version_number=1...


100%|██████████| 3.04G/3.04G [01:18<00:00, 41.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/carlosgdcj/genius-song-lyrics-with-language-information/versions/1


In [3]:
import os

import pandas as pd

# Load the lyrics CSV from the downloaded dataset directory. The path is joined
# with os.path.join rather than a hard-coded "/" so it works on any platform.
data = pd.read_csv(os.path.join(path, "song_lyrics.csv"))

In [10]:
# Keep only the rows whose `language` column equals 'en' (English songs).
is_english = data["language"].eq("en")
filtered_data = data.loc[is_english]

In [25]:
import re
import string
import numpy as np
from nltk.stem import SnowballStemmer

# Shared English Snowball stemmer instance; stem_query() uses it to stem each query token.
stemmer = SnowballStemmer("english")

# Every ASCII punctuation character except the apostrophe, which is kept so that
# contractions such as "i've" survive preprocessing.
_PUNCT_TO_REMOVE = string.punctuation.replace("'", "")
# Deletion table for str.translate, built once here instead of on every call.
_PUNCT_TABLE = str.maketrans("", "", _PUNCT_TO_REMOVE)
# Bracketed section tags such as "[Intro]" (non-greedy; "." does not cross newlines).
_RE_BRACKET_TAGS = re.compile(r"\[.*?\]")
# Any run of whitespace (spaces, tabs, newlines).
_RE_WHITESPACE = re.compile(r"\s+")


def preprocess_lyrics(lyric: str) -> str:
    """Normalize raw lyrics (or a query) into a lowercase, punctuation-free string.

    Steps, in order: lowercase, remove ``[...]`` tags, delete all ASCII
    punctuation except apostrophes, collapse whitespace runs to a single space
    and strip the ends. Punctuation is deleted, not replaced by a space, so
    ``"din-an-ah"`` becomes ``"dinanah"``.

    Args:
        lyric: The raw text. Any non-string value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``lyric`` is not a string.
    """
    if not isinstance(lyric, str):
        return ""

    text = _RE_BRACKET_TAGS.sub("", lyric.lower())
    text = text.translate(_PUNCT_TABLE)
    return _RE_WHITESPACE.sub(" ", text).strip()

def add_processed_column(data, source_col: str = "lyrics"):
    """Return ``data`` with a ``processed`` column of cleaned lyrics.

    Args:
        data: DataFrame holding the raw lyrics.
        source_col: Name of the column whose text is passed through
            ``preprocess_lyrics``.

    Returns:
        ``data`` itself if it already has a ``processed`` column; otherwise a
        new DataFrame with that column added. The input frame is not modified.

    Raises:
        KeyError: If ``source_col`` is not a column of ``data``.
    """
    # Already processed: avoid redoing the (expensive) per-row cleaning.
    if "processed" in data.columns:
        return data
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return data.assign(processed=data[source_col].apply(preprocess_lyrics))

def stem_query(query):
    """Stem every whitespace-separated token of ``query`` and re-join them with single spaces."""
    stemmed_tokens = []
    for word in query.split():
        stemmed_tokens.append(stemmer.stem(word))
    return " ".join(stemmed_tokens)

def simple_search(query,
                  data,
                  top_k: int = 100):
    """Find songs whose processed lyrics contain the query phrase, ranked by views.

    The query is cleaned with ``preprocess_lyrics`` and matched as a literal
    substring of ``data['processed']``. A second literal match is made with the
    stemmed form of the cleaned query. Rows hit by either match are
    de-duplicated on ``id``, scored, sorted and truncated.

    Args:
        query: Free-text phrase to look for.
        data: DataFrame with at least ``processed``, ``views`` and ``id`` columns.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with a fresh 0..n-1 index, sorted by ``norm_views``
        descending. ``norm_views`` is ``views`` min-max scaled over the hits
        (1.0 for every hit when all hits share the same view count). When the
        query is empty after cleaning, or nothing matches, an empty DataFrame
        with ``data``'s columns is returned.
    """
    cleaned = preprocess_lyrics(query)
    if not cleaned:
        # An empty pattern would match every row; treat it as "no match".
        return pd.DataFrame(columns=data.columns)

    # Stem the *cleaned* query so casing/punctuation in the raw input cannot
    # make the stemmed pattern unmatchable.
    # NOTE(review): the stemmed phrase is matched against un-stemmed
    # `processed` text, so it only hits lyrics that happen to contain the
    # stemmed word forms -- confirm this is intended.
    stemmed = stem_query(cleaned)

    lyrics = data['processed']
    # Literal substring search; na=False keeps missing lyrics from producing a
    # NaN mask that boolean indexing would reject.
    mask = lyrics.str.contains(cleaned, regex=False, na=False)
    if stemmed and stemmed != cleaned:
        mask = mask | lyrics.str.contains(stemmed, regex=False, na=False)

    hits = data[mask].drop_duplicates('id')
    if hits.empty:                      # no match at all
        return pd.DataFrame(columns=data.columns)

    # Normalize the popularity into a score
    views = hits['views']
    lowest = views.min()
    span = views.max() - lowest
    if span > 0:
        norm_views = (views - lowest) / span
    else:
        norm_views = 1.0  # Full score: every hit has the same view count

    ranked = (hits
              .assign(norm_views=norm_views)
              # Stable sort keeps tied rows in their original relative order.
              .sort_values('norm_views', ascending=False, kind='mergesort')
              .head(top_k))

    return ranked.reset_index(drop=True)


In [13]:
# Attach the cleaned "processed" lyrics column to the English-only songs.
processed_data = add_processed_column(filtered_data, source_col="lyrics")

In [26]:
# Try the search on a sample phrase; the resulting frame is the cell's output.
test_query = "take your time coming home"
simple_search(test_query, processed_data)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,processed,norm_views
0,Take Your Time Coming Home,rock,fun.,2009,11925,{​fun.},[Intro]\nDin-an-ah\nDin-an-ah\nHoldin' on-nah\...,296937,en,en,en,dinanah dinanah holdin' onnah holdin' onnah ho...,1.0
1,Take Your Time,pop,fun.,2015,1099,{​fun.},Take your time coming home\nHear the wheels as...,934454,en,en,en,take your time coming home hear the wheels as ...,0.089104
2,Take Your Time - Acoustic,pop,fun.,2009,343,{​fun.},"One, two, three, four\n\nTake your time coming...",5901036,en,en,en,one two three four take your time coming home ...,0.025494
3,Never Had the Courage,pop,Chase Coy,2010,213,{},"I've diled your number, half a thousand times\...",1716287,en,en,en,i've diled your number half a thousand times h...,0.014556
4,I Never Had The Courage,pop,Chase Coy,2015,82,{},I've dialed your number\nHalf a thousand times...,1761415,en,en,en,i've dialed your number half a thousand times ...,0.003534
5,Keep from Moving,pop,Creeper Lagoon,2001,40,{},Take down this ragged nest\nYou have built on ...,1427161,en,en,en,take down this ragged nest you have built on u...,0.0
