In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

In [3]:
# Directory that holds the MMSR dataset files -- change the file path here
file_path_base = r"C:\Users\Richard\Desktop\JKU\1.Semester-WS2023\MultimediaSearchAndRetrieval\\"

# Full path of each TSV file, built by appending the file name to the base directory.
file_path_info = file_path_base + "id_information_mmsr.tsv"
file_path_word2vec = file_path_base + "id_lyrics_word2vec_mmsr.tsv"
file_path_tfidf = file_path_base + "id_lyrics_tf-idf_mmsr.tsv"
file_path_bert = file_path_base + "id_lyrics_bert_mmsr.tsv"

# Read the four tables in the same order as the paths above; each one lands in its own DataFrame.
df_info, df_word2vec, df_tfidf, df_bert = (
    pd.read_table(path)
    for path in (file_path_info, file_path_word2vec, file_path_tfidf, file_path_bert)
)

# Show the track metadata table as the cell output.
df_info

Unnamed: 0,id,artist,song,album_name
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition)
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002)
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te
...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue


In [4]:
# Random baseline

def random_baseline(data: pd.DataFrame, title: str, artist: str, n: int, random_state=None) -> pd.DataFrame:
    """Return up to ``n`` randomly chosen tracks from ``data`` as a retrieval baseline.

    The query track (the rows whose ``artist`` and ``song`` equal the given
    ``artist`` and ``title``) is removed from the candidates before sampling,
    so it is never returned as its own recommendation.

    Parameters
    ----------
    data : DataFrame with at least the columns ``id``, ``artist`` and ``song``.
    title, artist : identify the query track that is excluded from the sample.
    n : number of tracks to draw; capped at the number of available candidates.
    random_state : forwarded to ``DataFrame.sample``; pass a fixed value to get
        a reproducible draw. The default ``None`` gives a fresh random draw.

    Returns
    -------
    DataFrame with the columns ``id``, ``artist`` and ``song`` (original index kept).
    """
    is_query = (data["artist"] == artist) & (data["song"] == title)
    # Sample from the frame that was passed in, not from a notebook-level global.
    candidates = data.loc[~is_query, ["id", "artist", "song"]]
    # Sampling without replacement cannot return more rows than exist.
    return candidates.sample(n=min(n, len(candidates)), random_state=random_state)
    
# Draw 10 random tracks as a baseline result for the query "Jingle Bells" by Frank Sinatra.
random_baseline(data=df_info, title="Jingle Bells", artist="Frank Sinatra", n=10)

Unnamed: 0,id,artist,song
10026,zTxCINaydSvXbYha,Ed Motta,Colombina
1482,8qQGED0CK83gqK2H,Sheryl Crow,Redemption Day
4646,SSl9GsGNsv28e6sd,Belle and Sebastian,If She Wants Me
534,34X08ZxyRV0GJF75,Girl In Red,watch you sleep.
6970,h0Jaex0Pdbn3aVXv,Green Day,Wake Me Up When September Ends
4407,R0kjvPkfpzkRCdE1,Primal Scream,Damaged
7680,lLr42PhCGChOItPt,Zola Blood,Play Out
7920,mjBqNBFLspc3FCtE,Napalm Death,Greed Killing
4979,UZEnd6AnZjUSnGES,Tut Tut Child,Dance To It
9063,tddx4dWD9zLD1viM,Sugababes,Joy Division


In [5]:
def cos_sim(query: np.ndarray, target: np.ndarray) -> float:
    """Cosine similarity between two feature vectors.

    Both inputs may be any array-like (list, ndarray, single-row DataFrame);
    they are flattened to 1-D float arrays first, so the result is always a
    plain Python float.

    Returns 0.0 when either vector has zero length, instead of dividing by
    zero and producing NaN.
    """
    query_vec = np.asarray(query, dtype=float).ravel()
    target_vec = np.asarray(target, dtype=float).ravel()
    denominator = norm(query_vec) * norm(target_vec)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" rather than 0/0.
        return 0.0
    return float(np.dot(query_vec, target_vec) / denominator)

def euc_sim(query: np.ndarray, target: np.ndarray) -> float:
    """Similarity derived from the Euclidean distance: ``1 / (1 + ||query - target||)``.

    Both inputs may be any array-like (list, ndarray, single-row DataFrame);
    they are flattened to 1-D float arrays first. The result is a plain Python
    float in ``(0, 1]``, equal to 1.0 when the two vectors are identical.
    """
    query_vec = np.asarray(query, dtype=float).ravel()
    target_vec = np.asarray(target, dtype=float).ravel()
    return float(1.0 / (1.0 + norm(query_vec - target_vec)))
  
def text_based_retrieval(info: pd.DataFrame, feature: pd.DataFrame, title: str, artist: str, n: int, sim_func = cos_sim) -> pd.DataFrame:
    """Return the ``n`` tracks whose feature vectors are most similar to the query track.

    Parameters
    ----------
    info : track metadata with at least the columns ``id``, ``artist`` and ``song``.
        It is not modified; the similarity column lives on an internal copy.
    feature : one row per track: an ``id`` column plus the feature columns.
    title, artist : identify the query track in ``info``.
    n : maximum number of tracks to return.
    sim_func : callable ``(query_vector, target_vector) -> similarity``; larger
        values mean "more similar".

    Returns
    -------
    DataFrame with the columns ``id``, ``artist`` and ``song``, ordered by
    descending similarity. The query track itself is never included. Tracks
    that have no row in ``feature`` get a NaN similarity and sort last.

    Raises
    ------
    IndexError
        If the query track is not present in ``info`` or has no row in ``feature``.
    """
    # Look up the id of the query track in the metadata table.
    matches = info.loc[(info["artist"] == artist) & (info["song"] == title), "id"]
    if matches.empty:
        raise IndexError(f"No track titled {title!r} by {artist!r} found in info")
    query_id = matches.iloc[0]

    # Fetch the query's feature vector as a flat 1-D array.
    query_rows = feature.loc[feature["id"] == query_id]
    if query_rows.empty:
        raise IndexError(f"No feature vector found for track id {query_id!r}")
    query_vec = query_rows.drop(columns="id").to_numpy()[0]

    # Similarity between the query and every track, keyed by track id so the
    # scores can be attached to `info` without relying on matching row order.
    vectors = feature.drop(columns="id").to_numpy()
    sims = pd.Series(
        [float(sim_func(query_vec, target)) for target in vectors],
        index=feature["id"].to_numpy(),
    )
    # Series.map needs a unique index; keep the first score for a repeated id.
    sims = sims[~sims.index.duplicated(keep="first")]

    ranked = info.assign(sim=info["id"].map(sims))  # assign() works on a copy of info
    # Drop the query by id instead of assuming it sorts into first place.
    ranked = ranked[ranked["id"] != query_id]
    # mergesort is stable, so tied tracks keep their original relative order.
    ranked = ranked.sort_values(by="sim", ascending=False, kind="mergesort")
    return ranked[["id", "artist", "song"]].head(n)
    

In [6]:
# Text-based retrieval: cosine similarity on the tf-idf lyrics features
text_based_retrieval(info=df_info, feature=df_tfidf, title="Jingle Bells", artist="Frank Sinatra", n=10)

Unnamed: 0,id,artist,song
2778,GvR3ihpANzPFUV9q,Ryan Star,We Might Fall
6220,cZq0NEOrY3Ub5Bqc,All Time Low,"Merry Christmas, Kiss My Ass"
1299,7o67ctY7Qo945yfv,The Smashing Pumpkins,The Everlasting Gaze
8221,oUaMwbrwMD9grrIA,Mayhem,Crystalized Pain In Deconstruction
5240,WESZfNkEXTSzBsaH,Love Generation,Dance Alone
8349,pH97idDxXVlnq3xH,Dixie Chicks,Loving Arms
8105,nkmwTJvccNwNK8mo,Suede,The Chemistry Between Us
9543,wYoJBT9Lg1bBf8Nd,Moby,One Of These Mornings
2856,HMvRdVsvXUPwROWw,Title Fight,Liar's Love
4039,OqVFiTyVf5wR5FNq,Brandi Carlile,Late Morning Lullaby


In [7]:
# Text-based retrieval: cosine similarity on the BERT lyrics features
text_based_retrieval(info=df_info, feature=df_bert, title="Jingle Bells", artist="Frank Sinatra", n=10)

Unnamed: 0,id,artist,song
2778,GvR3ihpANzPFUV9q,Ryan Star,We Might Fall
4257,QBGxad2tXIjeAZmP,Nevilton,Pressuposto
3128,J3Y3E5wh2TcJcUGM,Steel Panther,Death To All But Metal
1379,8DxVQ1vrjsNTIqOv,New Found Glory,My Friends Over You
2423,EeUoJbeUQNiHGJeX,Angra,Caveman
1758,AZGxAypdz7mBmx1R,Emilie Autumn,Opheliac
1756,AYxLbkF55hvueDLM,Gloria Groove,Coisa Boa
9804,y3WWGqthYTQjP7uM,Eddie Vedder,Goodbye
451,2ZFMLcQ9rWNndcLi,Jane Weaver,Don't Take My Soul
5420,XLT9XQT48DfHAZEc,Elton John,No Shoe Strings On Louise


In [8]:
# Text-based retrieval: Euclidean-distance similarity on the word2vec lyrics features
text_based_retrieval(info=df_info, feature=df_word2vec, title="Jingle Bells", artist="Frank Sinatra", n=10, sim_func=euc_sim)

Unnamed: 0,id,artist,song
2463,EsxmiDUT0v0NDbWP,Thousand Foot Krutch,New Drug
4319,QUcZsJvxjp5NkHSx,Mr. Big,Shine
3462,LIYIBenQlQQnEjRA,The Three Degrees,Dirty Ol' Man
8349,pH97idDxXVlnq3xH,Dixie Chicks,Loving Arms
8756,rlMcTiHz9HSidm55,Everyone Everywhere,"$1,000,000,000"
4880,TuBvinshhNZgQpYN,Flume,Holdin On
1741,ASmQDQZeVJytIHp4,No Doubt,Full Circle
2646,G7KquYFevSK3v9Ve,Killer Be Killed,Face Down
8922,sqnkDpNrmNYok0Og,The Smashing Pumpkins,Beautiful
667,3ur9VXvhUvHSMZIK,Brandon Flowers,Hard Enough
