# Similarity-based Playlist Generation

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import IPython.display as ipd

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The value is the dot product of the two inputs divided by the product
    of their norms (as computed by ``np.linalg.norm`` with its defaults).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Registry mapping a lowercase file extension (without the leading dot) to the
# callable used to load that format.
loaders_by_format = {
    "csv": pd.read_csv,
    "xlsx": pd.read_excel,
    # NOTE(review): pd.read_sql needs a connection argument in addition to the
    # query, so invoking it with only a file path (as load_data does) raises a
    # TypeError. Kept unchanged; register a wrapper that opens the database if
    # ".db" files must be supported.
    "db": pd.read_sql,
    "parquet": pd.read_parquet,
    # SECURITY: allow_pickle=True unpickles arbitrary objects stored in the
    # file. Only load .npy files that come from a trusted source.
    "npy": lambda x: pd.DataFrame(list(np.load(x, allow_pickle=True))),
}


def load_data(filepath):
    """Load a data file, choosing the reader from the file extension.

    Args:
        filepath: Path (``str`` or ``Path``) of the file to load.

    Returns:
        Whatever the matching loader in ``loaders_by_format`` returns.

    Raises:
        KeyError: If the file name has no extension or the extension has no
            registered loader.
    """
    # Take the text after the LAST dot so names with extra dots
    # (e.g. "features.v2.csv") resolve to "csv" instead of "v2".
    _, dot, tail = Path(filepath).name.rpartition(".")
    extension = tail.lower() if dot else ""
    if extension not in loaders_by_format:
        raise KeyError(
            f"no loader registered for extension {extension!r} (file: {filepath})"
        )
    return loaders_by_format[extension](filepath)


# Load the precomputed audio-feature table and derive a few helper columns.
df = load_data("../data/processed/20250212032022_audio_features.npy")

# File name (final path component) of every track.
df["track"] = [Path(path).name for path in df["filepath"]]

# The label is split on "---": the first piece is stored as the genre and the
# last piece as the style.
df["genre"] = df["style_genre_discogs400"].map(lambda label: label.split("---")[0])
df["style"] = df["style_genre_discogs400"].map(lambda label: label.split("---")[-1])

In [None]:
# Show the column labels of the loaded feature table.
df.columns

In [None]:
# Preview a few random tracks. The sample size is capped at the table length
# so this cell also works when fewer than `n_preview` tracks are available.
n_preview = 3
preview_columns = [
    "track",
    "key_temperley_predict",
    "genre",
    "style",
    "filepath",
    "sample_rate",
]
sample = df.sample(min(n_preview, len(df)))[preview_columns]
for track in sample.to_dict(orient="records"):
    fpath = track["filepath"]
    sr = track["sample_rate"]
    # Uncomment to listen to each sampled track inline:
    # ipd.display(ipd.Audio(fpath, rate=sr))
sample

In [None]:
# Track whose embeddings serve as the similarity query.
query_track = "3RLV9wC6HBmfB3Vicwejc2.mp3"

# Look the query row up once and fail with a clear message when it is missing,
# instead of an opaque "index 0 is out of bounds" error.
query_rows = df[df["track"] == query_track]
if query_rows.empty:
    raise IndexError(f"query track {query_track!r} not found in df['track']")
query_discogs_embeddings = query_rows["discogs_embeddings_mean"].values[0]
query_musicnn_embeddings = query_rows["musicnn_embeddings_mean"].values[0]

# Work on a copy holding only the columns needed for the similarity search.
df_query = df[["filepath", "track", "discogs_embeddings_mean", "musicnn_embeddings_mean"]].copy()

# Cosine similarity between every track and the query, per embedding column.
df_query["cosine_discogs_similarity"] = df_query["discogs_embeddings_mean"].apply(
    lambda x: cosine_similarity(x, query_discogs_embeddings)
)
df_query["cosine_musicnn_similarity"] = df_query["musicnn_embeddings_mean"].apply(
    lambda x: cosine_similarity(x, query_musicnn_embeddings)
)

In [None]:
# Five highest-scoring tracks for each similarity column (discogs and musicnn).
discogs_results = (
    df_query[["filepath", "track", "cosine_discogs_similarity"]]
    .sort_values("cosine_discogs_similarity", ascending=False)
    .head(5)
)
musicnn_results = (
    df_query[["filepath", "track", "cosine_musicnn_similarity"]]
    .sort_values("cosine_musicnn_similarity", ascending=False)
    .head(5)
)

# Walk through each result table, then print it.
for result in (discogs_results, musicnn_results):
    for track in result.to_dict(orient="records"):
        fpath = track["filepath"]
        # ipd.display(ipd.Audio(fpath, rate=sr))
    print(result)