In [12]:
from pathlib import Path

import pandas as pd

# dataset link: https://www.kaggle.com/datasets/mrmorj/dataset-of-songs-in-spotify?select=genres_v2.csv

# Load the raw CSV and tidy it in one pass:
#   1. index rows by the "id" column (set_index also removes "id" from the columns),
#   2. drop the columns that are unused / non-numeric,
#   3. discard rows that have no song_name,
#   4. keep only the first row for every repeated id.
songs_df = (
    pd.read_csv("spotify-songs.csv", low_memory=False)
    .set_index("id")
    .drop(columns=["analysis_url", "uri", "track_href", "type", "title", "Unnamed: 0", "genre"])
    .dropna(subset=["song_name"])
    .loc[lambda frame: ~frame.index.duplicated(keep="first")]
)
songs_df.head(5)

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2Vc6NJ9PW9gD9q343XFRKx,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,124539,4,Mercury: Retrograde
7pgJBLVz5VmnL7uGHmRj6p,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,224427,4,Pathology
0vSWgAlfpye0WCGeNmuNhy,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,98821,4,Symbiote
0VSXnJqQkwuH2ei1nOQ1nu,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,123661,3,ProductOfDrugs (Prod. The Virus and Antidote)
4jCeguq9rMTlbMmPHuO7S3,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,147.988,123298,4,Venom


In [13]:
# used to look up song names based on id
# songs_df is indexed by id (duplicates already removed), so the "song_name"
# column converts directly into an {id: song_name} dict. This avoids building
# a full row for every song just to read one field, and avoids using the
# builtin name `id` as a loop variable.
song_lookup = songs_df["song_name"].to_dict()

In [14]:
# Remove song_name so only numeric feature columns remain.
# Done out-of-place with errors="ignore" so re-running this cell does not
# raise once the column is already gone.
songs_df = songs_df.drop(columns="song_name", errors="ignore")

# Min-max normalize every column: (x - min) / (max - min).
# The result is a NEW frame; songs_df keeps the raw values (plain assignment
# `normalized_songs_df = songs_df` would only alias the same object).
# Descriptive names are used instead of `min`/`max` so the builtins are not
# overwritten for the rest of the notebook.
col_min = songs_df.min()
col_max = songs_df.max()
# A constant column has a range of 0; dividing by 1 instead maps it to all
# zeros rather than 0/0 = NaN, which would turn every distance into NaN.
col_range = (col_max - col_min).replace(0, 1)
normalized_songs_df = (songs_df - col_min) / col_range

normalized_songs_df.head(10)


Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2Vc6NJ9PW9gD9q343XFRKx,0.829884,0.81477,0.181818,0.712039,1.0,0.430304,0.060525,0.013549,0.036345,0.38399,0.601808,0.124922,0.75
7pgJBLVz5VmnL7uGHmRj6p,0.708527,0.49337,0.727273,0.71571,1.0,0.06141,0.40587,0.0,0.101891,0.107776,0.338223,0.26967,0.75
0vSWgAlfpye0WCGeNmuNhy,0.850471,0.893868,0.454545,0.782742,1.0,0.04289,0.013967,4e-06,0.368697,0.019283,0.98591,0.087654,0.75
0VSXnJqQkwuH2ei1nOQ1nu,0.445227,0.781729,0.0,0.784742,1.0,0.086971,0.023987,0.0,0.097689,0.160934,0.790277,0.123649,0.5
4jCeguq9rMTlbMmPHuO7S3,0.794127,0.624533,0.181818,0.703712,1.0,0.292754,0.219635,0.0,0.152311,0.594538,0.545216,0.123123,0.75
6fsypiJHyWmeINsOLC1cos,0.710695,0.568464,0.0,0.604356,1.0,0.423806,0.045748,0.214358,0.112395,0.092141,0.525887,0.107492,0.75
0XfQbq7DaMOmVXgQ71eA6E,0.707444,0.668588,0.727273,0.799753,1.0,0.123795,0.025707,0.007887,0.108193,0.018136,0.437266,0.056879,0.75
0LLeuNBWPOg3XA73yab3PT,0.681439,0.711642,0.727273,0.762416,1.0,0.214773,0.040181,0.0,0.095588,0.273504,0.482699,0.129247,0.5
37gqBnUAZe8BY8WR56kDNk,0.768122,0.751691,0.090909,0.846788,1.0,0.189862,0.062145,0.0,0.054412,0.175526,0.997924,0.147799,0.75
2ggqfj97qyiORmXoVFzP5j,0.897064,0.907886,1.0,0.628708,1.0,0.372902,0.153845,0.031446,0.564076,0.293308,0.87201,0.121212,0.75


In [15]:
from sklearn.metrics import DistanceMetric
metric = DistanceMetric.get_metric("euclidean")

# Songs to find neighbours for: {track id: human-readable label}.
target_songs = {
    "6gBFPUFcJLzWGx4lenP6h2": "goosebumps - Travis Scott",
    "1MYlx4dBtiyjn7K8YSyfzT": "Real Slim Shady - Eminem",
    "5Sg09MvHqNWPWsYeuY2toY": "Blinding Lights - The Weeknd"
}

# Built once, outside the loop: one row per song, in index order.
feature_matrix = normalized_songs_df.to_numpy()
all_song_ids = normalized_songs_df.index

# {target id: {song id: euclidean distance from the target}}
song_distances = {}

for target_id in target_songs:
    # .loc raises KeyError if the target id is not present in the dataset.
    target_row = normalized_songs_df.loc[target_id].to_numpy().reshape(1, -1)

    # One pairwise call yields a (1, n_songs) matrix of distances from the
    # target to every song, instead of one tiny 2x2 call per row.
    distances = metric.pairwise(target_row, feature_matrix)[0]

    # populate distances from current song
    song_distances[target_id] = dict(zip(all_song_ids, distances))

In [19]:

# Write one text file per target song listing its 10 nearest neighbours.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)  # open() does not create missing directories

for song_id, song_distance in song_distances.items():
    ranked = sorted(song_distance.items(), key=lambda item: item[1])

    # Exclude the target by id rather than by position: another track with
    # identical features also has distance 0 and may sort ahead of the target,
    # in which case slicing [1:11] would drop it and keep the target instead.
    most_similar_songs = [
        (other_id, dist) for other_id, dist in ranked if other_id != song_id
    ][:10]

    song_name = song_lookup[song_id]
    lines = [f"Output for {song_name}:\n"]
    lines.extend(
        f"{rank}, {song_lookup[other_id]}, {dist}\n"
        for rank, (other_id, dist) in enumerate(most_similar_songs, start=1)
    )
    output = "".join(lines)

    # Path separators in a title would otherwise be read as sub-directories.
    safe_name = song_name.replace("/", "_").replace("\\", "_")

    # Explicit UTF-8 so titles with non-ASCII characters are written the same
    # way regardless of the platform's default encoding.
    with open(results_dir / f"output_{safe_name}.txt", "w", encoding="utf-8") as f:
        f.write(output)