In [198]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import correlation
from tqdm.notebook import tqdm
import pickle
import os

# Data Preprocessing

In [276]:
data = pd.read_csv('data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id')
features = ['track_name', 'artists', 'popularity', 'energy', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['popularity', 'energy', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']])
cos_scaled = X_scaled / np.linalg.norm(X_scaled, axis=1)[:, np.newaxis]

euc_model = KMeans(n_clusters=9, init="k-means++", random_state=42)
cos_model = KMeans(n_clusters=9, init="k-means++", random_state=42)

In [301]:
clusters = euc_model.fit_predict(X_scaled)
results = X.copy()
results['cluster'] = clusters.astype(str)

# cos_model.fit(cos_scaled)
# results['cluster'] = clusters.astype(str)

"""
with open('eucd.pkl', 'wb') as handle:
    pickle.dump(euc_model, handle)
with open('cos.pkl', 'wb') as handle:
    pickle.dump(cos_model, handle)
"""


In [279]:
results[results['cluster'] == '2'].track_genre.value_counts()

track_genre
grindcore      936
black-metal    881
death-metal    671
heavy-metal    659
hardstyle      534
              ... 
pagode           2
sad              2
pop-film         2
tango            2
k-pop            1
Name: count, Length: 110, dtype: int64

# Create Song Search

In [280]:
title = 'Pretender'
artist = 'HIGE'
X[X['track_name'].str.contains(title, na=False, case=False) & X['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_name,artists,popularity,energy,track_genre,danceability,loudness,acousticness,valence,tempo
4146,Pretender,Official HIGE DANdism,60,0.869,anime,0.538,-3.464,0.047,0.369,91.972
4375,Pretender - LIVE,Official HIGE DANdism,25,0.786,anime,0.491,-7.452,0.0178,0.251,91.957


In [281]:
data[data['track_name'].str.contains(title, na=False, case=False) & data['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
5294,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,anime
5537,2XRTwDgMQajOFOO7dH5pvP,Official HIGE DANdism,One-Man Tour 2021-2022 -Editorial-@Saitama Sup...,Pretender - LIVE,25,344813,False,0.491,0.786,8,-7.452,1,0.0382,0.0178,0.0,0.671,0.251,91.957,4,anime
62587,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,j-pop
63231,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,j-rock
63528,2XRTwDgMQajOFOO7dH5pvP,Official HIGE DANdism,One-Man Tour 2021-2022 -Editorial-@Saitama Sup...,Pretender - LIVE,25,344813,False,0.491,0.786,8,-7.452,1,0.0382,0.0178,0.0,0.671,0.251,91.957,4,j-rock


# Create Playlist Song Recommendation

In [282]:
most_similar = np.argpartition(np.mean((X_scaled - X_scaled[4146].T)**2, axis=1), kth=7)[:7]
results.iloc[most_similar]

Unnamed: 0,track_name,artists,popularity,energy,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
4146,Pretender,Official HIGE DANdism,60,0.869,anime,0.538,-3.464,0.047,0.369,91.972,3
54374,tear gas,Architects,58,0.874,metalcore,0.51,-3.557,4.5e-05,0.336,85.985,3
54159,Rise,Skillet,61,0.921,metal,0.57,-3.892,0.0127,0.381,87.938,3
23832,Story Of My Life (feat. Trippie Redd),ILLENIUM;Sueco;Trippie Redd,65,0.892,dub,0.521,-4.029,0.0539,0.369,96.991,3
43858,Aaoge Tum Kabhi,The Local Train,56,0.863,indian,0.534,-4.616,0.0152,0.374,98.006,3
44832,Hate The Way (feat. blackbear),G-Eazy;blackbear,59,0.865,indie-pop,0.569,-3.844,0.0673,0.361,82.011,3
54194,deep fake,Architects,63,0.854,metal,0.505,-4.167,0.000101,0.319,96.023,3


In [295]:
# Create a playlist based on the index of the songs
playlist = [50201] # [3884, 48383, 48625, 50201]

# Find groups each correspond to
songs = results.iloc[playlist]
clusters = euc_model.predict(X_scaled[playlist])
majority = np.argmax(np.bincount(clusters))

In [296]:
# For each song in majority cluster, pick the songs that are cloest to the songs in the playlist
song_score = []
indices = results[results['cluster'] == str(majority)].index
N = len(playlist)
for song in indices:
    score = float('inf')
    for liked_song in playlist:
        score = min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
    song_score.append(score)

In [297]:
results.iloc[playlist]

Unnamed: 0,track_name,artists,popularity,energy,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,0.725,k-pop,0.781,-3.571,0.0281,0.765,112.045,8


In [298]:
top_k = len(playlist) + 10
most_similar = np.argsort(song_score)[:top_k]# np.argpartition(song_score, top_k)[:top_k]
results.iloc[indices[most_similar]]

Unnamed: 0,track_name,artists,popularity,energy,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,0.725,k-pop,0.781,-3.571,0.0281,0.765,112.045,8
17307,Take You Dancing,Jason Derulo,78,0.711,dance,0.789,-4.248,0.0332,0.753,112.985,8
42707,Alors On Danse - DubDogz Remix,Stromae;Dubdogz,70,0.752,house,0.789,-4.274,0.027,0.739,122.98,8
49712,Filter,BTS,76,0.762,k-pop,0.779,-5.188,0.0224,0.859,110.043,8
49889,Blue Hour,TOMORROW X TOGETHER,73,0.82,k-pop,0.71,-4.362,0.00962,0.746,112.005,8
17607,Ain't Your Mama,Jennifer Lopez,71,0.678,dance,0.757,-5.529,0.0472,0.82,120.075,8
27736,Green Green Grass,George Ezra,75,0.738,folk,0.685,-4.413,0.0695,0.8,112.972,8
50440,Blueming,IU,69,0.674,k-pop,0.819,-3.145,0.0849,0.682,106.032,8
49716,HOT,SEVENTEEN,80,0.777,k-pop,0.765,-4.376,0.0539,0.64,111.944,8
49654,Dynamite,BTS,85,0.765,k-pop,0.746,-4.41,0.0112,0.737,114.044,8


# Genre Evaluation
Based on a custom-made grouping of the genres, evaluate the performance of the model.

In [299]:
# Custom genre groupings for each possible genre in the dataset
genre_groupings = [
    ['acoustic', 'folk', 'singer-songwriter', 'bluegrass', 'country', 'honky-tonk', 'rockabilly', 'study', 'guitar', 'piano', 'new-age', 'ambient', 'sleep', 'chill'],
    ['rock', 'alt-rock', 'alternative', 'hard-rock', 'grunge', 'punk', 'punk-rock', 'emo', 'psych-rock', 'rock-n-roll', 'indie', 'indie-pop', 'power-pop', 'goth', 'industrial'],
    ['metal', 'black-metal', 'death-metal', 'heavy-metal', 'hardcore', 'metalcore', 'grindcore', 'hardstyle'],
    ['electronic', 'edm', 'dance', 'electro', 'house', 'deep-house', 'chicago-house', 'progressive-house', 'techno', 'detroit-techno', 'minimal-techno', 'trance', 'dubstep', 'drum-and-bass', 'breakbeat', 'idm', 'trip-hop', 'garage', 'club', 'party', 'synth-pop', 'disco'],
    ['hip-hop', 'r-n-b', 'soul', 'funk', 'groove', 'reggae', 'dancehall', 'dub'],
    ['jazz', 'blues', 'classical', 'opera', 'show-tunes', 'disney', 'pop-film', 'romance', 'sad', 'happy'],
    ['pop', 'k-pop', 'j-pop', 'j-dance', 'j-idol', 'j-rock', 'anime', 'cantopop', 'mandopop'],
    ['latin', 'latino', 'salsa', 'samba', 'brazil', 'mpb', 'pagode', 'forro', 'sertanejo', 'tango', 'spanish', 'afrobeat', 'reggaeton'],
    ['world-music', 'children', 'kids', 'comedy', 'french', 'german', 'indian', 'iranian', 'malay', 'turkish', 'british', 'swedish']
]

# Create genre to group dictionary
genre_groups = {}
for idx, group in enumerate(genre_groupings):
    for genre in group:
        genre_groups[genre] = idx


In [300]:
# Find group majority
group_indices = [genre_groups[genre] for genre in results.iloc[playlist].track_genre]
counts = np.bincount(group_indices)
majority_group = np.argmax(counts)

# Calculate the ratio of the recommended songs that are in the same genre group (higher is better)
total = top_k - len(playlist)
in_group = sum([1 for genre in results.iloc[indices[most_similar]][len(playlist)-1:].track_genre if genre_groups[genre] == majority_group])
print(in_group / total)

0.6
