In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import os

# Data Preprocessing

In [163]:
len(X.drop_duplicates(subset=['track_name', 'artists'], keep='first'))

81344

In [260]:
data.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [326]:
data = pd.read_csv('data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id')
features = ['track_name', 'artists', 'popularity', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['danceability', 'loudness', 'acousticness', 'valence', 'tempo']])

model = KMeans(n_clusters=5, init="k-means++", random_state=42)

In [327]:
clusters = model.fit_predict(X_scaled)
results = X.copy()
results['cluster'] = clusters.astype(str)

In [328]:
results[results['cluster'] == '2'].track_genre.value_counts()

track_genre
grindcore         724
death-metal       630
black-metal       591
minimal-techno    580
heavy-metal       556
                 ... 
reggae             16
classical          12
jazz               12
tango               8
reggaeton           6
Name: count, Length: 113, dtype: int64

# Create Song Search

In [323]:
title = 'call me maybe'
artist = ''
X[X['track_name'].str.contains(title, na=False, case=False) & X['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo
15117,Call Me Maybe (Glee Cast Version),Glee Cast,41,club,0.754,-5.595,0.114,0.722,120.03
25554,Call Me Maybe,Carly Rae Jepsen,80,electro,0.783,-6.548,0.0114,0.66,120.021
50718,Call Me Maybe,Kids Rock Kidz,16,kids,0.736,-6.875,0.066,0.61,120.064


# Create Playlist Song Recommendation

In [332]:
most_similar = np.argpartition(np.mean((X_scaled - X_scaled[50201].T)**2, axis=1), kth=4)[:4]
results.iloc[most_similar]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,3
17307,Take You Dancing,Jason Derulo,78,dance,0.789,-4.248,0.0332,0.753,112.985,3
8889,Back to Flow,Krafty Kuts,13,breakbeat,0.773,-3.973,0.0129,0.734,110.027,3
4125,Always,ENHYPEN,59,anime,0.804,-3.836,0.0466,0.73,110.014,3


In [334]:
# Create a playlist based on the index of the songs
playlist = [3884, 48383, 48625, 50201]

# Find groups each correspond to
songs = results.iloc[playlist]
clusters = model.predict(X_scaled[playlist])
majority = np.argmax(np.bincount(clusters))

In [374]:
# For each song in majority cluster, pick the songs that are cloest to the songs in the playlist
song_score = []
indices = results[results['cluster'] == str(majority)].index
N = len(playlist)
for song in indices:
    score = float('inf')
    for liked_song in playlist:
        score = min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
    song_score.append(score)

In [375]:
results.iloc[playlist]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
3884,Cry Baby,Official HIGE DANdism,66,anime,0.318,-4.605,0.0116,0.437,199.844,0
48383,夜に駆ける,YOASOBI,74,j-pop,0.67,-5.221,0.00231,0.789,130.041,3
48625,不可幸力,Vaundy,67,j-pop,0.69,-5.227,0.0157,0.613,94.032,3
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,3


In [377]:
top_k = len(playlist) + 5
most_similar = np.argpartition(song_score, top_k)[:top_k]
results.iloc[indices[most_similar]]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,3
48625,不可幸力,Vaundy,67,j-pop,0.69,-5.227,0.0157,0.613,94.032,3
48383,夜に駆ける,YOASOBI,74,j-pop,0.67,-5.221,0.00231,0.789,130.041,3
1140,YOU,Jor'dan Armstrong,27,afrobeat,0.701,-5.677,0.00647,0.608,93.028,3
38630,Represent,Nas,64,hardcore,0.708,-4.964,0.02,0.618,92.428,3
17307,Take You Dancing,Jason Derulo,78,dance,0.789,-4.248,0.0332,0.753,112.985,3
48465,Into The Night,YOASOBI,60,j-pop,0.647,-5.558,0.00496,0.78,130.024,3
9431,Return of the Mack,Mark Morrison,75,british,0.715,-5.379,0.00631,0.612,95.487,3
44984,Radio,Rammstein,70,industrial,0.652,-5.145,0.00404,0.76,132.03,3
