In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import correlation
from tqdm.notebook import tqdm
import pickle
import os

# Data Preprocessing

In [152]:
data = pd.read_csv('data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id')
features = ['track_name', 'artists', 'popularity', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['popularity', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']])
cos_scaled = X_scaled / np.linalg.norm(X_scaled, axis=1)[:, np.newaxis]

euc_model = KMeans(n_clusters=9, init="k-means++", random_state=42)
cos_model = KMeans(n_clusters=9, init="k-means++", random_state=42)

In [153]:
clusters = euc_model.fit_predict(X_scaled)
results = X.copy()
results['cluster'] = clusters.astype(str)

# cos_model.fit(cos_scaled)
# results['cluster'] = clusters.astype(str)

"""
with open('eucd.pkl', 'wb') as handle:
    pickle.dump(euc_model, handle)
with open('cos.pkl', 'wb') as handle:
    pickle.dump(cos_model, handle)
"""


In [119]:
results[results['cluster'] == '2'].track_genre.value_counts()

track_genre
salsa          482
forro          380
dancehall      355
children       341
sertanejo      308
              ... 
heavy-metal      5
metalcore        4
ambient          4
romance          2
black-metal      1
Name: count, Length: 108, dtype: int64

# Create Song Search

In [147]:
title = 'Sweater Weather'
artist = ''
X[X['track_name'].str.contains(title, na=False, case=False) & X['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo
1897,Sweater Weather,The Neighbourhood,93,alt-rock,0.612,-2.81,0.0495,0.398,124.053
2509,Sweater Weather - Young Saab Remix,The Neighbourhood;Young Saab,61,alt-rock,0.583,-4.179,0.000553,0.421,124.029
13648,sweater weather / i wanna be your girlfriend,untrusted;creamy;11:11 Music Group,57,chill,0.655,-7.991,0.299,0.329,124.045
13677,Sweater Weather (Lofi Remix),89ine,45,chill,0.603,-16.694,0.103,0.076,96.018
25954,Sweater Weather,Gaullin;Julian Perretta,63,electronic,0.57,-5.188,0.00853,0.236,125.086
29957,Sweater Weather,Remzcore,61,french,0.507,-0.181,0.0467,0.23,95.0
44165,Sweater Weather,Swattrex;Lofi By Swattrex,0,indian,0.464,-14.002,0.208,0.0541,96.035


In [121]:
data[data['track_name'].str.contains(title, na=False, case=False) & data['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
5294,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,anime
5537,2XRTwDgMQajOFOO7dH5pvP,Official HIGE DANdism,One-Man Tour 2021-2022 -Editorial-@Saitama Sup...,Pretender - LIVE,25,344813,False,0.491,0.786,8,-7.452,1,0.0382,0.0178,0.0,0.671,0.251,91.957,4,anime
62587,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,j-pop
63231,15HNdxGKNCIO9pgaY4n7FU,Official HIGE DANdism,Traveler,Pretender,60,326842,False,0.538,0.869,8,-3.464,1,0.0275,0.047,0.0,0.14,0.369,91.972,4,j-rock
63528,2XRTwDgMQajOFOO7dH5pvP,Official HIGE DANdism,One-Man Tour 2021-2022 -Editorial-@Saitama Sup...,Pretender - LIVE,25,344813,False,0.491,0.786,8,-7.452,1,0.0382,0.0178,0.0,0.671,0.251,91.957,4,j-rock


# Create Playlist Song Recommendation

In [122]:
most_similar = np.argpartition(np.mean((X_scaled - X_scaled[4146].T)**2, axis=1), kth=7)[:7]
results.iloc[most_similar]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
4146,Pretender,Official HIGE DANdism,60,anime,0.538,-3.464,0.047,0.369,91.972,6
54159,Rise,Skillet,61,metal,0.57,-3.892,0.0127,0.381,87.938,6
54042,Decadence,Disturbed,62,metal,0.528,-3.457,5.7e-05,0.305,91.934,6
54077,One For the Money,Escape the Fate,65,metal,0.525,-3.451,0.000328,0.385,90.013,6
44972,Get Got,Death Grips,59,industrial,0.57,-3.757,0.00628,0.405,87.0,6
54374,tear gas,Architects,58,metalcore,0.51,-3.557,4.5e-05,0.336,85.985,6
74328,Corazon de mimbre,Marea,55,spanish,0.521,-3.548,0.0235,0.323,90.555,6


In [148]:
# Create a playlist based on the index of the songs
playlist = [1897] # [3884, 48383, 48625, 50201]

# Find groups each correspond to
songs = results.iloc[playlist]
clusters = euc_model.predict(X_scaled[playlist])
majority = np.argmax(np.bincount(clusters))

## Find the nearest song based on the correlation distance between the two songs
Intuitively, this is because similar songs won't have exactly the same absolute values between the values of their attributes, but instead they should have similar relative values between their features (i.e. similar feels and vibes). Therefore, it makes sense to instead consider their correlation distance with each other.

In [149]:
# For each song in majority cluster, pick the songs that are cloest to the songs in the playlist
song_score = []
indices = results[results['cluster'] == str(majority)].index
N = len(playlist)
for song in indices:
    score = float('inf')
    for liked_song in playlist:
        score = min(score, correlation(X_scaled[song], X_scaled[liked_song])) # min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
    song_score.append(score)

In [150]:
results.iloc[playlist]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
1897,Sweater Weather,The Neighbourhood,93,alt-rock,0.612,-2.81,0.0495,0.398,124.053,6


In [159]:
top_k = len(playlist) + 10
most_similar = np.argsort(song_score)[len(playlist):len(playlist) + 10]# np.argpartition(song_score, top_k)[:top_k]
results.iloc[indices[most_similar]]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
17354,PILLOWTALK,ZAYN,78,dance,0.584,-4.275,0.117,0.438,124.944,6
17299,Cool for the Summer,Demi Lovato,81,dance,0.583,-5.639,0.00425,0.336,114.06,6
2018,Tongue Tied,Grouplove,79,alt-rock,0.56,-5.835,0.00847,0.371,112.96,6
38501,All The Way Up (feat. Infared),Fat Joe;Remy Ma;French Montana;InfaRed,66,hardcore,0.536,-6.44,0.062,0.393,117.139,6
17380,This Is What You Came For,Calvin Harris;Rihanna,81,dance,0.631,-2.787,0.199,0.465,123.962,6
59768,I Lived,OneRepublic,71,piano,0.593,-5.456,0.0683,0.344,119.987,6
34989,Gone Away,Five Finger Death Punch,54,groove,0.502,-7.57,0.000377,0.3,106.047,6
33536,Crystalline,Amaranthe,51,goth,0.49,-8.826,0.0644,0.281,109.93,6
25585,Outside (feat. Ellie Goulding),Calvin Harris;Ellie Goulding,79,electro,0.646,-4.123,0.213,0.418,128.035,6
37885,Highway to Hell,AC/DC,85,hard-rock,0.574,-4.793,0.061,0.423,115.728,6


# Genre Evaluation
Based on a custom-made grouping of the genres, evaluate the performance of the model.

In [77]:
# Custom genre groupings for each possible genre in the dataset
genre_groupings = [
    ['acoustic', 'folk', 'singer-songwriter', 'bluegrass', 'country', 'honky-tonk', 'rockabilly', 'study', 'guitar', 'piano', 'new-age', 'ambient', 'sleep', 'chill'],
    ['rock', 'alt-rock', 'alternative', 'hard-rock', 'grunge', 'punk', 'punk-rock', 'emo', 'psych-rock', 'rock-n-roll', 'indie', 'indie-pop', 'power-pop', 'goth', 'industrial'],
    ['metal', 'black-metal', 'death-metal', 'heavy-metal', 'hardcore', 'metalcore', 'grindcore', 'hardstyle'],
    ['electronic', 'edm', 'dance', 'electro', 'house', 'deep-house', 'chicago-house', 'progressive-house', 'techno', 'detroit-techno', 'minimal-techno', 'trance', 'dubstep', 'drum-and-bass', 'breakbeat', 'idm', 'trip-hop', 'garage', 'club', 'party', 'synth-pop', 'disco'],
    ['hip-hop', 'r-n-b', 'soul', 'funk', 'groove', 'reggae', 'dancehall', 'dub'],
    ['jazz', 'blues', 'classical', 'opera', 'show-tunes', 'disney', 'pop-film', 'romance', 'sad', 'happy'],
    ['pop', 'k-pop', 'j-pop', 'j-dance', 'j-idol', 'j-rock', 'anime', 'cantopop', 'mandopop'],
    ['latin', 'latino', 'salsa', 'samba', 'brazil', 'mpb', 'pagode', 'forro', 'sertanejo', 'tango', 'spanish', 'afrobeat', 'reggaeton'],
    ['world-music', 'children', 'kids', 'comedy', 'french', 'german', 'indian', 'iranian', 'malay', 'turkish', 'british', 'swedish']
]

# Create genre to group dictionary
genre_groups = {}
for idx, group in enumerate(genre_groupings):
    for genre in group:
        genre_groups[genre] = idx


In [78]:
# Find group majority
group_indices = [genre_groups[genre] for genre in results.iloc[playlist].track_genre]
counts = np.bincount(group_indices)
majority_group = np.argmax(counts)

# Calculate the ratio of the recommended songs that are in the same genre group (higher is better)
total = top_k - len(playlist)
in_group = sum([1 for genre in results.iloc[indices[most_similar]][len(playlist)-1:].track_genre if genre_groups[genre] == majority_group])
print(in_group / total)

0.1
