In [4]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import correlation
from tqdm.notebook import tqdm
import pickle
import os

# Data Preprocessing

In [5]:
data = pd.read_csv('..frontend/data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id')
features = ['track_name', 'artists', 'popularity', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['popularity', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']])
cos_scaled = X_scaled / np.linalg.norm(X_scaled, axis=1)[:, np.newaxis]

euc_model = KMeans(n_clusters=9, init="k-means++", random_state=42)
cos_model = KMeans(n_clusters=9, init="k-means++", random_state=42)

In [8]:
with open('models/eucd.pkl', 'rb') as file:
    euc_model = pickle.load(file)
with open('models/cos.pkl', 'rb') as file:
    cos_model = pickle.load(file)

clusters = euc_model.predict(X_scaled)
results = X.copy()
results['cluster'] = clusters.astype(str)

# cos_model.predict(cos_scaled)
# results['cluster'] = clusters.astype(str)

"""
with open('eucd.pkl', 'wb') as handle:
    pickle.dump(euc_model, handle)
with open('cos.pkl', 'wb') as handle:
    pickle.dump(cos_model, handle)
"""


"\nwith open('eucd.pkl', 'wb') as handle:\n    pickle.dump(euc_model, handle)\nwith open('cos.pkl', 'wb') as handle:\n    pickle.dump(cos_model, handle)\n"

In [9]:
results[results['cluster'] == '2'].track_genre.value_counts()

track_genre
salsa          482
forro          380
dancehall      355
children       341
sertanejo      308
              ... 
heavy-metal      5
metalcore        4
ambient          4
romance          2
black-metal      1
Name: count, Length: 108, dtype: int64

# Create Song Search

In [12]:
title = 'Hello'
artist = 'Adele'
X[X['track_name'].str.contains(title, na=False, case=False) & X['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo
9357,Hello,Adele,74,british,0.578,-6.134,0.33,0.288,78.991


In [11]:
data[data['track_name'].str.contains(title, na=False, case=False) & data['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
2003,2QjOHCTQ1Jl3zawyYOpxh6,The Neighbourhood,I Love You.,Sweater Weather,93,240400,False,0.612,0.807,10,-2.81,1,0.0336,0.0495,0.0177,0.101,0.398,124.053,4,alt-rock
2804,5YCPqMoXNlPeUJ9fm1dlz3,The Neighbourhood;Young Saab,Sweater Weather (Young Saab Remix),Sweater Weather - Young Saab Remix,61,267270,False,0.583,0.954,7,-4.179,0,0.0788,0.000553,0.243,0.414,0.421,124.029,4,alt-rock
3003,2QjOHCTQ1Jl3zawyYOpxh6,The Neighbourhood,I Love You.,Sweater Weather,93,240400,False,0.612,0.807,10,-2.81,1,0.0336,0.0495,0.0177,0.101,0.398,124.053,4,alternative
15743,3NKUFgqI2sMrtIRGx5IHwx,untrusted;creamy;11:11 Music Group,sweater weather / i wanna be your girlfriend,sweater weather / i wanna be your girlfriend,57,235161,False,0.655,0.538,10,-7.991,1,0.0378,0.299,0.393,0.0762,0.329,124.045,4,chill
15774,2Wp8cVfQ9g9Fl1e6MDxXSG,89ine,Sweater Weather (Lofi Remix),Sweater Weather (Lofi Remix),45,215872,False,0.603,0.296,0,-16.694,0,0.0424,0.103,0.235,0.106,0.076,96.018,4,chill
32223,6siDDghVE7fxel5o7fE4OF,Gaullin;Julian Perretta,Sweater Weather,Sweater Weather,63,171133,False,0.57,0.873,5,-5.188,0,0.07,0.00853,0.0,0.365,0.236,125.086,4,electronic
36857,3ezoEY4TKU8frUXAYFJstC,Remzcore,Sweater Weather,Sweater Weather,61,230389,False,0.507,0.985,1,-0.181,0,0.192,0.0467,0.000394,0.107,0.23,95.0,4,french
48508,3ezoEY4TKU8frUXAYFJstC,Remzcore,Sweater Weather,Sweater Weather,61,230389,False,0.507,0.985,1,-0.181,0,0.192,0.0467,0.000394,0.107,0.23,95.0,4,hardcore
55681,0BXzHqVTlb88KmbgM56H70,Swattrex;Lofi By Swattrex,Sweater Weather,Sweater Weather,0,305117,False,0.464,0.397,1,-14.002,1,0.0468,0.208,0.865,0.112,0.0541,96.035,4,indian
81853,2QjOHCTQ1Jl3zawyYOpxh6,The Neighbourhood,I Love You.,Sweater Weather,93,240400,False,0.612,0.807,10,-2.81,1,0.0336,0.0495,0.0177,0.101,0.398,124.053,4,pop


# Create Playlist Song Recommendation

In [18]:
most_similar = np.argpartition(np.mean((X_scaled - X_scaled[9357].T)**2, axis=1), kth=7)[:7]
results.iloc[most_similar]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
9390,Chasing Pavements,Adele,74,british,0.616,-6.092,0.291,0.325,80.03,6
9357,Hello,Adele,74,british,0.578,-6.134,0.33,0.288,78.991,6
16793,The Bones,Maren Morris,75,country,0.612,-6.642,0.278,0.334,77.004,6
16766,The Bones - with Hozier,Maren Morris;Hozier,71,country,0.561,-6.0,0.286,0.355,76.826,6
25071,Burn,Ellie Goulding,71,edm,0.559,-5.031,0.31,0.329,87.016,6
28021,You're the Inspiration - 2006 Remaster,Chicago,73,folk,0.583,-6.364,0.368,0.388,74.008,6
61452,Tera Yaar Hoon Main,Rochak Kohli;Arijit Singh,66,pop,0.559,-5.525,0.34,0.309,83.978,6


In [14]:
# Create a playlist based on the index of the songs
playlist = [9357] # [3884, 48383, 48625, 50201]

# Find groups each correspond to
songs = results.iloc[playlist]
clusters = euc_model.predict(X_scaled[playlist])
majority = np.argmax(np.bincount(clusters))

## Find the nearest song based on the correlation distance between the two songs
Intuitively, this is because similar songs won't have exactly the same absolute values between the values of their attributes, but instead they should have similar relative values between their features (i.e. similar feels and vibes). Therefore, it makes sense to instead consider their correlation distance with each other.

In [15]:
# For each song in majority cluster, pick the songs that are cloest to the songs in the playlist
song_score = []
indices = results[results['cluster'] == str(majority)].index
N = len(playlist)
for song in indices:
    score = float('inf')
    for liked_song in playlist:
        score = min(score, correlation(X_scaled[song], X_scaled[liked_song])) # min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
    song_score.append(score)

In [16]:
results.iloc[playlist]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
9357,Hello,Adele,74,british,0.578,-6.134,0.33,0.288,78.991,6


In [17]:
top_k = len(playlist) + 10
most_similar = np.argsort(song_score)[len(playlist):len(playlist) + 10]# np.argpartition(song_score, top_k)[:top_k]
results.iloc[indices[most_similar]]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
61309,Mera Dil Bhi Kitna Pagal Hai - Recreated Version,Stebin Ben,70,pop,0.601,-5.987,0.333,0.375,94.017,6
9390,Chasing Pavements,Adele,74,british,0.616,-6.092,0.291,0.325,80.03,6
27030,ghost girl,Lil Peep,66,emo,0.52,-7.069,0.281,0.317,80.035,6
61452,Tera Yaar Hoon Main,Rochak Kohli;Arijit Singh,66,pop,0.559,-5.525,0.34,0.309,83.978,6
15,Falling in Love at a Coffee Shop,Landon Pigg,58,acoustic,0.489,-7.933,0.2,0.238,83.457,6
61463,Temporary pyar,Kaka,65,pop,0.531,-7.213,0.281,0.339,79.942,6
25071,Burn,Ellie Goulding,71,edm,0.559,-5.031,0.31,0.329,87.016,6
16793,The Bones,Maren Morris,75,country,0.612,-6.642,0.278,0.334,77.004,6
41342,Sohne Lagde,Sidhu Moose Wala;The PropheC,64,hip-hop,0.589,-5.651,0.354,0.327,84.384,6
16766,The Bones - with Hozier,Maren Morris;Hozier,71,country,0.561,-6.0,0.286,0.355,76.826,6


# Genre Evaluation
Based on a custom-made grouping of the genres, evaluate the performance of the model.

In [77]:
# Custom genre groupings for each possible genre in the dataset
genre_groupings = [
    ['acoustic', 'folk', 'singer-songwriter', 'bluegrass', 'country', 'honky-tonk', 'rockabilly', 'study', 'guitar', 'piano', 'new-age', 'ambient', 'sleep', 'chill'],
    ['rock', 'alt-rock', 'alternative', 'hard-rock', 'grunge', 'punk', 'punk-rock', 'emo', 'psych-rock', 'rock-n-roll', 'indie', 'indie-pop', 'power-pop', 'goth', 'industrial'],
    ['metal', 'black-metal', 'death-metal', 'heavy-metal', 'hardcore', 'metalcore', 'grindcore', 'hardstyle'],
    ['electronic', 'edm', 'dance', 'electro', 'house', 'deep-house', 'chicago-house', 'progressive-house', 'techno', 'detroit-techno', 'minimal-techno', 'trance', 'dubstep', 'drum-and-bass', 'breakbeat', 'idm', 'trip-hop', 'garage', 'club', 'party', 'synth-pop', 'disco'],
    ['hip-hop', 'r-n-b', 'soul', 'funk', 'groove', 'reggae', 'dancehall', 'dub'],
    ['jazz', 'blues', 'classical', 'opera', 'show-tunes', 'disney', 'pop-film', 'romance', 'sad', 'happy'],
    ['pop', 'k-pop', 'j-pop', 'j-dance', 'j-idol', 'j-rock', 'anime', 'cantopop', 'mandopop'],
    ['latin', 'latino', 'salsa', 'samba', 'brazil', 'mpb', 'pagode', 'forro', 'sertanejo', 'tango', 'spanish', 'afrobeat', 'reggaeton'],
    ['world-music', 'children', 'kids', 'comedy', 'french', 'german', 'indian', 'iranian', 'malay', 'turkish', 'british', 'swedish']
]

# Create genre to group dictionary
genre_groups = {}
for idx, group in enumerate(genre_groupings):
    for genre in group:
        genre_groups[genre] = idx


In [78]:
# Find group majority
group_indices = [genre_groups[genre] for genre in results.iloc[playlist].track_genre]
counts = np.bincount(group_indices)
majority_group = np.argmax(counts)

# Calculate the ratio of the recommended songs that are in the same genre group (higher is better)
total = top_k - len(playlist)
in_group = sum([1 for genre in results.iloc[indices[most_similar]][len(playlist)-1:].track_genre if genre_groups[genre] == majority_group])
print(in_group / total)

0.1
