In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import os

# Data Preprocessing

In [31]:
data = pd.read_csv('data/dataset.csv', index_col=0)
data.drop_duplicates(subset='track_id')
features = ['track_name', 'artists', 'popularity', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo']
X = data[features].copy()
X.drop_duplicates(subset=['track_name', 'artists'], keep='first', inplace=True)
X.reset_index(drop=True, inplace=True)  # fix indices after removing duplicates

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['danceability', 'loudness', 'acousticness', 'valence', 'tempo']])

model = KMeans(n_clusters=9, init="k-means++", random_state=42)

In [32]:
clusters = model.fit_predict(X_scaled)
results = X.copy()
results['cluster'] = clusters.astype(str)

In [33]:
results[results['cluster'] == '2'].track_genre.value_counts()

track_genre
grindcore        622
death-metal      478
black-metal      414
heavy-metal      369
metalcore        267
                ... 
latino             3
comedy             3
reggae             3
reggaeton          3
chicago-house      1
Name: count, Length: 113, dtype: int64

# Create Song Search

In [34]:
title = 'up town funk'
artist = ''
X[X['track_name'].str.contains(title, na=False, case=False) & X['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo


# Create Playlist Song Recommendation

In [35]:
most_similar = np.argpartition(np.mean((X_scaled - X_scaled[50201].T)**2, axis=1), kth=4)[:4]
results.iloc[most_similar]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,0
17307,Take You Dancing,Jason Derulo,78,dance,0.789,-4.248,0.0332,0.753,112.985,0
8889,Back to Flow,Krafty Kuts,13,breakbeat,0.773,-3.973,0.0129,0.734,110.027,0
4125,Always,ENHYPEN,59,anime,0.804,-3.836,0.0466,0.73,110.014,0


In [41]:
# Create a playlist based on the index of the songs
playlist = [50201]# [3884, 48383, 48625, 50201]

# Find groups each correspond to
songs = results.iloc[playlist]
clusters = model.predict(X_scaled[playlist])
majority = np.argmax(np.bincount(clusters))

In [42]:
# For each song in majority cluster, pick the songs that are cloest to the songs in the playlist
song_score = []
indices = results[results['cluster'] == str(majority)].index
N = len(playlist)
for song in indices:
    score = float('inf')
    for liked_song in playlist:
        score = min(score, np.mean((X_scaled[song] - X_scaled[liked_song])**2))
    song_score.append(score)

In [43]:
results.iloc[playlist]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,0


In [44]:
top_k = len(playlist) + 3
most_similar = np.argpartition(song_score, top_k)[:top_k]
results.iloc[indices[most_similar]]

Unnamed: 0,track_name,artists,popularity,track_genre,danceability,loudness,acousticness,valence,tempo,cluster
50201,Blue Flame,LE SSERAFIM,75,k-pop,0.781,-3.571,0.0281,0.765,112.045,0
17307,Take You Dancing,Jason Derulo,78,dance,0.789,-4.248,0.0332,0.753,112.985,0
8889,Back to Flow,Krafty Kuts,13,breakbeat,0.773,-3.973,0.0129,0.734,110.027,0
4125,Always,ENHYPEN,59,anime,0.804,-3.836,0.0466,0.73,110.014,0


# Genre Evaluation
Based on a custom-made grouping of the genres, evaluate the performance of the model.

In [45]:
# Custom genre groupings for each possible genre in the dataset
genre_groupings = [
    ['acoustic', 'folk', 'singer-songwriter', 'bluegrass', 'country', 'honky-tonk', 'rockabilly', 'study', 'guitar', 'piano', 'new-age', 'ambient', 'sleep', 'chill'],
    ['rock', 'alt-rock', 'alternative', 'hard-rock', 'grunge', 'punk', 'punk-rock', 'emo', 'psych-rock', 'rock-n-roll', 'indie', 'indie-pop', 'power-pop', 'goth', 'industrial'],
    ['metal', 'black-metal', 'death-metal', 'heavy-metal', 'hardcore', 'metalcore', 'grindcore', 'hardstyle'],
    ['electronic', 'edm', 'dance', 'electro', 'house', 'deep-house', 'chicago-house', 'progressive-house', 'techno', 'detroit-techno', 'minimal-techno', 'trance', 'dubstep', 'drum-and-bass', 'breakbeat', 'idm', 'trip-hop', 'garage', 'club', 'party', 'synth-pop', 'disco'],
    ['hip-hop', 'r-n-b', 'soul', 'funk', 'groove', 'reggae', 'dancehall', 'dub'],
    ['jazz', 'blues', 'classical', 'opera', 'show-tunes', 'disney', 'pop-film', 'romance', 'sad', 'happy'],
    ['pop', 'k-pop', 'j-pop', 'j-dance', 'j-idol', 'j-rock', 'cantopop', 'mandopop', 'british', 'swedish'],
    ['latin', 'latino', 'salsa', 'samba', 'brazil', 'mpb', 'pagode', 'forro', 'sertanejo', 'tango', 'spanish', 'afrobeat', 'reggaeton'],
    ['world-music', 'anime', 'children', 'kids', 'comedy', 'french', 'german', 'indian', 'iranian', 'malay', 'turkish']
]

# Create genre to group dictionary
genre_groups = {}
for idx, group in enumerate(genre_groupings):
    for genre in group:
        genre_groups[genre] = idx


In [46]:
# Find group majority
group_indices = [genre_groups[genre] for genre in results.iloc[playlist].track_genre]
counts = np.bincount(group_indices)
majority_group = np.argmax(counts)

# Calculate the ratio of the recommended songs that are in the same genre group (higher is better)
total = top_k - len(playlist)
in_group = sum([1 for genre in results.iloc[indices[most_similar]][len(playlist)-1:].track_genre if genre_groups[genre] == majority_group])
print(in_group / total)

0.3333333333333333
