In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import os

# Data Preprocessing

## Load Data

In [2]:
# Load the track table; index_col=0 uses the CSV's first column as the row index.
data = pd.read_csv('data/dataset.csv', index_col=0)
data.head()  # preview the first five rows

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
# Two descriptive columns (shown later in plot hover text) followed by the
# five numeric audio features that are clustered.
features = [
    'track_name',
    'track_genre',
    'danceability',
    'loudness',
    'acousticness',
    'valence',
    'tempo',
]
# Work on an independent copy so nothing downstream can modify `data`.
X = data.loc[:, features].copy()

In [4]:
# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test = train_test_split(
    X,
    random_state=42,
    test_size=0.25,
)

## Standardize Data
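Danceability, acousticness, and valence are already on a \[0, 1\] scale, but tempo and loudness are not. To put all five features on a comparable footing for distance-based clustering, we standardize each of them to zero mean and unit variance.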

In [5]:
# Z-score the five numeric features. The scaler's mean and variance are
# learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(
    X_train.loc[:, ['danceability', 'loudness', 'acousticness', 'valence', 'tempo']]
)

In [6]:
# Inspect the standardized matrix: one row per training track, one column per
# feature in the order passed to the scaler.
X_train_scaled

array([[ 0.01123353,  0.62958632, -0.79986331, -0.30136515, -0.07159454],
       [ 0.34029685,  0.91292818, -0.7264848 ,  1.63690055,  0.56523708],
       [-1.3973884 ,  0.44108859,  0.89987173, -0.98863864, -0.68824429],
       ...,
       [ 0.35761597, -0.63334851,  0.84273273,  1.56740098, -0.10644503],
       [-0.84317649, -1.37464066,  1.9043153 , -1.37474734, -0.59685783],
       [-0.01185864, -2.06652657,  1.85319093, -1.51104372, -0.23893033]])

# K-Means - Euclidean Distance

In [7]:
# Euclidean K-Means with eight clusters, k-means++ seeding and a fixed seed.
# NOTE(review): n_init is not passed, so the number of restarts is whatever the
# installed scikit-learn defaults to — consider pinning it for reproducibility.
kmeans = KMeans(
    n_clusters=8,
    init="k-means++",
    random_state=42,
)

In [8]:
# Fit on the standardized features and keep the cluster index of every training row.
clusters = kmeans.fit_predict(X_train_scaled)

In [9]:
# Build a fresh frame from the unscaled training rows (song name and genre
# included) and append each row's predicted cluster.
# The labels are stored as strings — presumably so plotly treats them as
# discrete categories when colouring; confirm if changing the dtype.
results = X_train.assign(clusters=clusters.astype(str))
results.head()

Unnamed: 0,track_name,track_genre,danceability,loudness,acousticness,valence,tempo,clusters
86480,Cansado de ser,punk,0.569,-5.09,0.0488,0.396,119.938,1
53743,I Just Called,house,0.626,-3.662,0.0732,0.898,138.997,7
103085,Someday At Christmas,soul,0.325,-6.04,0.614,0.218,101.483,4
8838,Tipsy,blues,0.772,-5.276,0.651,0.791,104.009,3
113095,Cornerstone - Live,world-music,0.391,-6.698,0.0477,0.129,142.977,5


## Plot Clusters
First, perform PCA on the standardized training data  
Then, plot the points and color them by cluster

In [10]:
# Project the standardized features onto three principal components so the
# clusters can be drawn in a 3-D scatter plot.
pca = PCA(n_components=3)
components = pca.fit_transform(X_train_scaled)

In [11]:
# Attach the three PCA coordinates to every row of the results table.
results = results.assign(
    pca1=components[:, 0],
    pca2=components[:, 1],
    pca3=components[:, 2],
)

In [12]:
# 3-D scatter of the PCA projection, one colour per Euclidean K-Means cluster;
# hovering a point shows the track's name and genre.
fig = px.scatter_3d(
    results,
    x='pca1',
    y='pca2',
    z='pca3',
    color='clusters',
    labels={
        'clusters': 'Cluster',
        'pca1': 'Principal Component 1',
        'pca2': 'Principal Component 2',
        'pca3': 'Principal Component 3',
    },
    hover_data=['track_name', 'track_genre'],
    title='Clusters Visualized with PCA (3 Components)',
)
fig.show(renderer='iframe')

## Plot Two Clusters Against Each Other

In [13]:
# Compare two of the Euclidean K-Means clusters in isolation.
# NOTE(review): the ids '1' and '2' are hard-coded; they refer to whatever
# labels the fitted model happened to assign, so re-check them if the
# clustering is re-run with different settings.
group1 = results[results['clusters'].isin(['1', '2'])]

# Axis labels match the full-cluster figure, the title says which clusters are
# shown, and the hover text lists all five features used for clustering.
fig = px.scatter_3d(
    group1,
    x='pca1',
    y='pca2',
    z='pca3',
    color='clusters',
    labels={
        'clusters': 'Cluster',
        'pca1': 'Principal Component 1',
        'pca2': 'Principal Component 2',
        'pca3': 'Principal Component 3',
    },
    hover_data=['track_name', 'track_genre', 'danceability', 'loudness', 'acousticness', 'valence', 'tempo'],
    title='Clusters 1 and 2 Visualized with PCA (3 Components)',
)
fig.show(renderer='iframe')

# K-Means - Cosine Distance
In order to use the distance metric of $1-\cos \theta$, where $\cos \theta=\frac{x\cdot y}{\lVert x \rVert\lVert y \rVert}$, observe that when $x$ and $y$ are unit vectors, we have
$$
x\cdot y=\cos \theta
$$
Therefore, the squared Euclidean distance between two unit vectors is just
$$
\begin{aligned}
\lVert x-y \rVert ^{2}&= (x-y)\cdot(x-y) \\
 & = x\cdot x-2x\cdot y+y\cdot y \\
 & = 2-2x\cdot y \\
 & =2(1-\cos\theta)
\end{aligned}
$$
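So, after normalizing the vectors to unit length, minimizing the Euclidean distance is the same as minimizing the cosine distance. (Strictly, this identity holds between unit vectors; the K-Means centroids are means of unit vectors and are generally not unit length themselves, so the clustering below approximates cosine-distance clustering.)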

In [41]:
# Rescale every row to unit length so that Euclidean K-Means on the result
# behaves like clustering by cosine distance.
row_norms = np.linalg.norm(X_train_scaled, axis=1, keepdims=True)
# A row whose norm is exactly zero has no direction; dividing it by 1 leaves it
# at the origin instead of producing NaNs, which KMeans would reject.
X_train_unit = X_train_scaled / np.where(row_norms == 0, 1.0, row_norms)
# NOTE(review): n_init is not passed, so the number of restarts follows the
# installed scikit-learn default — consider pinning it for reproducibility.
cosCluster = KMeans(n_clusters=7, init="k-means++", random_state=42)

In [42]:
# Fit on the unit-length rows and keep the cluster index of every training row.
cosClusters = cosCluster.fit_predict(X_train_unit)

In [43]:
# Build a fresh frame from the unscaled training rows (song name and genre
# included) and append each row's cosine-based cluster, stored as a string.
resultsCos = X_train.assign(clusters=cosClusters.astype(str))
resultsCos.head()

Unnamed: 0,track_name,track_genre,danceability,loudness,acousticness,valence,tempo,clusters
86480,Cansado de ser,punk,0.569,-5.09,0.0488,0.396,119.938,2
53743,I Just Called,house,0.626,-3.662,0.0732,0.898,138.997,1
103085,Someday At Christmas,soul,0.325,-6.04,0.614,0.218,101.483,5
8838,Tipsy,blues,0.772,-5.276,0.651,0.791,104.009,0
113095,Cornerstone - Live,world-music,0.391,-6.698,0.0477,0.129,142.977,6


## Visualize Cosine Distance Clustering

In [44]:
# Project the unit-length rows onto three principal components for plotting.
# NOTE(review): this rebinds `pca` and `components`, which the Euclidean
# section also assigns; re-running that section's column-assignment cell after
# this one would pick up these values instead. Distinct names would avoid that.
pca = PCA(n_components=3)
components = pca.fit_transform(X_train_unit)

In [45]:
# Attach the three PCA coordinates to every row of the cosine results table.
resultsCos = resultsCos.assign(
    pca1=components[:, 0],
    pca2=components[:, 1],
    pca3=components[:, 2],
)

In [46]:
# 3-D scatter of the PCA projection of the unit-length data, one colour per
# cosine-distance cluster; hovering a point shows the track's name and genre.
# The title names the distance metric so this figure cannot be mistaken for
# the Euclidean one, which otherwise has the same layout.
fig = px.scatter_3d(
    resultsCos,
    x='pca1',
    y='pca2',
    z='pca3',
    color='clusters',
    labels={
        'clusters': 'Cluster',
        'pca1': 'Principal Component 1',
        'pca2': 'Principal Component 2',
        'pca3': 'Principal Component 3',
    },
    hover_data=['track_name', 'track_genre'],
    title='Cosine-Distance Clusters Visualized with PCA (3 Components)',
)
fig.show(renderer='iframe')