In [None]:
import pandas as pd 
import numpy as np

Let's do some basic data cleaning! First let's drop some duplicate tracks.

In [None]:
# Read the Spotify features export, then show every row recorded for one
# example track id.
spotify_df = pd.read_csv('SpotifyFeatures.csv')
spotify_df.loc[spotify_df['track_id'].eq("6iOvnACn4ChlAw4lWUU4dd")]

For this id, we can see that when a song appears across multiple genres, it shows up as a duplicate entry. For simplicity, I'm going to drop rows with a duplicate track ID and keep only the first occurrence of each.

In [None]:
# Keep only the first row encountered for each track_id, then re-display the
# rows for the example track id.
spotify_df = spotify_df.loc[~spotify_df.duplicated(subset=["track_id"], keep="first")]
spotify_df.loc[spotify_df['track_id'].eq("6iOvnACn4ChlAw4lWUU4dd")]

In [None]:
import seaborn as sns

# Distribution of the popularity column across all remaining tracks
sns.histplot(data=spotify_df, x="popularity")

In this histogram, we can see that the dataset contains a lot of (no judgement!) low-popularity songs. Let's add a baseline popularity to analyze more relevant songs.

In [None]:
# Same histogram, restricted to tracks whose popularity is above 10
sns.histplot(data=spotify_df.query("popularity > 10"), x="popularity")

In [None]:
# Apply the popularity baseline: keep only tracks with popularity above 10
spotify_df = spotify_df.loc[spotify_df["popularity"].gt(10)]

I want to store track metadata for retrieval in the future, and eventually the results of our cluster analysis. Let's store the data in our DuckDB instance and stand up some initial tables.

In [None]:
from setup_db import DB_NAME
import duckdb

# Create the DuckDB schema: `track` (a copy of spotify_df keyed by track_id),
# `cluster`, and the `track_to_cluster` link table between them.
with duckdb.connect(database=DB_NAME) as con:
    # A plain CREATE TABLE raises if the table already exists, which makes this
    # cell fail on any re-run against a database that kept the earlier tables.
    # Drop leftovers first -- the referencing table before the tables it points
    # at -- so the cell can be run repeatedly.
    # NOTE: this discards any rows previously stored in these three tables.
    for stale_table in ("track_to_cluster", "cluster", "track"):
        con.execute(f"DROP TABLE IF EXISTS {stale_table}")

    # `spotify_df` in the query refers to the in-scope pandas DataFrame.
    con.execute("CREATE TABLE track AS SELECT * FROM spotify_df")
    con.execute("ALTER TABLE track ADD PRIMARY KEY (track_id)")
    con.execute("CREATE TABLE cluster (id INTEGER PRIMARY KEY, x FLOAT, y FLOAT, size FLOAT)")
    con.execute(
        """
        CREATE TABLE track_to_cluster 
        (track_id VARCHAR, cluster_id INTEGER, 
        FOREIGN KEY (track_id) REFERENCES track (track_id),
        FOREIGN KEY (cluster_id) REFERENCES cluster (id))
        """
        )


Great! Now we can start some analysis to store in our db. We have a few categorical variables: genre and key, which have many possible values, and mode, which we could treat as a boolean. Because of this mix of data types, I will first calculate a Gower distance matrix to measure similarity between tracks. At this point we'll also define our train & test split.

In [None]:
import gower

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# Shuffle with a fixed seed so that, on a fresh run of the notebook, the
# 10,000-row sample taken below is the same every time (the split further down
# was already seeded; the shuffle was not).
spotify_df = shuffle(spotify_df, random_state=42)
# use our track id as index for convenience & drop metadata
df = spotify_df.set_index('track_id').drop(columns=['artist_name','track_name']).head(10000)

# Pairwise Gower distances between the sampled tracks
dist_matrix = gower.gower_matrix(df)

# Split row positions 80/20, then slice out the square sub-matrix of pairwise
# distances for each side of the split.
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(indices, train_size=0.8, random_state=42)
dist_matrix_train = dist_matrix[np.ix_(train_idx, train_idx)]
dist_matrix_test = dist_matrix[np.ix_(test_idx, test_idx)]


With our distance matrix computed, we can do a basic parameter sweep with sklearn's `AgglomerativeClustering` estimator. I chose it because it accepts a precomputed, non-Euclidean distance matrix as its metric.

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Sweep the number of clusters and record the silhouette score for each value.
cluster_scores = {}
for n in range(2, 15):
    cluster = AgglomerativeClustering(metric="precomputed", n_clusters=n, linkage="average")
    # AgglomerativeClustering has no separate predict step: fit_predict refits
    # the estimator on whatever matrix it is given. A preceding
    # fit(dist_matrix_train) would therefore be thrown away, so the held-out
    # matrix is clustered directly and scored.
    pred_labels = cluster.fit_predict(dist_matrix_test)
    cluster_scores[n] = silhouette_score(dist_matrix_test, pred_labels, metric="precomputed")

# Cluster count with the highest silhouette score
best_cluster = max(cluster_scores, key=cluster_scores.get)



In [None]:
# Cluster all sampled tracks with the best-scoring cluster count and attach the
# labels to df. fit_predict refits on the matrix it receives, so a prior fit on
# the training sub-matrix would be discarded; the full matrix is fitted once.
cluster = AgglomerativeClustering(n_clusters=best_cluster, metric="precomputed", linkage="average")
df["cluster_label"] = cluster.fit_predict(dist_matrix)

In [None]:
from sklearn.manifold import TSNE

# Project the precomputed distance matrix to 2-D with t-SNE and colour each
# point by its agglomerative cluster label.
t_sne = TSNE(
    n_components=2,
    metric='precomputed',
    init='random',
    random_state=42,
)
xy = t_sne.fit_transform(dist_matrix)
sns.scatterplot(
    x=xy[:, 0],
    y=xy[:, 1],
    hue=df['cluster_label'],
)

Our Gower distance calculation resulted in some overlapping clusters - let's try a different method to find clusters of similar songs! This time I'll use k-means clustering and encode the categorical variables as integers.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Rebuild the working frame from spotify_df (track_id as index, name columns
# removed, first 10,000 rows) and collect the names of its non-numeric columns.
df: pd.DataFrame = (
    spotify_df
    .set_index('track_id')
    .drop(columns=['artist_name', 'track_name'])
    .head(10000)
)
categorical_df = df.select_dtypes(exclude=['number']).columns
def encode(column: pd.Series):
    """Replace each distinct value of ``column`` with an integer code.

    Codes start at 0 and follow the order in which ``column.unique()`` lists
    the values. The result is a new Series on the same index as the input.
    """
    distinct = column.unique()
    code_for_value = dict(zip(distinct, range(len(distinct))))
    return column.map(code_for_value)

# Integer-encode the non-numeric columns (numeric columns pass through
# unchanged), then standardize every column.
df = df.apply(lambda x: encode(x) if x.name in categorical_df else x)
scaler = StandardScaler()
df_znorm = scaler.fit_transform(df)

# Elbow sweep: record the k-means inertia for each candidate cluster count.
# The results get their own name instead of `best_cluster`, which the
# agglomerative cells use for an integer cluster count; overwriting it with a
# dict would break those cells if they were run again.
inertia_by_k = {}
for n in range(2, 20):
    kmeans = KMeans(n_clusters=n, random_state=42)
    inertia_by_k[n] = kmeans.fit(df_znorm).inertia_

sns.pointplot(x=list(inertia_by_k.keys()), y=list(inertia_by_k.values()))


In [None]:
# Fit the final k-means model with the chosen number of clusters, attach each
# row's label to df, and keep the fitted cluster centres.
best_cluster_n = 12
kmeans = KMeans(n_clusters=best_cluster_n, random_state=42)
df["cluster_labels"] = kmeans.fit_predict(df_znorm)
centroids = kmeans.cluster_centers_



In [None]:
from sklearn.manifold import TSNE
# Embed the standardized rows and the k-means centroids together in a single
# 2-D t-SNE run, then separate the two groups again by row position.
t_sne = TSNE(random_state=42, n_components=2)
xy_centroids = t_sne.fit_transform(np.vstack((df_znorm, centroids)))

# Rows come first in the stacked input, centroids after them
xy, centroid_coords = np.split(xy_centroids, [len(df_znorm)])

sns.scatterplot(
    x=xy[:, 0],
    y=xy[:, 1],
    hue=df['cluster_labels'],
    palette='plasma',
)

In [None]:
from cluster_grid import generate_cluster_positions_df

# Build the per-cluster layout table, rename its columns to match the `cluster`
# table, and insert the rows into DuckDB.
cluster_info = generate_cluster_positions_df(
    df['cluster_labels'],
    n_clusters=best_cluster_n,
    layout_type='circular',
).rename(
    columns={
        'cluster_id': 'id',
        'x_position': 'x',
        'y_position': 'y',
        'cluster_size': 'size',
    }
)
with duckdb.connect(database=DB_NAME) as con:
    # `cluster_info` in the query refers to the DataFrame built above
    con.execute("INSERT INTO cluster BY NAME SELECT * FROM cluster_info")

cluster_info

In [None]:
# Two-column frame (track_id, cluster_id) taken from df's index and its k-means
# labels, inserted into the track_to_cluster link table.
trackdf = df['cluster_labels'].rename('cluster_id').reset_index()
with duckdb.connect(database=DB_NAME) as con:
    # `trackdf` in the query refers to the DataFrame built above
    con.execute("INSERT INTO track_to_cluster BY NAME SELECT * FROM trackdf")

trackdf


In [None]:
# Preview the first rows of the standardized feature matrix instead of
# displaying the whole array
df_znorm[:5]