In [None]:
# Weight applied to the one-hot vocal presence / ensemble type columns. Energy, intimacy and
# warmth are not weighted by this value; they are standard-scaled instead
VOC_ARTIST_STYLE_WEIGHT = 0.7
MOOD_WEIGHT = 0.4               # Weight applied to the one-hot mood columns
KMEANS_RANDOM_STATE = 43        # K-Means random state seed. Fixing this means consistent results between runs
# Upper bound on K for the elbow/silhouette sweep. It is used as the (exclusive) end of
# range(2, MAXIMUM_KMEANS_CLUSTERS), so the largest K actually tried is this value minus 1
MAXIMUM_KMEANS_CLUSTERS = 13
TOP_MOOD_COUNT = 8              # Maximum number of moods to be counted as "top" moods
EXAMPLE_ARTIST_COUNT = 3        # Maximum number of artists to use as examples for a cluster

In [None]:
%run database.ipynb
%run pathutils.ipynb
%run export.ipynb

In [None]:
# Open a connection and load the artist and artist/mood data frames through the loader helpers
# (connect/load_* are not defined in this notebook; presumably they come from database.ipynb)
connection = connect()
artists_df, artist_moods_df = load_artists(connection), load_artist_moods(connection)

# Render both frames so the loaded data can be inspected
display(artists_df, artist_moods_df)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# --- Moods -------------------------------------------------------------------------------
# Pivot the artist/mood rows into a 0/1 matrix: one row per Artist_Id, one column per
# Mood_Name. Every source row is flagged with 1; aggfunc="max" collapses duplicates and
# fill_value=0 marks artist/mood combinations that do not occur
mood_flags_df = artist_moods_df.assign(val=1)
artist_moods_matrix_df = mood_flags_df.pivot_table(
    index="Artist_Id",
    columns="Mood_Name",
    values="val",
    aggfunc="max",
    fill_value=0,
)
mood_columns = list(artist_moods_matrix_df.columns)

# Left-join the mood flags onto the artists. Artists with no mood rows come through as NaN
# in the mood columns, so those are replaced with 0
df = artists_df.merge(artist_moods_matrix_df, how="left", on="Artist_Id")
if mood_columns:
    df[mood_columns] = df[mood_columns].fillna(0)

# --- Categories and styles ---------------------------------------------------------------
# One-hot encode vocal presence and ensemble type; the resulting columns are named
# "<prefix>_<value>", e.g. Ensemble_Trio
artist_categories_matrix_df = pd.get_dummies(
    df[["VocalPresence", "EnsembleType"]],
    prefix=["Vocals", "Ensemble"],
)
category_columns = list(artist_categories_matrix_df.columns)

# Energy, intimacy and warmth as floating point columns
style_columns = ["Energy", "Intimacy", "Warmth"]
artist_styles_matrix_df = df[style_columns].astype(float)

# --- Combined feature matrix -------------------------------------------------------------
# Column order is styles, then categories, then moods. When there are no moods an empty
# frame on the same index stands in so the concat still lines up
artist_mood_features_df = df[mood_columns] if mood_columns else pd.DataFrame(index=df.index)
artist_vibe_df = pd.concat(
    [artist_styles_matrix_df, artist_categories_matrix_df, artist_mood_features_df],
    axis=1,
)

# Standard-scale the style columns only, so Energy/Intimacy/Warmth become comparable
scaler = StandardScaler()
artist_vibe_scaled_df = artist_vibe_df.copy()
artist_vibe_scaled_df[style_columns] = scaler.fit_transform(artist_vibe_scaled_df[style_columns])

# Weight the one-hot category columns and then the mood columns
for weighted_columns, weight in (
    (category_columns, VOC_ARTIST_STYLE_WEIGHT),
    (mood_columns, MOOD_WEIGHT),
):
    if weighted_columns:
        artist_vibe_scaled_df[weighted_columns] = artist_vibe_scaled_df[weighted_columns] * weight

display(df)
print("Rows:", len(df), "Features:", artist_vibe_scaled_df.shape[1], "Moods:", len(mood_columns))

# Simple Catalogue Style Map

In [None]:
import matplotlib.pyplot as plt

# Scatter every artist by energy (x) and intimacy (y), coloured by warmth
style_fig, style_ax = plt.subplots(figsize=(12, 5))
style_points = style_ax.scatter(df["Energy"], df["Intimacy"], c=df["Warmth"])
style_ax.set_xlabel("Energy (0–5)")
style_ax.set_ylabel("Intimacy (0–5)")
style_ax.set_title("Style Map (Colour = Warmth)")
style_fig.colorbar(style_points, ax=style_ax, label="Warmth (0–5)")

# Export the chart before showing it (export_chart is defined outside this cell)
export_chart("catalogue-style-coloured-by-warmth", "", "png")

plt.show()

# K-Means

Each artist is treated as a point in multi-dimensional space where the dimensions (features) are the style parameters, vocal presence, ensemble type and the moods. K-Means:

- Places k ‘centres of gravity’ randomly in the artist feature space
- Calculates the distance between each artist and all of the centres of gravity
- Assigns each artist to the centre of gravity closest to them
- Moves each centre of gravity to the average position of all the artists in it

These assign-and-move steps are repeated internally a number of times. At the end of each iteration, the total _inertia_ - the sum of squared distances from every artist to its assigned centre - is calculated (this is a measure of how tight the clusters are overall).

The number of iterations is deemed sufficient when one of the following is true:

- The centres of gravity stop moving, within a tolerance limit
- A maximum number of iterations is reached

The "artist_clusters" returned by the K-Means fit is a 1D array of integers, one per artist, in which the value indicates the 0-based index of the cluster the artist was assigned to.

# Silhouette Score

For each artist, the silhouette calculation does the following:

- Measures the distance to the other artists in the same cluster
- Averages the result to give an _in-cluster_ distance, _a_
- For every other cluster, measures the average distance to the artists in that cluster 
- Takes the _smallest_ of those distances, _b_
- Calculates a score:

$$
\text{silhouette} = \frac{b - a}{\max(a, b)}
$$

- The average score over all artists is calculated to give the silhouette score for the current number of clusters, _K_

For a given _K_, silhouette scoring checks each artist’s “fit in its current cluster vs temptation to join another”, then averages that confidence across the whole catalogue.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Raw input for K-Means: the scaled/weighted feature frame as a 2D array with one row per
# artist and one column per feature
per_artist_values = artist_vibe_scaled_df.to_numpy()

# Per-K diagnostics, filled in the same order as the K values are tried
inertia, silhouette = [], []

# Try every K from 2 up to (but not including) MAXIMUM_KMEANS_CLUSTERS
for number_of_clusters in range(2, MAXIMUM_KMEANS_CLUSTERS):
    # Fit K-Means for this K and record the inertia of the fitted model
    km = KMeans(n_clusters=number_of_clusters, random_state=KMEANS_RANDOM_STATE, n_init="auto")
    km.fit(per_artist_values)
    artist_clusters = km.labels_
    inertia.append(km.inertia_)

    # Record the silhouette score for the cluster assignments at this K
    silhouette.append(silhouette_score(per_artist_values, artist_clusters))

# Elbow Plot

Plotting inertia versus the number of clusters usually results in a curve with these characteristics:

- Steep initial decline in inertia as "k" increases
- A relatively sharp "elbow" after which inertia doesn't decrease much

The elbow marks the point of diminishing returns

In [None]:
import matplotlib.pyplot as plt

# Inertia against K, using the same K values as the sweep
elbow_k_values = list(range(2, MAXIMUM_KMEANS_CLUSTERS))
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 5))
elbow_ax.plot(elbow_k_values, inertia, marker="o")
elbow_ax.set_xlabel("K")
elbow_ax.set_ylabel("Inertia")
elbow_ax.set_title("Elbow plot")

# Export the chart before showing it (export_chart is defined outside this cell)
export_chart("catalogue-clustering-elbow-plot", "", "png")

plt.show()

# Silhouette Score Chart

In [None]:
import matplotlib.pyplot as plt

# Silhouette score against K, using the same K values as the sweep
silhouette_k_values = list(range(2, MAXIMUM_KMEANS_CLUSTERS))
silhouette_fig, silhouette_ax = plt.subplots(figsize=(12, 5))
silhouette_ax.plot(silhouette_k_values, silhouette, marker="o")
silhouette_ax.set_xlabel("K")
silhouette_ax.set_ylabel("Silhouette Score")
silhouette_ax.set_title("Silhouette Score vs K")

# Export the chart before showing it (export_chart is defined outside this cell)
export_chart("catalogue-clustering-silhouette-score", "", "png")

plt.show()

# Knee Detection

Automatically detect the knee in the elbow plot, to identify the point of diminishing returns. The silhouette chart can be used to confirm clear separation at the auto-detected value (and that value can be overridden if need be).

In [None]:
from kneed import KneeLocator

# Locate the knee of the (convex, decreasing) inertia curve over the K values used in the sweep
candidate_k_values = list(range(2, MAXIMUM_KMEANS_CLUSTERS))
kl = KneeLocator(candidate_k_values, inertia, curve="convex", direction="decreasing")

if kl.knee is not None:
    # Plain int so K can be passed straight to KMeans and formatted in chart titles
    K = int(kl.knee)
    print(f"Auto-detected K = {K}")
else:
    # KneeLocator reports None when the curve has no detectable knee. Leaving K as None would
    # make the later KMeans(n_clusters=K) call fail, so fall back to the K with the highest
    # silhouette score from the sweep (override K manually below if a different value is wanted)
    K, _ = max(zip(candidate_k_values, silhouette), key=lambda pair: pair[1])
    print(f"No knee detected in the elbow plot; falling back to best silhouette K = {K}")

In [None]:
def bucket(x, lo, hi, labels):
    """Map a numeric value onto one of three labels using two thresholds.

    Args:
        x: The value to classify.
        lo: Values strictly below this get the first label.
        hi: Values from ``lo`` up to, but not including, this get the second label.
        labels: Indexable of labels in (low, mid, high) order.

    Returns:
        ``labels[0]`` when ``x < lo``, ``labels[1]`` when ``x < hi``, otherwise ``labels[2]``.
    """
    if x < lo:
        tier = 0
    elif x < hi:
        tier = 1
    else:
        tier = 2
    return labels[tier]

# Cluster Calculation

The next cell calculates the clustering using K-Means and the optimal value for K, then builds a summary data frame containing the properties of each cluster.

In [None]:
def _most_common(values):
    """Return the most frequent non-null value in a Series, or None if it has no non-null values."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else None


# Determine the cluster for each artist using K-Means and the chosen value of K, and add it to
# the artists data frame. The seed is the shared KMEANS_RANDOM_STATE (not a separate literal) so
# this fit uses the same settings as the sweep that produced the elbow/silhouette diagnostics
km = KMeans(n_clusters=K, random_state=KMEANS_RANDOM_STATE, n_init="auto")
df["cluster"] = km.fit_predict(per_artist_values)

cluster_rows = []
for c in sorted(df["cluster"].unique()):
    # Get the subset of the dataframe for cluster number c and calculate the mean values for the
    # style parameters in that subset
    cluster_subset_df = df[df["cluster"] == c]
    cluster_means_se = cluster_subset_df[["Energy", "Intimacy", "Warmth"]].mean()

    # Determine the top moods by prevalence inside the cluster. Moods that no artist in the
    # cluster has (prevalence 0) are excluded so they cannot be reported as "top" moods
    if mood_columns:
        mood_prev = cluster_subset_df[mood_columns].mean().sort_values(ascending=False)
        top_moods = list(mood_prev[mood_prev > 0].head(TOP_MOOD_COUNT).index)
    else:
        top_moods = []

    # Determine the most common Vocals/Ensemble codes in the cluster (None if all are missing)
    top_vocal_presence = _most_common(cluster_subset_df["VocalPresence"])
    top_ensemble_type = _most_common(cluster_subset_df["EnsembleType"])

    # Turn the mean style values into coarse tags: below 2.0, 2.0 up to 3.5, and 3.5 or above
    energy_tag = bucket(cluster_means_se["Energy"], 2.0, 3.5, ("low energy", "mid energy", "high energy"))
    intimacy_tag = bucket(cluster_means_se["Intimacy"], 2.0, 3.5, ("low intimacy", "mid intimacy", "high intimacy"))
    warmth_tag = bucket(cluster_means_se["Warmth"], 2.0, 3.5, ("cool", "warm", "very warm"))
    moods_tag = ", ".join(top_moods[:2]) if top_moods else ""

    # Three-line label: energy / intimacy, then warmth, then the two leading moods
    cluster_label = (
        f"{energy_tag} / {intimacy_tag}\n"
        f"{warmth_tag}\n"
        f"{moods_tag}"
    )

    # Find some example artists in this cluster: the ones with the highest warmth, then intimacy
    examples = cluster_subset_df.sort_values(["Warmth","Intimacy"], ascending=False)["Artist_Name"].head(EXAMPLE_ARTIST_COUNT).tolist()

    cluster_rows.append({
        "cluster": c,
        "n_artists": len(cluster_subset_df),
        "mean_energy": round(cluster_means_se["Energy"], 2),
        "mean_intimacy": round(cluster_means_se["Intimacy"], 2),
        "mean_warmth": round(cluster_means_se["Warmth"], 2),
        "top_vocal_presence": top_vocal_presence,
        "top_ensemble_type": top_ensemble_type,
        "vibe_label": cluster_label,
        "top_moods": ", ".join(top_moods),
        "example_artists": ", ".join(examples)
    })

# One row per cluster, largest cluster first
cluster_summary_df = pd.DataFrame(cluster_rows).sort_values("n_artists", ascending=False)
cluster_summary_df

In [None]:
import pandas as pd

# Export the cluster summary under the name "style-clusters", passing it as a mapping with a
# single "Style Clusters" entry (export_to_spreadsheet is defined outside this cell)
style_cluster_sheets = {"Style Clusters": cluster_summary_df}
export_to_spreadsheet("style-clusters", style_cluster_sheets)

In [None]:
# Mean energy/intimacy position of each cluster, used to place that cluster's label
centroids = df.groupby("cluster")[["Energy","Intimacy"]].mean()

# Cluster number -> vibe label, taken from the cluster summary
artist_clusters = dict(zip(cluster_summary_df["cluster"], cluster_summary_df["vibe_label"]))

# Scatter every artist by energy (x) and intimacy (y), coloured by assigned cluster
cluster_fig, cluster_ax = plt.subplots(figsize=(12, 5))
cluster_ax.scatter(df["Energy"], df["Intimacy"], c=df["cluster"])

# Write each cluster's label at its mean position
for cluster_id, centre in centroids.iterrows():
    cluster_ax.text(centre["Energy"], centre["Intimacy"], artist_clusters[cluster_id], fontsize=8)

cluster_ax.set_xlabel("Energy (0–5)")
cluster_ax.set_ylabel("Intimacy (0–5)")
cluster_ax.set_title(f"Clustered Style Map (k={K}, Colour=Cluster)")

# Export the chart before showing it (export_chart is defined outside this cell)
export_chart("catalogue-style-with-clustering", "", "png")

plt.show()