# Label Clusters

Based on the centroid of each label (the mean feature vector of all training samples that carry the label), we try to find clusters of labels.

In [None]:
from mediaeval2021.dataloaders import acousticbrainz
import pandas as pd

# Directory holding the pickled train / test / validation splits.
dataset_path = '../data/mediaeval2019/'

dataloader = acousticbrainz.AcousticBrainzLoader(
    training_path=f'{dataset_path}accousticbrainz-train.pickle',
    test_path=f'{dataset_path}accousticbrainz-test.pickle',
    validation_path=f'{dataset_path}accousticbrainz-validation.pickle',
)

train_data = dataloader.load_train()

# Element 0 of the loaded split becomes the feature frame; the first two
# loader columns are skipped when naming its columns.
features = dataloader.columns[2:]
train_features = pd.DataFrame(train_data[0], columns=features)

# Element 1 becomes the label frame; every class name is used without its
# first 13 characters (presumably a fixed prefix -- confirm against the loader).
labels = [class_name[13:] for class_name in dataloader.mlb.classes_]
train_labels = pd.DataFrame(train_data[1], columns=labels)

# Put features and labels side by side, aligned on the row index.
train_data = pd.merge(train_features, train_labels, left_index=True, right_index=True)

# Centroid of a label = mean feature vector over all rows where the label is 1.
centeroids = {}
for label in labels:
    has_label = train_data[label] == 1
    centeroids[label] = train_data[has_label][features].mean()

# One row per label, one column per feature.
representative_samples = pd.DataFrame(centeroids).transpose()

In [None]:
from sklearn import cluster
import matplotlib.pyplot as plt

# Elbow analysis: fit k-means once per candidate cluster count and plot the
# score of each fit, to help choose a sensible number of clusters.
Nc = range(1, 20)

# Fixed seed so the curve is reproducible from run to run and consistent with
# the seeded k-means used for the final clustering.
kmeans = [cluster.KMeans(n_clusters=i, random_state=0) for i in Nc]

# KMeans.score is the negated k-means objective on the given data, so values
# are <= 0 and rise towards 0 as the number of clusters grows.
score = [model.fit(representative_samples).score(representative_samples) for model in kmeans]

plt.plot(Nc, score)
plt.xticks(Nc)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.grid(True)
plt.show()

In [None]:
# Result table: one row per label; each clustering algorithm adds a column
# holding the cluster id it assigned to that label's centroid.
clustering = pd.DataFrame(representative_samples.index, columns=['label'])


# Number of clusters requested from every algorithm
# (presumably read off the elbow curve -- confirm).
n_clusters = 4

# Both k-means and spectral clustering involve random initialisation, so both
# are seeded to make the table reproducible. Agglomerative clustering takes
# no seed.
clustering['k-means_cluster'] = cluster.KMeans(n_clusters=n_clusters, random_state=0).fit(representative_samples).labels_
clustering['spectral_cluster'] = cluster.SpectralClustering(n_clusters=n_clusters, random_state=0).fit(representative_samples).labels_
clustering['agglomerative_cluster'] = cluster.AgglomerativeClustering(n_clusters=n_clusters).fit(representative_samples).labels_

display(clustering)

In [None]:
from sklearn.decomposition import PCA

# Project the label centroids onto their first two principal components so
# the cluster assignments can be inspected in 2-D.
pc = PCA(n_components=2).fit_transform(representative_samples)
clustering['pca_0'] = pc[:, 0]
clustering['pca_1'] = pc[:, 1]

# One scatter plot per clustering algorithm, coloured by that algorithm's
# cluster ids. Previously the k-means plot was drawn twice and the spectral
# and agglomerative results were never shown.
for cluster_column in ('k-means_cluster', 'spectral_cluster', 'agglomerative_cluster'):
    clustering.plot.scatter(
        x='pca_0',
        y='pca_1',
        c=cluster_column,
        colormap='rainbow',
        title=cluster_column,
    )
    plt.show()