# KMeans Clustering

In this notebook, we're gonna cluster our tracks.

In [26]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

In [12]:
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
dataset

Unnamed: 0,genre,track_id,artist_name,title,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,...,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y,id
0,classic pop and rock,TRNJTPB128F427AE9F,Blue Oyster Cult,Screams,-10.659,148.462,1,4,0,189.80526,...,4,-9.009,0,0.0806,0.32600,0.014300,0.2240,0.4930,149.356,2KsnSzoJlmfT44jFOqRP1E
1,classic pop and rock,TRLFJHA128F427AEEA,Blue Oyster Cult,Dance The Night Away,-13.494,112.909,1,10,0,158.19710,...,10,-11.838,0,0.0359,0.90300,0.000000,0.1810,0.1930,80.510,4qV4ErzHMz4XYGWxMkOFBw
2,classic pop and rock,TRCQZAG128F427DB97,Blue Oyster Cult,Debbie Denise,-12.786,117.429,4,7,1,250.22649,...,7,-7.034,1,0.0303,0.36100,0.002490,0.1170,0.4670,117.720,2o2jnahWDsvyo8v0vV3dif
3,classic pop and rock,TRSIZRN128F427DB95,Blue Oyster Cult,Morning Final,-11.952,100.901,1,0,1,254.17098,...,9,-6.756,0,0.0659,0.05170,0.033700,0.1520,0.5050,100.563,4YsdlcE3GqCHpZD4jNDJgD
4,classic pop and rock,TRDYTEO128F427DB90,Blue Oyster Cult,The Revenge Of Vera Gemini,-11.839,132.361,4,2,1,230.32118,...,9,-7.110,1,0.0659,0.00182,0.000002,0.0814,0.6570,132.659,4aumrXau5uVKj8xXcQELP3
5,classic pop and rock,TRKSICM128F427DB8B,Blue Oyster Cult,True Confessions,-13.760,121.001,4,10,1,177.55383,...,10,-7.493,1,0.0433,0.10200,0.000009,0.0539,0.8140,121.065,6sH5tkQoXL3YKV0hpXZoWj
6,classic pop and rock,TRJPXIV128F426697A,Blue Oyster Cult,Redeemed,-10.799,185.836,4,2,1,231.39220,...,11,-9.794,0,0.0374,0.08460,0.001370,0.3210,0.5170,186.627,1P37l3UBUo49cqATq6Qccz
7,classic pop and rock,TRXWSIN128F4265A40,Blue Oyster Cult,Workshop Of The Telescopes,-11.413,120.171,4,9,1,241.16200,...,2,-8.954,1,0.1140,0.25700,0.011700,0.1110,0.3540,120.413,55f36XW2w1moYF70wsaHLB
8,classic pop and rock,TRNVQPE128F426BBD2,Blue Oyster Cult,Godzilla,-12.083,88.548,4,6,0,469.89016,...,4,-7.719,1,0.0554,0.16400,0.000008,0.6050,0.6960,184.024,6N0AnkdDFZUetw8KAGHV7e
9,classic pop and rock,TRUUZXH128F426C1AD,Blue Oyster Cult,E.T.I. (Extra Terrestrial Intelligence),-7.264,97.298,4,9,1,226.03710,...,9,-5.875,1,0.0486,0.24500,0.002490,0.4350,0.6860,97.335,5KBdHzTROSlD3dACh91sZx


In [6]:
string_features = ["track_id", "id", "artist_name", "title"]

In [9]:
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
    dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()

In [42]:
X = create_input_for_clustering(dataset.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("cost:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

cost: 3027906020.94
Silhouette Coefficient: 0.180


In [43]:
d = kmeans.transform(X)[:, 80]
#This gives an array of len(X) distances. The indices of the 50 closest to centroid j are

ind = np.argsort(d)[::-1][:50]
#so the 50 points closest to the centroids are

dataset.iloc[ind]

Unnamed: 0,genre,track_id,artist_name,title,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,...,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y,id
9572,dance and electronica,TRRZKVG128F4240CDC,Boys Noize,eBong,-10.024,104.926,3,0,1,167.99302,...,7,-9.943,1,0.318,0.00184,4.9e-05,0.146,0.403,127.005,1c7RNRHb8CdtH6ypoogpyQ
669,classic pop and rock,TRRQSYS12903CA7026,Slapp Happy,Let's Travel Light,-16.624,85.238,1,9,1,294.84363,...,9,-16.366,1,0.209,0.901,0.00575,0.627,0.347,82.584,255wUmfIIxZykq0eolkBHr
3219,classic pop and rock,TRCULIS128F4288E5D,The Monochrome Set,Mr Bizarro,-8.964,157.643,1,1,1,211.87873,...,1,-9.142,1,0.177,0.0734,0.896,0.087,0.114,157.167,18h0Ev4jmYL0ZosEeUyQaX
1229,classic pop and rock,TRYHPFZ128F932ED7A,Lisa Loeb,H.A.P.P.Y,-13.925,0.0,1,11,0,5.85098,...,1,-12.189,1,0.0,0.954,0.0,0.0,0.0,0.0,5Pq2P7l8lRhoJbnMWySA29
9758,metal,TREQUAJ128F424C3B7,MNEMIC,Deathbox,-6.452,229.849,5,4,0,271.82975,...,4,-6.437,0,0.158,0.000163,0.0226,0.121,0.0868,114.846,03BnVQPFtABwFuZXmxxHLd
3671,classic pop and rock,TRZMWPC128F9331FD7,Horace Andy,Sunshine Dub,-17.213,130.041,5,1,1,135.31383,...,1,-13.402,1,0.0554,0.0331,0.345,0.358,0.683,130.032,3laSEXKVoTLP8WqwjK8h0K
9608,dance and electronica,TRLOPDO12903CFF6A2,Sub Focus,Last Jungle,-7.091,96.699,3,5,0,219.19302,...,5,-7.353,0,0.0374,0.00152,0.899,0.132,0.0531,144.975,6CqpBDG4dpDyk8JFiZr2ZU
3077,classic pop and rock,TRFDOFZ12903CC9F2A,Jens Lekman,Sipping On The Sweet Nectar (Ultracity Vocal Dub),-8.326,122.494,3,10,0,442.8273,...,1,-8.242,1,0.0382,0.0671,0.225,0.653,0.595,122.539,65MacfYFFN1Ppy4XYyPQiU
676,classic pop and rock,TROHGDM12903CA701F,Slapp Happy,I'm All Alone,-12.961,127.451,4,7,1,229.48526,...,7,-13.136,1,0.0319,0.893,0.0311,0.677,0.189,127.474,5Avdm3PjiZ49cXyzZVcwGd
9357,dance and electronica,TRZUPWR128F92C1E06,Calvin Harris,Rock n Roll Attitude,-6.597,120.014,4,6,1,197.95546,...,10,-6.833,0,0.0555,0.311,0.257,0.695,0.736,119.97,6GrwzUi1BYTawlEamAq9rN


In [14]:
centroids

array([[   2.28275862,  -12.89373793,  118.38911034, ...,    0.20792138,
           0.40304069,  117.91501379],
       [   1.42214533,   -8.40457785,  127.83630104, ...,    0.21656294,
           0.73310381,  122.05736678],
       [   3.52631579,  -14.01742105,  111.3555    , ...,    0.25273158,
           0.47378158,  113.81889474],
       ..., 
       [   1.74789916,   -9.51544538,  124.65721849, ...,    0.2061021 ,
           0.70345294,  117.26466387],
       [   2.06511628,  -11.34283256,  118.55092093, ...,    0.17593442,
           0.44837814,  118.13103721],
       [   1.07002801,   -8.96269188,  127.93059944, ...,    0.20089076,
           0.59683669,  123.04839776]])

In [44]:
X = create_input_for_clustering(dataset.copy())
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))



Estimated number of clusters: 319
Silhouette Coefficient: 0.158


In [33]:
dataset.iloc[cluster_centers_indices]
X[cluster_centers_indices]

Index(['genre', 'track_id', 'artist_name', 'title', 'loudness_x', 'tempo_x',
       'time_signature', 'key_x', 'mode_x', 'duration', 'avg_timbre1',
       'avg_timbre2', 'avg_timbre3', 'avg_timbre4', 'avg_timbre5',
       'avg_timbre6', 'avg_timbre7', 'avg_timbre8', 'avg_timbre9',
       'avg_timbre10', 'avg_timbre11', 'avg_timbre12', 'var_timbre1',
       'var_timbre2', 'var_timbre3', 'var_timbre4', 'var_timbre5',
       'var_timbre6', 'var_timbre7', 'var_timbre8', 'var_timbre9',
       'var_timbre10', 'var_timbre11', 'var_timbre12', '0', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', 'danceability', 'energy', 'key_y',
       'loudness_y', 'mode_y', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo_y', 'id'],
      dtype='object')