# KMeans Clustering

In this notebook, we're gonna cluster our tracks.

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
dataset


Unnamed: 0,genre,track_id,artist_name,title,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,...,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y,id
0,classic pop and rock,TRNJTPB128F427AE9F,Blue Oyster Cult,Screams,-10.659,148.462,1,4,0,189.80526,...,4,-9.009,0,0.0806,0.32600,0.014300,0.2240,0.4930,149.356,2KsnSzoJlmfT44jFOqRP1E
1,classic pop and rock,TRLFJHA128F427AEEA,Blue Oyster Cult,Dance The Night Away,-13.494,112.909,1,10,0,158.19710,...,10,-11.838,0,0.0359,0.90300,0.000000,0.1810,0.1930,80.510,4qV4ErzHMz4XYGWxMkOFBw
2,classic pop and rock,TRCQZAG128F427DB97,Blue Oyster Cult,Debbie Denise,-12.786,117.429,4,7,1,250.22649,...,7,-7.034,1,0.0303,0.36100,0.002490,0.1170,0.4670,117.720,2o2jnahWDsvyo8v0vV3dif
3,classic pop and rock,TRSIZRN128F427DB95,Blue Oyster Cult,Morning Final,-11.952,100.901,1,0,1,254.17098,...,9,-6.756,0,0.0659,0.05170,0.033700,0.1520,0.5050,100.563,4YsdlcE3GqCHpZD4jNDJgD
4,classic pop and rock,TRDYTEO128F427DB90,Blue Oyster Cult,The Revenge Of Vera Gemini,-11.839,132.361,4,2,1,230.32118,...,9,-7.110,1,0.0659,0.00182,0.000002,0.0814,0.6570,132.659,4aumrXau5uVKj8xXcQELP3
5,classic pop and rock,TRKSICM128F427DB8B,Blue Oyster Cult,True Confessions,-13.760,121.001,4,10,1,177.55383,...,10,-7.493,1,0.0433,0.10200,0.000009,0.0539,0.8140,121.065,6sH5tkQoXL3YKV0hpXZoWj
6,classic pop and rock,TRJPXIV128F426697A,Blue Oyster Cult,Redeemed,-10.799,185.836,4,2,1,231.39220,...,11,-9.794,0,0.0374,0.08460,0.001370,0.3210,0.5170,186.627,1P37l3UBUo49cqATq6Qccz
7,classic pop and rock,TRXWSIN128F4265A40,Blue Oyster Cult,Workshop Of The Telescopes,-11.413,120.171,4,9,1,241.16200,...,2,-8.954,1,0.1140,0.25700,0.011700,0.1110,0.3540,120.413,55f36XW2w1moYF70wsaHLB
8,classic pop and rock,TRNVQPE128F426BBD2,Blue Oyster Cult,Godzilla,-12.083,88.548,4,6,0,469.89016,...,4,-7.719,1,0.0554,0.16400,0.000008,0.6050,0.6960,184.024,6N0AnkdDFZUetw8KAGHV7e
9,classic pop and rock,TRUUZXH128F426C1AD,Blue Oyster Cult,E.T.I. (Extra Terrestrial Intelligence),-7.264,97.298,4,9,1,226.03710,...,9,-5.875,1,0.0486,0.24500,0.002490,0.4350,0.6860,97.335,5KBdHzTROSlD3dACh91sZx


In [4]:
string_features = ["track_id", "id", "artist_name", "title"]

In [5]:
datacopy = dataset.copy()
datacopy.drop(string_features, 1, inplace = True)
genres = dataset.genre.unique()
for index, genre in zip(range(0, len(genres)),genres):
    datacopy.loc[datacopy['genre'] == genre, 'genre'] = index 
for column in datacopy.columns:
    max_of_column = datacopy[column].max()
    datacopy[column] = datacopy[column].apply(lambda x: x / max_of_column)

datacopy    

Unnamed: 0,genre,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,avg_timbre1,avg_timbre2,avg_timbre3,...,energy,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y
0,0.0,-3.720419,0.569809,0.142857,0.363636,0.0,0.138218,0.752362,-0.578831,0.260673,...,0.786787,0.363636,-2.289454,0.0,0.084842,0.327309,0.014547,0.224449,0.496475,0.611887
1,0.0,-4.709948,0.433354,0.142857,0.909091,0.0,0.115201,0.647695,-0.405620,0.182626,...,0.145145,0.909091,-3.008386,0.0,0.037789,0.906627,0.000000,0.181363,0.194361,0.329836
2,0.0,-4.462827,0.450702,0.571429,0.636364,1.0,0.182217,0.733766,0.114112,0.339619,...,0.657658,0.636364,-1.787548,1.0,0.031895,0.362450,0.002533,0.117234,0.470292,0.482279
3,0.0,-4.171728,0.387266,0.142857,0.000000,1.0,0.185090,0.754109,0.210597,0.422476,...,0.899900,0.818182,-1.716900,0.0,0.069368,0.051908,0.034283,0.152305,0.508560,0.411990
4,0.0,-4.132286,0.508012,0.571429,0.181818,1.0,0.167722,0.745090,0.148492,0.162593,...,0.854855,0.818182,-1.806861,1.0,0.069368,0.001827,0.000002,0.081563,0.661631,0.543482
5,0.0,-4.802792,0.464411,0.571429,0.909091,1.0,0.129296,0.700382,-0.092106,0.313078,...,0.889890,0.909091,-1.904193,1.0,0.045579,0.102410,0.000009,0.054008,0.819738,0.495983
6,0.0,-3.769284,0.713253,0.571429,0.181818,1.0,0.168502,0.728208,-0.236663,0.251993,...,0.659660,1.000000,-2.488945,0.0,0.039368,0.084940,0.001394,0.321643,0.520645,0.764580
7,0.0,-3.983595,0.461226,0.571429,0.818182,1.0,0.175617,0.703682,-0.422029,0.228799,...,0.754755,0.181818,-2.275476,1.0,0.120000,0.258032,0.011902,0.111222,0.356495,0.493312
8,0.0,-4.217452,0.339854,0.571429,0.545455,0.0,0.342179,0.703932,0.139947,0.348543,...,0.869870,0.363636,-1.961626,1.0,0.058316,0.164659,0.000008,0.606212,0.700906,0.753916
9,0.0,-2.535428,0.373437,0.571429,0.818182,1.0,0.164602,0.841962,-0.010919,0.292341,...,0.878879,0.818182,-1.493011,1.0,0.051158,0.245984,0.002533,0.435872,0.690836,0.398765


In [6]:
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
    #dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()

In [10]:
X = create_input_for_clustering(datacopy.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("inertia:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

inertia: 11994.1789906
Silhouette Coefficient: 0.131


In [8]:
labels

array([ 19, 307,  53, ...,  35, 121,  43], dtype=int32)

array([[ 0.        , -3.72041885,  0.5698089 , ...,  0.2244489 ,
         0.49647533,  0.61188655],
       [ 0.        , -4.70994764,  0.43335368, ...,  0.18136273,
         0.19436052,  0.329836  ],
       [ 0.        , -4.46282723,  0.45070179, ...,  0.11723447,
         0.47029204,  0.48227915],
       ..., 
       [ 9.        , -3.5469459 ,  0.34140098, ...,  0.11723447,
         0.48439074,  0.36458124],
       [ 9.        , -3.31378709,  0.34149693, ...,  0.77354709,
         0.49848943,  0.36456895],
       [ 9.        , -4.1095986 ,  0.32691223, ...,  0.1242485 ,
         0.32628399,  0.39730265]])

In [None]:
decide_weight()

In [22]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
# print(random_track_indices)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
# print(test_input.shape)
cluster_numbers = kmeans.predict(test_input)
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
#     print(index)
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
#     print(indices)
    print(X[indices[0]].shape)
    print(test_input[i].shape)
    d = ([np.linalg.norm(X[ind]-test_input[i]) for ind in indices])
    #print(dataset.loc[index]["title"] + " by " + dataset.loc[index]["artist_name"])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])

    """
    print("Playlist for " + dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    d = kmeans.transform(X)[:, cluster_number]
    
    print(cluster_number)
    #This gives an array of len(X) distances. The indices of the 50 closest to centroid j are
    ind = np.argsort(d)[::-1][:10]
    print(ind)
    #so the 50 points closest to the centroids are
    print(dataset.iloc[ind]["title"] + " by " + dataset.iloc[ind]["artist_name"])"""

(52,)
(52,)
Love Will Conquer All by Lionel Richie
11238                 Big Spliff by Black Uhuru
11503                    Reaction by Bob Marley
5894         Trening za umiranje by Hladno Pivo
10834     Confrontations by Organized Konfusion
11345                  Reflection by Belleruche
11549                     Caution by Bob Marley
5888     Pjevajte nesto ljubavno by Hladno Pivo
11543                   400 Years by Bob Marley
11336       When It Hurts So Bad by Lauryn Hill
11351         Like 4 The Hard Way by Belleruche
dtype: object
(52,)
(52,)
Elusive Butterfly by Glen Campbell
4041                              Magazine by Jesus Jones
2370                        Couch Surfer by Bran Van 3000
90                             Breaxxbaxx by Land Of Talk
3567                                     Feel by Big Star
5208    Diggin The New by Joe Strummer And The Mescaleros
3713                                     Words by Boyzone
2540                                 Window Bird by Stars
23

In [14]:
centroids

array([[   2.28275862,  -12.89373793,  118.38911034, ...,    0.20792138,
           0.40304069,  117.91501379],
       [   1.42214533,   -8.40457785,  127.83630104, ...,    0.21656294,
           0.73310381,  122.05736678],
       [   3.52631579,  -14.01742105,  111.3555    , ...,    0.25273158,
           0.47378158,  113.81889474],
       ..., 
       [   1.74789916,   -9.51544538,  124.65721849, ...,    0.2061021 ,
           0.70345294,  117.26466387],
       [   2.06511628,  -11.34283256,  118.55092093, ...,    0.17593442,
           0.44837814,  118.13103721],
       [   1.07002801,   -8.96269188,  127.93059944, ...,    0.20089076,
           0.59683669,  123.04839776]])

In [44]:
X = create_input_for_clustering(dataset.copy())
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))



Estimated number of clusters: 319
Silhouette Coefficient: 0.158


In [98]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
print(random_track_indices)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
print(test_input.shape)
cluster_numbers = af.predict(test_input)
for cluster_number, random_track_ind in zip(cluster_numbers, random_track_indices):
    index = labels == cluster_number
    print(index)
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    print(indices)
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])

[21, 9072, 4033, 9118, 8592, 4728, 9952, 38, 7896, 6478]
(10, 52)
138
[False False False ..., False False False]
[6, 21, 155, 343, 354, 412, 630, 658, 757, 957, 977, 1005, 1025, 1239, 1351, 1517, 1529, 1637, 1680, 1693, 1810, 1936, 2448, 2605, 2636, 2651, 2725, 2946, 2996, 3131, 3208, 3366, 3492, 3572, 3611, 3614, 3685, 3827, 3855, 3856, 4039, 4117, 4125, 4347, 4440, 4592, 4667, 4818, 4901, 4961, 4963, 5059, 5178, 5389, 5404, 5515, 5569, 5617, 5638, 5666, 5679, 5725]
6                            Redeemed by Blue Oyster Cult
21                      Sole Survivor by Blue Oyster Cult
155                                Changes by Imagination
343                    Lost In Space by Lighthouse Family
354                         Restless by Lighthouse Family
412                              Slow Down by Wallis Bird
630                           I Love You by Mick Hucknall
658                                Strayed by Slapp Happy
757                   River boat song (Live) by J.J. Cale
957   

In [33]:
dataset.iloc[cluster_centers_indices]
X[cluster_centers_indices]

Index(['genre', 'track_id', 'artist_name', 'title', 'loudness_x', 'tempo_x',
       'time_signature', 'key_x', 'mode_x', 'duration', 'avg_timbre1',
       'avg_timbre2', 'avg_timbre3', 'avg_timbre4', 'avg_timbre5',
       'avg_timbre6', 'avg_timbre7', 'avg_timbre8', 'avg_timbre9',
       'avg_timbre10', 'avg_timbre11', 'avg_timbre12', 'var_timbre1',
       'var_timbre2', 'var_timbre3', 'var_timbre4', 'var_timbre5',
       'var_timbre6', 'var_timbre7', 'var_timbre8', 'var_timbre9',
       'var_timbre10', 'var_timbre11', 'var_timbre12', '0', '1', '2', '3', '4',
       '5', '6', '7', '8', '9', 'danceability', 'energy', 'key_y',
       'loudness_y', 'mode_y', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo_y', 'id'],
      dtype='object')