# KMeans Clustering

In this notebook, we're gonna cluster our tracks.

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
dataset


Unnamed: 0,genre,track_id,artist_name,title,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,...,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y,id
0,classic pop and rock,TRNJTPB128F427AE9F,Blue Oyster Cult,Screams,-10.659,148.462,1,4,0,189.80526,...,4,-9.009,0,0.0806,0.32600,0.014300,0.2240,0.4930,149.356,2KsnSzoJlmfT44jFOqRP1E
1,classic pop and rock,TRLFJHA128F427AEEA,Blue Oyster Cult,Dance The Night Away,-13.494,112.909,1,10,0,158.19710,...,10,-11.838,0,0.0359,0.90300,0.000000,0.1810,0.1930,80.510,4qV4ErzHMz4XYGWxMkOFBw
2,classic pop and rock,TRCQZAG128F427DB97,Blue Oyster Cult,Debbie Denise,-12.786,117.429,4,7,1,250.22649,...,7,-7.034,1,0.0303,0.36100,0.002490,0.1170,0.4670,117.720,2o2jnahWDsvyo8v0vV3dif
3,classic pop and rock,TRSIZRN128F427DB95,Blue Oyster Cult,Morning Final,-11.952,100.901,1,0,1,254.17098,...,9,-6.756,0,0.0659,0.05170,0.033700,0.1520,0.5050,100.563,4YsdlcE3GqCHpZD4jNDJgD
4,classic pop and rock,TRDYTEO128F427DB90,Blue Oyster Cult,The Revenge Of Vera Gemini,-11.839,132.361,4,2,1,230.32118,...,9,-7.110,1,0.0659,0.00182,0.000002,0.0814,0.6570,132.659,4aumrXau5uVKj8xXcQELP3
5,classic pop and rock,TRKSICM128F427DB8B,Blue Oyster Cult,True Confessions,-13.760,121.001,4,10,1,177.55383,...,10,-7.493,1,0.0433,0.10200,0.000009,0.0539,0.8140,121.065,6sH5tkQoXL3YKV0hpXZoWj
6,classic pop and rock,TRJPXIV128F426697A,Blue Oyster Cult,Redeemed,-10.799,185.836,4,2,1,231.39220,...,11,-9.794,0,0.0374,0.08460,0.001370,0.3210,0.5170,186.627,1P37l3UBUo49cqATq6Qccz
7,classic pop and rock,TRXWSIN128F4265A40,Blue Oyster Cult,Workshop Of The Telescopes,-11.413,120.171,4,9,1,241.16200,...,2,-8.954,1,0.1140,0.25700,0.011700,0.1110,0.3540,120.413,55f36XW2w1moYF70wsaHLB
8,classic pop and rock,TRNVQPE128F426BBD2,Blue Oyster Cult,Godzilla,-12.083,88.548,4,6,0,469.89016,...,4,-7.719,1,0.0554,0.16400,0.000008,0.6050,0.6960,184.024,6N0AnkdDFZUetw8KAGHV7e
9,classic pop and rock,TRUUZXH128F426C1AD,Blue Oyster Cult,E.T.I. (Extra Terrestrial Intelligence),-7.264,97.298,4,9,1,226.03710,...,9,-5.875,1,0.0486,0.24500,0.002490,0.4350,0.6860,97.335,5KBdHzTROSlD3dACh91sZx


In [4]:
string_features = ["track_id", "id", "artist_name", "title"]

In [5]:
datacopy = dataset.copy()
datacopy.drop(string_features, 1, inplace = True)
genres = dataset.genre.unique()
for index, genre in zip(range(0, len(genres)),genres):
    datacopy.loc[datacopy['genre'] == genre, 'genre'] = index 
for column in datacopy.columns:
    max_of_column = datacopy[column].max()
    datacopy[column] = datacopy[column].apply(lambda x: x / max_of_column)

datacopy    

Unnamed: 0,genre,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,avg_timbre1,avg_timbre2,avg_timbre3,...,energy,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y
0,0.0,-3.720419,0.569809,0.142857,0.363636,0.0,0.138218,0.752362,-0.578831,0.260673,...,0.786787,0.363636,-2.289454,0.0,0.084842,0.327309,0.014547,0.224449,0.496475,0.611887
1,0.0,-4.709948,0.433354,0.142857,0.909091,0.0,0.115201,0.647695,-0.405620,0.182626,...,0.145145,0.909091,-3.008386,0.0,0.037789,0.906627,0.000000,0.181363,0.194361,0.329836
2,0.0,-4.462827,0.450702,0.571429,0.636364,1.0,0.182217,0.733766,0.114112,0.339619,...,0.657658,0.636364,-1.787548,1.0,0.031895,0.362450,0.002533,0.117234,0.470292,0.482279
3,0.0,-4.171728,0.387266,0.142857,0.000000,1.0,0.185090,0.754109,0.210597,0.422476,...,0.899900,0.818182,-1.716900,0.0,0.069368,0.051908,0.034283,0.152305,0.508560,0.411990
4,0.0,-4.132286,0.508012,0.571429,0.181818,1.0,0.167722,0.745090,0.148492,0.162593,...,0.854855,0.818182,-1.806861,1.0,0.069368,0.001827,0.000002,0.081563,0.661631,0.543482
5,0.0,-4.802792,0.464411,0.571429,0.909091,1.0,0.129296,0.700382,-0.092106,0.313078,...,0.889890,0.909091,-1.904193,1.0,0.045579,0.102410,0.000009,0.054008,0.819738,0.495983
6,0.0,-3.769284,0.713253,0.571429,0.181818,1.0,0.168502,0.728208,-0.236663,0.251993,...,0.659660,1.000000,-2.488945,0.0,0.039368,0.084940,0.001394,0.321643,0.520645,0.764580
7,0.0,-3.983595,0.461226,0.571429,0.818182,1.0,0.175617,0.703682,-0.422029,0.228799,...,0.754755,0.181818,-2.275476,1.0,0.120000,0.258032,0.011902,0.111222,0.356495,0.493312
8,0.0,-4.217452,0.339854,0.571429,0.545455,0.0,0.342179,0.703932,0.139947,0.348543,...,0.869870,0.363636,-1.961626,1.0,0.058316,0.164659,0.000008,0.606212,0.700906,0.753916
9,0.0,-2.535428,0.373437,0.571429,0.818182,1.0,0.164602,0.841962,-0.010919,0.292341,...,0.878879,0.818182,-1.493011,1.0,0.051158,0.245984,0.002533,0.435872,0.690836,0.398765


In [6]:
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
    #dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()

In [7]:
X = create_input_for_clustering(datacopy.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("inertia:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

inertia: 12027.7726043
Silhouette Coefficient: 0.127


In [8]:
labels

array([242, 259, 145, ..., 114, 184, 221], dtype=int32)

In [9]:
def weighted_norm(arr1, arr2, ind=-1, weight=10):
    if ind != -1:
        arr1[ind] *= weight
        arr2[ind] *= weight
    return np.linalg.norm(arr1-arr2)

In [99]:
all_distances = [[weighted_norm(X[ind1], X[ind2]) for ind1 in range(0,X.shape[0])] for ind2 in range(0,X.shape[0])]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [101]:
all_distances[0]

[0.0,
 198.86990638193726,
 178525.81958040578,
 57.281323933849606,
 210.88565130325978,
 190190.36850149589,
 20.001504441390086,
 206.19832827616645,
 209.33645274735969,
 14701.169173965909,
 139745641.44853878,
 2901422.8970781243,
 1573.8519946224815,
 226.40537323476011,
 1958.3490456809336,
 1541.0685462336603,
 64.410255104816571,
 10.368591249822673,
 1573.8515004522621,
 21771.054712844601,
 2422388.5895813387,
 25051.86791116578,
 207.81233373619244,
 1960.8901079026466,
 210.60368398387817,
 211.4204086525539,
 200.43479357872971,
 2073722.7700135396,
 26.93516520519654,
 19567.751065458571,
 55.585803421518023,
 55.669117283039483,
 211.92770650956598,
 58.624460242195262,
 2032.046410553462,
 23214.510906888598,
 1724294.5616273617,
 16370.80065033776,
 11.80880919952406,
 6.0753788206397568,
 19252.630377476191,
 2372579.1867859568,
 210.95124738999309,
 21.254040475657813,
 202.5210841179491,
 20149.70779929743,
 202.42621871809899,
 253824.42186665937,
 216518.1956912

In [126]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
# print(random_track_indices)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
# print(test_input.shape)
cluster_numbers = kmeans.predict(test_input)
column_dict = {k: v for v, k in enumerate(list(dataset))}
seed_rec_distances = []
rec_to_rec_distances = []
seed_to_all_distances = []
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(X[ind], test_input[i]) for ind in indices])
    #print(dataset.loc[index]["title"] + " by " + dataset.loc[index]["artist_name"])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])
    seed_rec_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in indices]))
    rec_to_rec_distances.append(np.mean([np.mean([all_distances[ind1][ind2] for ind1 in indices if ind1 != ind2]) for ind2 in indices]))
    seed_to_all_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in range(0,X.shape[0])]))
    """
    print("Playlist for " + dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    d = kmeans.transform(X)[:, cluster_number]
    
    print(cluster_number)
    #This gives an array of len(X) distances. The indices of the 50 closest to centroid j are
    ind = np.argsort(d)[::-1][:10]
    print(ind)
    #so the 50 points closest to the centroids are
    print(dataset.iloc[ind]["title"] + " by " + dataset.iloc[ind]["artist_name"])"""
print ('mean distance between seed and recommended songs: ' + str(np.mean(seed_rec_distances)))
print ('mean distance between all recommended songs: ' + str(np.mean(rec_to_rec_distances)))
print ('mean distance between seed and all songs: ' + str(np.mean(seed_to_all_distances)))
print ('normalized mean distance between seed and recommended songs: ' + str(np.mean(seed_rec_distances)/np.mean(seed_to_all_distances)))

The Chokin' Kind by Joss Stone
11047               You Give Good Love by Whitney Houston
11230                        Puff She Puff by Black Uhuru
11624                          Solid by Ashford & Simpson
11263                        Love And Affection by Sizzla
11508                       Concrete jungle by Bob Marley
10992                              Angel by Lionel Richie
11506      Three Little Birds by Bob Marley & The Wailers
11499    Soul Shakedown Party by Bob Marley & The Wailers
11194                  Politics Time Again by Buju Banton
10910                               Vibin' by Boyz II Men
dtype: object
Southern Point by Grizzly Bear
1148           Aunt Eggma Blowtorch by Neutral Milk Hotel
1812    I Wish That I Could See You Soon (Lisa Li Lund...
1531              Song Instead of a Kiss by Alannah Myles
1307    Butcher's Tale (Western Front 1914) by The Zom...
194          To Give [The Reason I Live] by Frankie Valli
4486                         All By Myself by Eric Car

In [89]:
centroids

array([[  4.44089210e-16,  -3.22478185e+00,   5.05309556e-01, ...,
          1.22589624e-01,   7.51519899e-01,   5.15260778e-01],
       [  8.97826087e+00,  -2.89239700e+00,   5.08233347e-01, ...,
          1.63341901e-01,   5.88116818e-01,   4.75934492e-01],
       [  2.00000000e+00,  -3.98402414e+00,   4.59413113e-01, ...,
          1.42320057e-01,   4.07909533e-01,   5.21287399e-01],
       ..., 
       [  9.00000000e+00,  -1.97697278e+00,   5.60042547e-01, ...,
          2.82543465e-01,   5.47127188e-01,   5.00041467e-01],
       [  4.44089210e-16,  -2.05514003e+00,   4.63069454e-01, ...,
          2.44283806e-01,   6.90979715e-01,   4.95634532e-01],
       [  4.44444444e+00,  -4.94198177e+00,   4.87213013e-01, ...,
          1.93943442e-01,   5.55779344e-01,   4.97844383e-01]])

In [127]:
X = create_input_for_clustering(datacopy.copy())
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))



Estimated number of clusters: 209
Silhouette Coefficient: 0.110


In [128]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
print(random_track_indices)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
print(test_input.shape)
cluster_numbers = af.predict(test_input)
for cluster_number, random_track_ind in zip(cluster_numbers, random_track_indices):
    index = labels == cluster_number
    print(index)
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    print(indices)
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])

[10689, 11648, 4434, 8130, 7371, 2759, 3356, 3446, 4737, 3366]
(10, 52)
[False False False ..., False False False]
[9873, 9952, 10553, 10568, 10581, 10589, 10592, 10601, 10628, 10665, 10675, 10676, 10677, 10686, 10689, 10694, 10703, 10705, 10726, 10730, 10732, 10736, 10737, 10741, 10742, 10754]
9873                             Psychorama by Dog Eat Dog
9952                 Closing The Circle by Beyond Twilight
10553      Sorry (Digitally Remastered 02) by Natalie Cole
10568    There Ain't Nothing Stronger Than Love by Nata...
10581                            Your Eyes by Natalie Cole
10589                      Like A Hurricane by Jeff Healey
10592           Sittin' On Top Of The World by Jeff Healey
10601    Nervous Breakdown (Pogo And Swift Mix) by Carl...
10628               Secret Love by Richard "Groove" Holmes
10665       Take Me Out To The Ball Game by Curtis Stigers
10675                             Viaticum by Ulf Wakenius
10676            The Girl For Me Tonight by Peter Cinco

[6692, 7131, 7145, 7222, 7237, 7247, 7291, 7373, 7708, 7857, 8056, 8087, 8091, 8093, 8100, 8120, 8130, 8674, 8677, 8727, 8782]
6692         Thanksgiving (Live) by Loudon Wainwright III
7131                     Uirchill An Chreagain by Clannad
7145                            Tá 'Mé Mo Shuí by Clannad
7222                    Mining For Gold by Cowboy Junkies
7237       I Am A Man Of Constant Sorrow by John Hartford
7247              Moment of Forever by Kris Kristofferson
7291       Epitaph (Black And Blue) by Kris Kristofferson
7373                   Woozy With Cider by James Yorkston
7708               All The Morning Birds by Jolie Holland
7857                               Simply by Sara Hickman
8056                    God Bless The Child by Mary Black
8087                           Small Hours by John Martyn
8091                          Go Down Easy by John Martyn
8093                       Lay It All Down by John Martyn
8100                     Before Sleep Comes by Luka Bloom
812

206                       Turn Of The Tide by Carly Simon
242               Halfway 'Round The World by Carly Simon
350                Sweetest Operator by Lighthouse Family
355             Postcard From Heaven by Lighthouse Family
357                Let It All Change by Lighthouse Family
358               When I Was Younger by Lighthouse Family
361     (I Wish I Knew How It Would Feel To Be) Free/O...
434                             Mandocello by Cheap Trick
468                 World's Greatest Lover by Cheap Trick
543                    Dream To Remember by Milli Vanilli
637                        He Goes On by Belinda Carlisle
698                                I'm Free by Jon Secada
703                          Tu Mejor Amigo by Jon Secada
724                          Solo Tu Imagen by Jon Secada
825                       Break Through by Colbie Caillat
836                      Dreams Collide by Colbie Caillat
847                           Mistletoe by Colbie Caillat
856           

In [129]:
dataset.iloc[cluster_centers_indices]
X[cluster_centers_indices]

array([[ 0.        , -3.77905759,  0.48412762, ...,  0.08677355,
         0.82175227,  0.5152013 ],
       [ 0.        , -2.42198953,  0.47419468, ...,  0.17034068,
         0.89627392,  0.50980987],
       [ 0.        , -3.61640489,  0.44507517, ...,  0.18136273,
         0.50352467,  0.47922701],
       ..., 
       [ 9.        , -3.96753927,  0.64042572, ...,  0.08156313,
         0.89526687,  0.6834828 ],
       [ 9.        , -6.39720768,  0.31749742, ...,  0.11623246,
         0.06022155,  0.50828175],
       [ 9.        , -6.72495637,  0.62047538, ...,  0.09689379,
         0.43907351,  0.33181887]])