# KMeans Clustering

In this notebook, we're gonna cluster our tracks.

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
input_data_path = "features_msd_lda_sp.csv"
NO_OF_CLUSTER = 100
dataset = pd.read_csv(input_data_path)
dataset.drop(["Unnamed: 0"], 1, inplace=True)
dataset


Unnamed: 0,genre,track_id,artist_name,title,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,...,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y,id
0,classic pop and rock,TRNJTPB128F427AE9F,Blue Oyster Cult,Screams,-10.659,148.462,1,4,0,189.80526,...,4,-9.009,0,0.0806,0.32600,0.014300,0.2240,0.4930,149.356,2KsnSzoJlmfT44jFOqRP1E
1,classic pop and rock,TRLFJHA128F427AEEA,Blue Oyster Cult,Dance The Night Away,-13.494,112.909,1,10,0,158.19710,...,10,-11.838,0,0.0359,0.90300,0.000000,0.1810,0.1930,80.510,4qV4ErzHMz4XYGWxMkOFBw
2,classic pop and rock,TRCQZAG128F427DB97,Blue Oyster Cult,Debbie Denise,-12.786,117.429,4,7,1,250.22649,...,7,-7.034,1,0.0303,0.36100,0.002490,0.1170,0.4670,117.720,2o2jnahWDsvyo8v0vV3dif
3,classic pop and rock,TRSIZRN128F427DB95,Blue Oyster Cult,Morning Final,-11.952,100.901,1,0,1,254.17098,...,9,-6.756,0,0.0659,0.05170,0.033700,0.1520,0.5050,100.563,4YsdlcE3GqCHpZD4jNDJgD
4,classic pop and rock,TRDYTEO128F427DB90,Blue Oyster Cult,The Revenge Of Vera Gemini,-11.839,132.361,4,2,1,230.32118,...,9,-7.110,1,0.0659,0.00182,0.000002,0.0814,0.6570,132.659,4aumrXau5uVKj8xXcQELP3
5,classic pop and rock,TRKSICM128F427DB8B,Blue Oyster Cult,True Confessions,-13.760,121.001,4,10,1,177.55383,...,10,-7.493,1,0.0433,0.10200,0.000009,0.0539,0.8140,121.065,6sH5tkQoXL3YKV0hpXZoWj
6,classic pop and rock,TRJPXIV128F426697A,Blue Oyster Cult,Redeemed,-10.799,185.836,4,2,1,231.39220,...,11,-9.794,0,0.0374,0.08460,0.001370,0.3210,0.5170,186.627,1P37l3UBUo49cqATq6Qccz
7,classic pop and rock,TRXWSIN128F4265A40,Blue Oyster Cult,Workshop Of The Telescopes,-11.413,120.171,4,9,1,241.16200,...,2,-8.954,1,0.1140,0.25700,0.011700,0.1110,0.3540,120.413,55f36XW2w1moYF70wsaHLB
8,classic pop and rock,TRNVQPE128F426BBD2,Blue Oyster Cult,Godzilla,-12.083,88.548,4,6,0,469.89016,...,4,-7.719,1,0.0554,0.16400,0.000008,0.6050,0.6960,184.024,6N0AnkdDFZUetw8KAGHV7e
9,classic pop and rock,TRUUZXH128F426C1AD,Blue Oyster Cult,E.T.I. (Extra Terrestrial Intelligence),-7.264,97.298,4,9,1,226.03710,...,9,-5.875,1,0.0486,0.24500,0.002490,0.4350,0.6860,97.335,5KBdHzTROSlD3dACh91sZx


In [4]:
string_features = ["track_id", "id", "artist_name", "title"]

In [7]:
datacopy = dataset.copy()
datacopy.drop(string_features, 1, inplace = True)
genres = dataset.genre.unique()
for index, genre in zip(range(0, len(genres)),genres):
    datacopy.loc[datacopy['genre'] == genre, 'genre'] = index 
for column in datacopy.columns:
    max_of_column = datacopy[column].abs().max()
    datacopy[column] = datacopy[column].apply(lambda x: x / max_of_column)

datacopy

Unnamed: 0,genre,loudness_x,tempo_x,time_signature,key_x,mode_x,duration,avg_timbre1,avg_timbre2,avg_timbre3,...,energy,key_y,loudness_y,mode_y,speechiness,acousticness,instrumentalness,liveness,valence,tempo_y
0,0.0,-0.263471,0.569809,0.142857,0.363636,0.0,0.138218,0.752362,-0.301305,0.260673,...,0.786787,0.363636,-0.255212,0.0,0.084842,0.327309,0.014547,0.224449,0.496475,0.611887
1,0.0,-0.333548,0.433354,0.142857,0.909091,0.0,0.115201,0.647695,-0.211142,0.182626,...,0.145145,0.909091,-0.335354,0.0,0.037789,0.906627,0.000000,0.181363,0.194361,0.329836
2,0.0,-0.316047,0.450702,0.571429,0.636364,1.0,0.182217,0.733766,0.059400,0.339619,...,0.657658,0.636364,-0.199263,1.0,0.031895,0.362450,0.002533,0.117234,0.470292,0.482279
3,0.0,-0.295432,0.387266,0.142857,0.000000,1.0,0.185090,0.754109,0.109624,0.422476,...,0.899900,0.818182,-0.191388,0.0,0.069368,0.051908,0.034283,0.152305,0.508560,0.411990
4,0.0,-0.292639,0.508012,0.571429,0.181818,1.0,0.167722,0.745090,0.077296,0.162593,...,0.854855,0.818182,-0.201416,1.0,0.069368,0.001827,0.000002,0.081563,0.661631,0.543482
5,0.0,-0.340123,0.464411,0.571429,0.909091,1.0,0.129296,0.700382,-0.047945,0.313078,...,0.889890,0.909091,-0.212266,1.0,0.045579,0.102410,0.000009,0.054008,0.819738,0.495983
6,0.0,-0.266932,0.713253,0.571429,0.181818,1.0,0.168502,0.728208,-0.123193,0.251993,...,0.659660,1.000000,-0.277450,0.0,0.039368,0.084940,0.001394,0.321643,0.520645,0.764580
7,0.0,-0.282109,0.461226,0.571429,0.818182,1.0,0.175617,0.703682,-0.219683,0.228799,...,0.754755,0.181818,-0.253654,1.0,0.120000,0.258032,0.011902,0.111222,0.356495,0.493312
8,0.0,-0.298670,0.339854,0.571429,0.545455,0.0,0.342179,0.703932,0.072848,0.348543,...,0.869870,0.363636,-0.218669,1.0,0.058316,0.164659,0.000008,0.606212,0.700906,0.753916
9,0.0,-0.179553,0.373437,0.571429,0.818182,1.0,0.164602,0.841962,-0.005684,0.292341,...,0.878879,0.818182,-0.166431,1.0,0.051158,0.245984,0.002533,0.435872,0.690836,0.398765


In [8]:
def create_input_for_clustering(dataset):
    genres = dataset.genre.unique()
    for index, genre in zip(range(0, len(genres)),genres):
        dataset.loc[dataset['genre'] == genre, 'genre'] = index 
    #dataset.drop(string_features, 1, inplace = True)
    return dataset.as_matrix()

## KMeans Clustering

In [9]:
X = create_input_for_clustering(datacopy.copy())
kmeans = KMeans(n_clusters=319)
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print("inertia:", inertia)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

inertia: 7304.43861566
Silhouette Coefficient: 0.108


In [10]:
labels

array([ 96,  49,   6, ..., 186, 138, 126], dtype=int32)

In [11]:
def weighted_norm(arr1, arr2, ind=-1, weight=10):
    if ind != -1:
        arr1[ind] *= weight
        arr2[ind] *= weight
    return np.linalg.norm(arr1-arr2)

In [12]:
all_distances = [[weighted_norm(X[ind1], X[ind2]) for ind1 in range(0,X.shape[0])] for ind2 in range(0,X.shape[0])]

In [15]:
max_distance = np.max(all_distances)
max_distance

9.9547324656676821

In [16]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
# print(random_track_indices)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
# print(test_input.shape)
cluster_numbers = kmeans.predict(test_input)
column_dict = {k: v for v, k in enumerate(list(dataset))}
seed_rec_distances = []
rec_to_rec_distances = []
seed_to_all_distances = []
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = kmeans.labels_ == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(X[ind], test_input[i]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])
    seed_rec_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in indices]))
    rec_to_rec_distances.append(np.mean([np.mean([all_distances[ind1][ind2] for ind1 in indices if ind1 != ind2]) for ind2 in indices]))
    seed_to_all_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in range(0,X.shape[0])]))
print ('mean distance between seed and recommended songs: ' + str(np.mean(seed_rec_distances)/max_distance))
print ('mean distance between all recommended songs: ' + str(np.mean(rec_to_rec_distances)/max_distance))
print ('mean distance between seed and all songs: ' + str(np.mean(seed_to_all_distances)/max_distance))
print ('normalized mean distance between seed and recommended songs: ' + str(np.mean(seed_rec_distances)/np.mean(seed_to_all_distances)))

Superhuman by Velvet Revolver
4175                       Let's Be Natural by The Rutles
309                        Sidetracked by Clifford T Ward
2471                    Hearts by Huey Lewis And The News
3582                             Jesus Christ by Big Star
3243                        Fat Fun by The Monochrome Set
5676                       Raised Eyebrows by The Feelies
3276               Do It Good by KC And The Sunshine Band
2825                             D-days by Hazel O'Connor
2468    Some Of My Lies Are True (Sooner Or Later) by ...
2473               Stop Trying by Huey Lewis And The News
dtype: object
Shall We Dance by Stacey Kent
10753                           Eterna by Egberto Gismonti
10524                      Ain't Misbehavin' by Hank Jones
10629                      Quero-Te Assim by Altemar Dutra
10531                I'm Glad There Is You by Natalie Cole
10576              The Very Thought Of You by Natalie Cole
10768                                 Bill by Jane 

## Affinity Propagation

In [None]:
X = create_input_for_clustering(datacopy.copy())
af = AffinityPropagation().fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))



In [18]:
import random 
random_track_indices = random.sample(range(len(X)), 10)
test_input = create_input_for_clustering(datacopy.copy())[random_track_indices]
cluster_numbers = af.predict(test_input)
column_dict = {k: v for v, k in enumerate(list(dataset))}
seed_rec_distances = []
rec_to_rec_distances = []
seed_to_all_distances = []
for i, (cluster_number, random_track_ind) in enumerate(zip(cluster_numbers, random_track_indices)):
    index = labels == cluster_number
    indices = [ind for ind, value in zip(range(0, len(labels)), index) if value == True]
    d = ([weighted_norm(X[ind], test_input[i]) for ind in indices])
    ind = np.argsort(d)[::-1][:10]
    indices = [indices[x] for x in ind]
    print(dataset.iloc[random_track_ind]["title"] + " by " + dataset.iloc[random_track_ind]["artist_name"])
    print ('======')
    print(dataset.iloc[indices]["title"] + " by " + dataset.iloc[indices]["artist_name"])
    seed_rec_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in indices]))
    rec_to_rec_distances.append(np.mean([np.mean([all_distances[ind1][ind2] for ind1 in indices if ind1 != ind2]) for ind2 in indices]))
    seed_to_all_distances.append(np.mean([all_distances[ind][random_track_ind] for ind in range(0,X.shape[0])]))
    

NameError: name 'af' is not defined

In [129]:
dataset.iloc[cluster_centers_indices]
X[cluster_centers_indices]

array([[ 0.        , -3.77905759,  0.48412762, ...,  0.08677355,
         0.82175227,  0.5152013 ],
       [ 0.        , -2.42198953,  0.47419468, ...,  0.17034068,
         0.89627392,  0.50980987],
       [ 0.        , -3.61640489,  0.44507517, ...,  0.18136273,
         0.50352467,  0.47922701],
       ..., 
       [ 9.        , -3.96753927,  0.64042572, ...,  0.08156313,
         0.89526687,  0.6834828 ],
       [ 9.        , -6.39720768,  0.31749742, ...,  0.11623246,
         0.06022155,  0.50828175],
       [ 9.        , -6.72495637,  0.62047538, ...,  0.09689379,
         0.43907351,  0.33181887]])