In [37]:
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter

In [38]:
from sklearn.cluster import KMeans

In [63]:
# Load the Spotify track table from the working directory.
data = pd.read_csv("SpotifyFeatures.csv")

# Encode `mode` numerically: 1 where the value is "Major", 0 for anything else.
data["mode"] = (data["mode"] == "Major").astype(int)

# Drop the identifier columns plus `key` and `time_signature`; everything
# that remains besides `genre` is shown as numeric in the preview below.
data = data.drop(
    columns=["artist_name", "track_id", "track_name", "key", "time_signature"]
)

data.head()

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence
0,Movie,0,0.611,0.389,99373,0.91,0.0,0.346,-1.828,1,0.0525,166.969,0.814
1,Movie,1,0.246,0.59,137373,0.737,0.0,0.151,-5.559,0,0.0868,174.003,0.816
2,Movie,3,0.952,0.663,170267,0.131,0.0,0.103,-13.879,0,0.0362,99.488,0.368
3,Movie,0,0.703,0.24,152427,0.326,0.0,0.0985,-12.178,1,0.0395,171.758,0.227
4,Movie,4,0.95,0.331,82625,0.225,0.123,0.202,-21.15,1,0.0456,140.576,0.39


In [84]:
# Rescale each feature column by a single per-column constant so the columns
# have comparable magnitudes before clustering.
#
# The vectorised Series.max()/Series.min() are used instead of the Python
# builtins: the builtins iterate the column element by element and give
# order-dependent results if a NaN is present, while the pandas reductions
# skip NaN by default.
for label in (
    "popularity",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "liveness",
    "mode",
    "speechiness",
    "tempo",
    "valence",
):
    # Divide by the column maximum so the largest value becomes 1.0.
    data[label] = data[label] / data[label].max()

# Loudness is divided by its minimum instead of its maximum. The rows
# displayed after loading show negative loudness values, so (assuming the
# minimum is negative) the most negative value maps to 1.0 and the sign flips.
# NOTE(review): this overwrites `data` in place; re-running the cell without
# reloading the CSV divides loudness by the *new* minimum again.
for label in ("loudness",):
    data[label] = data[label] / data[label].min()

data.head()

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence
0,Movie,0.0,0.613454,0.393327,0.017896,0.910911,0.0,0.346,0.034848,1.0,0.054292,0.68739,0.814
1,Movie,0.01,0.246988,0.596562,0.024739,0.737738,0.0,0.151,0.105973,0.0,0.089762,0.716348,0.816
2,Movie,0.03,0.955823,0.670374,0.030663,0.131131,0.0,0.103,0.264579,0.0,0.037435,0.409579,0.368
3,Movie,0.0,0.705823,0.242669,0.02745,0.326326,0.0,0.0985,0.232152,1.0,0.040848,0.707105,0.227
4,Movie,0.04,0.953815,0.334681,0.01488,0.225225,0.123123,0.202,0.403187,1.0,0.047156,0.578733,0.39


In [85]:
def run_kmeans(data, labels, num_clust=26):
    """Fit k-means on `data` and rank the known labels inside each cluster.

    Parameters
    ----------
    data : feature rows passed unchanged to ``KMeans.fit``.
    labels : iterable of ground-truth labels, aligned row-for-row with `data`.
    num_clust : number of clusters to fit (default 26).

    Returns
    -------
    (model, ranked) where `model` is the fitted ``KMeans`` object and `ranked`
    maps each cluster id seen in ``model.labels_`` to the list of labels that
    fell into that cluster, ordered from most to least frequent.
    """
    model = KMeans(n_clusters=num_clust, random_state=1).fit(data)

    # Per-cluster tally of how often each ground-truth label was assigned to it.
    tallies = defaultdict(Counter)
    for cluster_id, true_label in zip(model.labels_, labels):
        tallies[cluster_id].update((true_label,))

    # Keep only the label names, in descending order of frequency.
    ranked = {
        cluster_id: [name for name, _ in tally.most_common()]
        for cluster_id, tally in tallies.items()
    }
    return model, ranked

def correct_top_n(labels, predicted, cluster_counts, top=1):
    """Return the fraction of samples whose true label is in the top-N for its cluster.

    Parameters
    ----------
    labels : sized iterable of ground-truth labels.
    predicted : iterable of cluster ids, aligned element-for-element with `labels`.
    cluster_counts : mapping of cluster id -> list of labels ordered from most
        to least frequent (the second value returned by ``run_kmeans``).
    top : how many of the leading labels of a cluster count as a hit (default 1).

    Returns
    -------
    float in [0, 1]. Returns 0.0 when `labels` is empty instead of raising
    ``ZeroDivisionError``.

    A predicted cluster id that is absent from `cluster_counts` has no known
    labels, so it is scored as a miss rather than raising ``KeyError``.
    """
    total = len(labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    hits = sum(
        label in cluster_counts.get(cluster, ())[:top]
        for label, cluster in zip(labels, predicted)
    )
    return hits / total
    

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# the same between runs.
train = data.sample(frac=0.9, random_state=200)
test = data.drop(train.index)

# The feature matrices (everything except the `genre` label) do not change
# between iterations, so build them once up front.
train_features = train.drop(columns="genre")
test_features = test.drop(columns="genre")

# Sweep the number of clusters in steps of 26 (26, 52, ..., 260) and, for each
# fit, print the held-out top-1 through top-5 accuracy.
for num_clust in range(26, 261, 26):
    kmeans, cluster_counts = run_kmeans(train_features, train["genre"], num_clust=num_clust)
    predicted = kmeans.predict(test_features)
    for top in range(1, 6):
        print(num_clust, top, correct_top_n(test["genre"], predicted, cluster_counts, top=top))

[ 9  0 18 ... 11 25 25]
26 1 0.22283332617195892
26 2 0.3643277617840416
26 3 0.4588149357624715
26 4 0.5305289391139948
26 5 0.5917157220813819
[25 17 14 ... 29 22 20]
52 1 0.2456494650453315
52 2 0.3935891376272934
52 3 0.4921153267735144
52 4 0.5676535040604993
52 5 0.6339964766037898
[47 67 76 ... 45 19 24]
78 1 0.2578954152881021
78 2 0.40463197696901987
78 3 0.5081424827052808
78 4 0.5872470244489323
78 5 0.653804838224552
[12 45 80 ... 90 46 13]
104 1 0.2658445408842865
104 2 0.418596657070425
104 3 0.5160916083014652
104 4 0.5973875306148756
104 5 0.6601641387014996
[109 102  63 ...   5  13  16]
130 1 0.2695827783268165
130 2 0.42362394190693076
130 3 0.5266188286856014
130 4 0.6064108623727066
130 5 0.670003867142182
[152  88  48 ...  62  86  82]
156 1 0.2732780475228806
156 2 0.43483865423452067
156 3 0.536974176083874
156 4 0.6153052893911399
156 5 0.675890516908005
