In [26]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from random import shuffle

import os, cPickle

PROCESSED_DATA_DIR = "processed_data"
NUM_CLUSTERS = 30

In [27]:
# K Means attempts to cluster data by splitting data into groups of equal variance.
# Requires number of clusters to be specified.
# Centroid: mean of cluster.
# Aims to choose centroids that minimize the inertia, or intra-cluster sum of squared distance from the mean.

# Drawbacks
# Note that inertia makes the assumption that clusters are convex and isotropic (identical in all directions).
# Inertia responds poorly to elongated clusters.
# Inertia is not a normalized metric. PCA can reduce the inflation of Euclidean distances that occur with high-dimensional spaces.
# 1. Choose initial centroid, k samples from the dataset.
# 2. Assign each sample to its nearest centroid
# 3. Create new centroids by taking the mean value of all the samples assigned to each previous centroid.
# K means will always converge, but this might be a local minimum, heavily dependent on centroid initialization.
# As such, centroid initialization is done several times.

# In other words, k-means is EM w/small, all-equal diagonal covar matrix.

In [28]:
def get_data():
    ret = []
    file_paths = []
    vec = DictVectorizer()

    dirs_list        = next(os.walk(PROCESSED_DATA_DIR))[1]
    joined_dirs_list = [os.path.join(PROCESSED_DATA_DIR, d) for d in dirs_list]

    for subdir in joined_dirs_list:
        # Walk files in every subdirectory.
        for root, dirs, files in os.walk(subdir):
            for file_item in files:
                file_path = os.path.join(subdir, file_item)

                # Read file and vectorize lyrics.
                with open(file_path) as f:
                    ret.append(cPickle.load(f))
                
                file_paths.append(file_path)

    return vec.fit_transform(ret).toarray(), file_paths

In [29]:
data, file_paths = get_data()

In [30]:
assert(len(data) == len(file_paths))

In [31]:
print data

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [32]:
# n_jobs = -1 means that KMeans should be as parallel as possible.
estimator = KMeans(n_clusters=NUM_CLUSTERS, n_jobs=-1)

In [33]:
estimator.fit(data)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=30, n_init=10,
    n_jobs=-1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [34]:
estimator.fit_predict(data)

array([27, 19,  4, ...,  0,  6,  6], dtype=int32)

In [35]:
song_labels = estimator.labels_
songs_by_label = [[] for i in xrange(NUM_CLUSTERS)]

for i in xrange(len(data)):
    songs_by_label[song_labels[i]].append(file_paths[i])

In [38]:
for i in xrange(NUM_CLUSTERS):
    print "{}: {}".format(i, len(songs_by_label[i]))

0: 450
1: 1
2: 1
3: 5
4: 285
5: 4
6: 707
7: 6
8: 1
9: 1
10: 5
11: 2
12: 6
13: 7
14: 5
15: 1
16: 3
17: 2
18: 97
19: 299
20: 1
21: 51
22: 75
23: 2
24: 1
25: 5
26: 1
27: 189
28: 1
29: 2


In [39]:
print songs_by_label[0]



In [36]:
# Verify center clusters are around 25 and 10.
print(estimator.cluster_centers_)

[[  4.44444444e-03  -9.97465999e-18  -8.56519716e-18 ...,  -4.28259858e-18
   -4.28259858e-18  -1.71303943e-17]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [ -1.12757026e-17   2.81892565e-18  -2.81892565e-18 ...,  -1.40946282e-18
   -1.40946282e-18  -5.63785130e-18]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
