In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from random import shuffle

import os, cPickle

PROCESSED_DATA_DIR = "processed_data"

In [2]:
# K Means attempts to cluster data by splitting data into groups of equal variance.
# Requires number of clusters to be specified.
# Centroid: mean of cluster.
# Aims to choose centroids that minimize the inertia, or intra-cluster sum of squared distance from the mean.

# Drawbacks
# Note that inertia makes the assumption that clusters are convex and isotropic (identical in all directions).
# Inertia responds poorly to elongated clusters.
# Inertia is not a normalized metric. PCA can reduce the inflation of Euclidean distances that occur with high-dimensional spaces.
# 1. Choose initial centroid, k samples from the dataset.
# 2. Assign each sample to its nearest centroid
# 3. Create new centroids by taking the mean value of all the samples assigned to each previous centroid.
# K means will always converge, but this might be a local minimum, heavily dependent on centroid initialization.
# As such, centroid initialization is done several times.

# In other words, k-means is EM w/small, all-equal diagonal covar matrix.

In [3]:
def get_data():
    ret = []
    vec = DictVectorizer()

    dirs_list        = next(os.walk(PROCESSED_DATA_DIR))[1]
    joined_dirs_list = [os.path.join(PROCESSED_DATA_DIR, d) for d in dirs_list]

    for subdir in joined_dirs_list:
        # Walk files in every subdirectory.
        for root, dirs, files in os.walk(subdir):
            for file_item in files:
                file_path = os.path.join(subdir, file_item)

                # Read file and vectorize lyrics.
                with open(file_path) as f:
                    ret.append(cPickle.load(f))

    return vec.fit_transform(ret).toarray()

In [4]:
data = get_data()

In [5]:
print data

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [6]:
estimator = KMeans(n_jobs=-1)

In [7]:
estimator.fit(data)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=-1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [9]:
estimator.fit_predict(data)

array([2, 2, 1, ..., 5, 5, 5], dtype=int32)

In [11]:
len(estimator.labels_)

2216

In [8]:
# Verify center clusters are around 25 and 10.
print(estimator.cluster_centers_)

[[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  5.61797753e-03   5.61797753e-03  -7.48099499e-18 ...,  -3.74049750e-18
   -3.74049750e-18  -1.49619900e-17]
 [  2.28658537e-03  -1.60461922e-17   1.52439024e-03 ...,   7.62195122e-04
    7.62195122e-04   3.04878049e-03]
 ..., 
 [  6.19834711e-03   2.06611570e-03  -8.89045781e-18 ...,  -4.44522891e-18
   -4.44522891e-18  -1.77809156e-17]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
