In [24]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from random import shuffle

import os, cPickle

PROCESSED_DATA_DIR = "processed_data"
NUM_CLUSTERS = 10

In [25]:
# K Means attempts to cluster data by splitting data into groups of equal variance.
# Requires number of clusters to be specified.
# Centroid: mean of cluster.
# Aims to choose centroids that minimize the inertia, or intra-cluster sum of squared distance from the mean.

# Drawbacks
# Note that inertia makes the assumption that clusters are convex and isotropic (identical in all directions).
# Inertia responds poorly to elongated clusters.
# Inertia is not a normalized metric. PCA can reduce the inflation of Euclidean distances that occur with high-dimensional spaces.
# 1. Choose initial centroid, k samples from the dataset.
# 2. Assign each sample to its nearest centroid
# 3. Create new centroids by taking the mean value of all the samples assigned to each previous centroid.
# K means will always converge, but this might be a local minimum, heavily dependent on centroid initialization.
# As such, centroid initialization is done several times.

# In other words, k-means is EM w/small, all-equal diagonal covar matrix.

In [26]:
def get_data():
    ret = []
    file_paths = []
    vec = DictVectorizer()

    dirs_list        = next(os.walk(PROCESSED_DATA_DIR))[1]
    joined_dirs_list = [os.path.join(PROCESSED_DATA_DIR, d) for d in dirs_list]

    for subdir in joined_dirs_list:
        # Walk files in every subdirectory.
        for root, dirs, files in os.walk(subdir):
            for file_item in files:
                file_path = os.path.join(subdir, file_item)

                # Read file and vectorize lyrics.
                with open(file_path) as f:
                    ret.append(cPickle.load(f))
                
                file_paths.append(file_path)

    return vec.fit_transform(ret).toarray(), file_paths, ret

In [27]:
data, file_paths, freqs = get_data()

In [28]:
assert(len(data) == len(file_paths))

In [29]:
print data

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [30]:
# n_jobs = -1 means that KMeans should be as parallel as possible.
estimator = KMeans(n_clusters=NUM_CLUSTERS, n_jobs=-1)

In [31]:
estimator.fit_predict(data)

array([0, 4, 1, ..., 8, 8, 8], dtype=int32)

In [32]:
song_labels = estimator.labels_
songs_by_label = [[] for i in xrange(NUM_CLUSTERS)]

for i in xrange(len(data)):
    songs_by_label[song_labels[i]].append(file_paths[i])

In [33]:
for i in xrange(NUM_CLUSTERS):
    print "{}: {}".format(i, len(songs_by_label[i]))

0: 33
1: 309
2: 38
3: 265
4: 514
5: 10
6: 5
7: 9
8: 1032
9: 1


In [34]:
words = [i for i in xrange(len(songs_by_label))]
for i in xrange(len(songs_by_label)):
    temp = {}
    for song_name in songs_by_label[i]:
        idx = file_paths.index(song_name)
        song_freq = freqs[idx]
        for k, v in song_freq.iteritems():
            if k in temp:
                temp[k] += v
            else:
                temp[k] = v
    words[i] = temp

In [35]:
sorted_top_words = [sorted(d, key=d.get, reverse=True) for d in words]

In [38]:
num_select = 30
for cluster in sorted_top_words:
    print cluster[0:num_select]

[u'like', u'ya', u'im', u'know', u'got', u'me', u'aint', u'get', u'dont', u'na', u'nigga', u'it', u'let', u'back', u'cause', u'yeah', u'go', u'yall', u'bitch', u'yo', u'head', u'tell', u'see', u'gon', u'shit', u'say', u'wan', u'lyrics', u'oh', u'that']
[u'im', u'like', u'get', u'got', u'dont', u'know', u'cause', u'me', u'aint', u'shit', u'back', u'lyrics', u'nigga', u'see', u'one', u'na', u'make', u'it', u'fuck', u'man', u'go', u'cant', u'em', u'bitch', u'thats', u'say', u'yeah', u'take', u'come', u'time']
[u'love', u'la', u'yeah', u'me', u'like', u'know', u'one', u'im', u'got', u'now', u'dont', u'right', u'get', u'would', u'see', u'wa', u'thats', u'lyrics', u'hood', u'never', u'go', u'cant', u'girls', u'say', u'give', u'make', u'momma', u'back', u'aint', u'you']
[u'nigga', u'im', u'get', u'shit', u'got', u'like', u'dont', u'fuck', u'niggas', u'aint', u'bitch', u'know', u'cause', u'ya', u'ass', u'me', u'niggaz', u'see', u'yo', u'lyrics', u'back', u'na', u'thats', u'money', u'em', u'man

In [37]:
# Verify center clusters are around 25 and 10.
print(estimator.cluster_centers_)

[[  1.30104261e-18   8.67361738e-19   3.25260652e-19 ...,   1.62630326e-19
    1.62630326e-19   6.50521303e-19]
 [  6.47249191e-03   3.23624595e-03  -6.72205347e-18 ...,  -3.36102673e-18
   -3.36102673e-18  -1.34441069e-17]
 [  1.30104261e-18   8.67361738e-19   3.25260652e-19 ...,   1.62630326e-19
    1.62630326e-19   6.50521303e-19]
 ..., 
 [  4.33680869e-19   0.00000000e+00   1.08420217e-19 ...,   5.42101086e-20
    5.42101086e-20   2.16840434e-19]
 [  1.93798450e-03  -1.51788304e-17   9.68992248e-04 ...,  -5.47522097e-18
    9.68992248e-04   3.87596899e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
