In [7]:
from sklearn.cluster import KMeans

def Kmeans(X, k=2, random_state=None):
    """Cluster the rows of ``X`` with k-means for one or several values of k.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; passed unchanged to ``KMeans.fit``.
    k : int or sequence of int, default=2
        Number of clusters. A scalar (Python ``int`` or NumPy integer such
        as an element of ``np.arange``) produces a single clustering; any
        other value is treated as a sequence and one clustering is fitted
        per entry.
    random_state : optional, default=None
        Forwarded to every ``KMeans`` instance so that runs can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        For scalar ``k``: the fitted ``labels_`` vector.
        For a sequence: an integer array of shape ``(n_samples, len(k))``
        whose column ``i`` holds the labels obtained with ``k[i]``.
    """
    # NumPy integers are not instances of ``int``; accept them as scalars too
    # instead of falling through to ``len(k)`` and raising TypeError.
    if isinstance(k, (int, np.integer)):
        return KMeans(n_clusters=k, random_state=random_state).fit(X).labels_

    # Integer storage keeps the labels usable as indices and consistent with
    # the scalar branch (a default ``np.zeros`` buffer would make them floats).
    labels = np.empty((np.shape(X)[0], len(k)), dtype=int)
    for column, n_clusters in enumerate(k):
        model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
        labels[:, column] = model.labels_

    return labels

In [9]:
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap


# Size of the synthetic data set and of the reduced representation.
nrDataPoints, nrFeatures = 1000, 4
nrFeaturesKeep = 2

X = CreateData(nrDataPoints, nrFeatures)

# PCA: fit on X, then project X onto the kept components.
pca = PCA(nrFeaturesKeep).fit(X)
Y_PCA = pca.transform(X)

# Isomap: fit on X, then embed X into the same number of dimensions.
isomap = Isomap(n_components=nrFeaturesKeep).fit(X)
Y_ISO = isomap.transform(X)


In [8]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import random
import math
from sklearn.utils import shuffle

def CreateData(nrDataPoints = 1000, nrFeatures = 30, random_state = None):
    """Draw zero-mean Gaussian samples with a random diagonal covariance.

    Each feature gets its own variance, drawn uniformly from [0, 1); the
    covariance matrix is diagonal, so features are sampled independently.

    Parameters
    ----------
    nrDataPoints : int, default=1000
        Number of samples (rows) to draw.
    nrFeatures : int, default=30
        Number of features (columns).
    random_state : None, int or numpy.random.RandomState, default=None
        ``None`` uses NumPy's global legacy generator (the original
        behaviour, still controllable through ``np.random.seed``).
        An int seeds a private ``RandomState``; an existing ``RandomState``
        instance is used as-is.

    Returns
    -------
    numpy.ndarray of shape (nrDataPoints, nrFeatures)
    """
    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    mean = np.zeros(nrFeatures)
    variances = rng.rand(nrFeatures)
    covariance = np.diag(variances)
    return rng.multivariate_normal(mean, covariance, nrDataPoints)


In [35]:
from sklearn.metrics import silhouette_score

def EvaluateClustering(X,labels):
    """Silhouette score for one labeling, or for each column of several.

    Parameters
    ----------
    X : array-like
        Data the labels refer to; passed unchanged to ``silhouette_score``.
    labels : array-like
        Either a 1-D vector of cluster labels (one per sample) or a 2-D
        array in which every column is a separate labeling of the same
        samples. Lists and DataFrames are accepted as well as ndarrays.

    Returns
    -------
    float or numpy.ndarray
        A single score for 1-D ``labels``; otherwise a float array with one
        score per column.

    Notes
    -----
    Errors raised by ``silhouette_score`` are not caught here.
    NOTE(review): a labeling with a single cluster (e.g. k=1) is presumably
    rejected by scikit-learn - confirm before sweeping k from 1.
    """
    # Coerce first so that plain lists / DataFrames support ``.ndim`` and
    # positional column slicing below.
    labels = np.asarray(labels)

    if labels.ndim == 1:
        return silhouette_score(X, labels)

    return np.array(
        [silhouette_score(X, labels[:, i]) for i in range(labels.shape[1])],
        dtype=float,
    )

In [24]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from numpy.random import random_sample
from math import sqrt, log


# returns series of random values sampled between min and max values of passed col
def get_rand_data(col):
	"""Return uniform random values spanning the range of ``col``.

	One value is drawn per element of ``col`` from
	``[col.min(), col.max())`` using ``numpy.random.random_sample``.
	The result carries the same index as ``col`` so it stays aligned with
	the source column (previously it was re-indexed 0..n-1).
	"""
	low = col.min()
	span = col.max() - low
	return pd.Series(random_sample(len(col)) * span + low, index=col.index)

def iter_kmeans(df, n_clusters, num_iters=5):
	"""Fit k-means ``num_iters`` times on ``df`` and collect the inertias.

	Parameters
	----------
	df : array-like or DataFrame
		Data passed unchanged to ``KMeans.fit``.
	n_clusters : int
		Number of clusters for every run.
	num_iters : int, default=5
		Number of independent fits.

	Returns
	-------
	pandas.Series
		Float series indexed 1..num_iters holding ``inertia_`` of each fit.
	"""
	runs = range(1, num_iters + 1)
	inertias = [KMeans(n_clusters=n_clusters).fit(df).inertia_ for _ in runs]
	# dtype is explicit: a Series created from an index alone has a
	# version-dependent default dtype (object on recent pandas), which would
	# leak into every statistic computed from the result.
	return pd.Series(inertias, index=runs, dtype=float)

def gap_statistic(data, max_k=10):
	"""Compute a gap statistic for k = 1..max_k.

	For every k the data is clustered with k-means and its inertia is
	compared, on a log scale, with the inertia obtained on a reference data
	set drawn uniformly inside the per-column range of ``data``:

		gap(k) = mean(log(W_ref)) - log(W_data)

	Parameters
	----------
	data : array-like or DataFrame
		Observations; wrapped in a ``pandas.DataFrame``.
	max_k : int, default=10
		Largest number of clusters to evaluate.

	Returns
	-------
	pandas.Series
		Float series indexed 1..max_k with the gap value for each k.

	Notes
	-----
	A single reference data set is generated per k; the averaging is over
	repeated k-means fits on that one data set (see ``iter_kmeans``).
	"""
	df = pd.DataFrame(data)
	ks = range(1, max_k + 1)
	gaps = []
	for k in ks:
		km_act = KMeans(n_clusters=k, n_init=3).fit(df)

		# Reference data: each column resampled uniformly over its own range.
		ref = df.apply(get_rand_data)
		ref_inertias = iter_kmeans(ref, n_clusters=k)

		# The statistic averages the *log* inertias; taking the log of the
		# mean inertia instead biases the reference term upwards.
		# ``astype(float)`` guards against an object-dtype Series.
		ref_log_inertia = np.log(ref_inertias.astype(float)).mean()

		gaps.append(ref_log_inertia - log(km_act.inertia_))

	# Built in one go with an explicit dtype so the result is float64
	# regardless of the pandas version.
	return pd.Series(gaps, index=ks, dtype=float)


In [25]:

# Gap values for k = 1..15 on the data set X.
gap_statistic(X, max_k=15)

1     1.412842
2     1.400787
3     1.305895
4     1.237148
5     1.239167
6     1.211493
7     1.179472
8     1.176165
9     1.164047
10    1.177190
11    1.179122
12    1.146588
13    1.174855
14    1.132608
15    1.151829
dtype: float64