In [1]:
import gensim
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
from spherecluster import VonMisesFisherMixture
from spherecluster import SphericalKMeans

ImportError: cannot import name 'VonMisesFisherMixture'

In [2]:
# Location of the trained word-embedding file; a later cell loads it with
# gensim's word2vec-format reader.
wem_newpath = "../Charter-school-identities/data/wem_model_train250_nostem_unlapped_300d.txt"

In [3]:
# Read the embeddings stored at wem_newpath into a gensim KeyedVectors object
# using the word2vec file format loader.
model = gensim.models.KeyedVectors.load_word2vec_format(wem_newpath)

In [4]:
# Look up the vector of every vocabulary entry and collect them in one array
# (one row per word).
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4.0 — confirm the
# installed gensim version before re-running; `model.vectors` is the likely
# replacement when upgrading.
word_vecs = model[model.vocab]

In [5]:
# Despite the name, these are the per-row vector norms (np.linalg.norm along
# axis 1), not sums; they are used below to scale each embedding to unit length.
row_sums = np.linalg.norm(word_vecs, axis=1)

In [8]:
# Sanity check: there should be exactly one norm per embedding row.
row_sums.shape

(421373,)

In [6]:
# Project every embedding onto the unit sphere: the norms are reshaped into a
# column so that each row is divided by its own norm via broadcasting.
unit_vecs = np.divide(word_vecs, row_sums.reshape(-1, 1))

In [8]:
# visualization from http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py


def visualize(data, n, seed):
    """Cluster ``data`` with KMeans and draw a per-cluster silhouette plot.

    Adapted from the scikit-learn "KMeans silhouette analysis" example.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster and score.
    n : int
        Number of clusters to fit.
    seed : int or None
        Forwarded to ``KMeans`` as ``random_state`` for reproducibility.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    The silhouette values are computed on every sample with
    ``silhouette_samples``' default metric (no subsampling). The average shown
    as the dashed red line is the mean of those per-sample values, so the
    pairwise-distance computation is performed only once.
    """
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 20)

    # Silhouette coefficients always lie in [-1, 1].
    ax1.set_xlim([-1, 1])
    # The (n + 1) * 10 leaves blank space between the silhouette bands of the
    # individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(data) + (n + 1) * 10])

    # Fit KMeans with the requested cluster count and the caller's seed.
    clusterer = KMeans(n_clusters=n, max_iter=600, random_state=seed)
    cluster_labels = clusterer.fit_predict(data)

    # Per-sample silhouette values; their mean is the overall silhouette score,
    # so there is no need for a second full pass through silhouette_score.
    sample_silhouette_values = silhouette_samples(data, cluster_labels)
    silhouette_avg = np.mean(sample_silhouette_values)

    y_lower = 10
    for i in range(n):
        # Collect the silhouette values of the samples in cluster i and sort
        # them so the band is drawn as a smooth profile. Boolean indexing
        # returns a copy, so the in-place sort leaves the full array untouched.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at its vertical midpoint.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start the next band 10 rows above this one to leave a visible gap.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-1, -.5, -.2, -0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n),
                 fontsize=14, fontweight='bold')

    plt.show()

In [9]:
def sil_score_large(data, labels, size, iterations=100, random_state=None):
    """Estimate the cosine silhouette score of a large dataset by subsampling.

    The score is computed ``iterations`` times, each on a random subsample of
    ``size`` points, and the results are averaged.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples that were clustered.
    labels : array-like of shape (n_samples,)
        Cluster label of each sample.
    size : int
        Number of samples drawn for each silhouette evaluation.
    iterations : int, default 100
        Number of independent subsamples to average over; must be >= 1.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the subsampling. ``None`` keeps the previous
        behaviour of leaving the choice to ``silhouette_score``. When a seed is
        given, a single generator is shared by all iterations so each
        iteration draws a different subsample but the overall result is
        reproducible.

    Returns
    -------
    float
        Mean of the subsampled silhouette scores.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(
            "iterations must be a positive integer, got %r" % (iterations,))

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One generator for the whole run: re-seeding inside the loop would
        # make every iteration evaluate the same subsample.
        rng = np.random.RandomState(random_state)

    total = 0
    for _ in range(iterations):
        total += silhouette_score(data, labels, metric='cosine',
                                  sample_size=size, random_state=rng)
    return total / iterations

In [11]:
def Kmeans_cluster(data, r_state):
    """Sweep KMeans over 2..14 clusters and pick the best silhouette score.

    For each cluster count the data is clustered with ``KMeans`` and scored
    with ``sil_score_large`` (subsamples of 10000 points). Progress and the
    cumulative elapsed time are printed after every cluster count.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    r_state : int or None
        Forwarded to ``KMeans`` as ``random_state``.

    Returns
    -------
    tuple
        ``(n_max, score_max)``: the cluster count with the highest average
        silhouette score and that score. Ties keep the smaller cluster count.
    """
    score_max = -2  # below the silhouette minimum of -1, so the first score always wins
    n_max = 1
    start = time.time()
    for n in range(2, 15):
        clusterer = KMeans(n_clusters=n, random_state=r_state)
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = sil_score_large(data, cluster_labels, 10000)
        print("For n_clusters =", n, "The average silhouette_score is :", silhouette_avg)
        if silhouette_avg > score_max:
            n_max = n
            score_max = silhouette_avg
        # Elapsed time is cumulative since the start of the sweep.
        end = time.time()
        print('Time Elapsed:{:f}, n:{:d}'.format(end - start, n))
    return n_max, score_max

In [16]:
# Run the KMeans sweep with a fixed seed of 10. The function defined in this
# notebook is `Kmeans_cluster`; the bare name `cluster` is not defined anywhere
# and raises NameError on a fresh kernel.
num_clusters, sil_score = Kmeans_cluster(unit_vecs, 10)

For n_clusters = 2 The average silhouette_score is : 0.0523329461738
Time Elapsed:1031.281104, n:2.000000
For n_clusters = 3 The average silhouette_score is : 0.0583830866218
Time Elapsed:2027.176679, n:3.000000
For n_clusters = 4 The average silhouette_score is : 0.0513328320906
Time Elapsed:3035.462379, n:4.000000
For n_clusters = 5 The average silhouette_score is : 0.0541665776446
Time Elapsed:4086.143611, n:5.000000
For n_clusters = 6 The average silhouette_score is : 0.0499594318867
Time Elapsed:5214.791464, n:6.000000
For n_clusters = 7 The average silhouette_score is : 0.052997966595
Time Elapsed:6302.545661, n:7.000000
For n_clusters = 8 The average silhouette_score is : 0.0308546454087
Time Elapsed:7328.889948, n:8.000000
For n_clusters = 9 The average silhouette_score is : 0.0291169623658
Time Elapsed:8269.395342, n:9.000000
For n_clusters = 10 The average silhouette_score is : 0.0262006434053
Time Elapsed:9138.801212, n:10.000000
For n_clusters = 11 The average silhouette_sc

KeyboardInterrupt: 

In [15]:
def skmeans_cluster(data, r_state=None):
    """Sweep spherical k-means over 2..14 clusters and pick the best silhouette score.

    For each cluster count the data is clustered with ``SphericalKMeans`` and
    scored with ``sil_score_large`` (subsamples of 10000 points). Progress and
    the cumulative elapsed time are printed after every cluster count.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    r_state : int or None, default None
        Forwarded to ``SphericalKMeans`` as ``random_state`` so the sweep can
        be made reproducible, mirroring ``Kmeans_cluster``. ``None`` keeps the
        estimator's default (unseeded) initialisation.

    Returns
    -------
    tuple
        ``(n_max, score_max)``: the cluster count with the highest average
        silhouette score and that score. Ties keep the smaller cluster count.
    """
    score_max = -2  # below the silhouette minimum of -1, so the first score always wins
    n_max = 1
    start = time.time()
    for n in range(2, 15):
        skm = SphericalKMeans(n_clusters=n, random_state=r_state)
        skm.fit(data)
        silhouette_avg = sil_score_large(data, skm.labels_, 10000)
        print("For n_clusters =", n, "The average silhouette_score is :", silhouette_avg)
        if silhouette_avg > score_max:
            n_max = n
            score_max = silhouette_avg
        # Elapsed time is cumulative since the start of the sweep.
        end = time.time()
        print('Time Elapsed:{:f}, n:{:d}'.format(end - start, n))
    return n_max, score_max

In [12]:
def vmf_cluster(data, r_state=None):
    """Sweep hard-assignment vMF mixtures over 2..14 clusters and pick the best.

    For each cluster count the data is clustered with
    ``VonMisesFisherMixture(posterior_type='hard')`` and scored with
    ``sil_score_large`` (subsamples of 10000 points). Progress and the
    cumulative elapsed time are printed after every cluster count.

    NOTE(review): the recorded output of the import cell shows an ImportError
    for ``VonMisesFisherMixture`` — confirm the installed spherecluster version
    exports it before calling this function.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    r_state : int or None, default None
        Forwarded to ``VonMisesFisherMixture`` as ``random_state`` so the sweep
        can be made reproducible, mirroring ``Kmeans_cluster``. ``None`` keeps
        the estimator's default (unseeded) initialisation.

    Returns
    -------
    tuple
        ``(n_max, score_max)``: the cluster count with the highest average
        silhouette score and that score. Ties keep the smaller cluster count.
    """
    score_max = -2  # below the silhouette minimum of -1, so the first score always wins
    n_max = 1
    start = time.time()
    for n in range(2, 15):
        vmf_hard = VonMisesFisherMixture(n_clusters=n, posterior_type='hard',
                                         random_state=r_state)
        vmf_hard.fit(data)
        silhouette_avg = sil_score_large(data, vmf_hard.labels_, 10000)
        print("For n_clusters =", n, "The average silhouette_score is :", silhouette_avg)
        if silhouette_avg > score_max:
            n_max = n
            score_max = silhouette_avg
        # Elapsed time is cumulative since the start of the sweep.
        end = time.time()
        print('Time Elapsed:{:f}, n:{:d}'.format(end - start, n))
    return n_max, score_max

In [18]:
# Time one plain KMeans fit (3 clusters, seed 10) on the unit-length vectors,
# as a baseline for comparison with the spherical variant.
start = time.time()
clusterer = KMeans(n_clusters=3, random_state=10)
cluster_labels = clusterer.fit_predict(unit_vecs)
end = time.time()
# Wall-clock seconds for the fit; the literal 3 is the cluster count being reported.
print('Time Elapsed:{:f}, n:{:f}'.format(end - start, 3))

Time Elapsed:74.169621, n:3.000000


In [19]:
# Time one SphericalKMeans fit (3 clusters, no seed passed) on the same
# unit-length vectors.
start = time.time()
skm = SphericalKMeans(n_clusters=3)
skm.fit(unit_vecs)
end = time.time()
# Wall-clock seconds for the fit; the literal 3 is the cluster count being reported.
print('Time Elapsed:{:f}, n:{:f}'.format(end - start, 3))

Time Elapsed:120.679900, n:3.000000


In [None]:
# Run the spherical k-means sweep on the unit-length vectors; the result is the
# best cluster count and its average silhouette score.
n, score = skmeans_cluster(unit_vecs)

For n_clusters = 2 The average silhouette_score is : 0.112996454388
Time Elapsed:509.523777, n:2.000000
For n_clusters = 3 The average silhouette_score is : 0.0854820228368
Time Elapsed:1014.226720, n:3.000000
For n_clusters = 4 The average silhouette_score is : 0.0830213253945
Time Elapsed:1625.843014, n:4.000000
For n_clusters = 5 The average silhouette_score is : 0.0732832474262
Time Elapsed:2350.614228, n:5.000000
For n_clusters = 6 The average silhouette_score is : 0.0631801504269
Time Elapsed:3232.943921, n:6.000000
For n_clusters = 7 The average silhouette_score is : 0.0490158348158
Time Elapsed:4273.532506, n:7.000000
For n_clusters = 8 The average silhouette_score is : 0.0556406338513
Time Elapsed:4951.484163, n:8.000000
For n_clusters = 9 The average silhouette_score is : 0.0493521600217
Time Elapsed:5974.784078, n:9.000000
For n_clusters = 10 The average silhouette_score is : 0.0309829965606
Time Elapsed:7271.782277, n:10.000000
For n_clusters = 11 The average silhouette_sco