ENH demo sparse KMeans on 20news set (it's slow!)

Conflicts: examples/document_clustering.py
forkloop · Dec 21, 2011 · 216a012 · 216a012
1 parent 0ce0f32
commit 216a012
Showing 1 changed file with 45 additions and 19 deletions.
diff --git a/examples/document_clustering.py b/examples/document_clustering.py
@@ -1,33 +1,54 @@
 """
-===============================================
-Clustering text documents using MiniBatchKmeans
-===============================================
+=======================================
+Clustering text documents using k-means
+=======================================
 
 This is an example showing how scikit-learn can be used to cluster
 documents by topics using a bag-of-words approach. This example uses
 a scipy.sparse matrix to store the features instead of standard numpy arrays.
 
+Two algorithms are demoed: ordinary k-means and its faster cousin, minibatch
+k-means.
+
 """
-print __doc__
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Lars Buitinck <L.J.Buitinck@uva.nl>
 # License: Simplified BSD
 
-from time import time
-import logging
-import numpy as np
-
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import Vectorizer
 from sklearn import metrics
 
-from sklearn.cluster import MiniBatchKMeans
+from sklearn.cluster import KMeans, MiniBatchKMeans
+
+import logging
+from optparse import OptionParser
+import sys
+from time import time
+
+import numpy as np
 
 
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s %(levelname)s %(message)s')
 
+# parse commandline arguments
+op = OptionParser()
+op.add_option("--no-minibatch",
+              action="store_false", dest="minibatch", default=True,
+              help="Use ordinary k-means algorithm.")
+
+print __doc__
+op.print_help()
+
+(opts, args) = op.parse_args()
+if len(args) > 0:
+    op.error("this script takes no arguments.")
+    sys.exit(1)
+
+
 ###############################################################################
 # Load some categories from the training set
 categories = [
@@ -61,23 +82,28 @@
 print "n_samples: %d, n_features: %d" % X.shape
 print
 
+
 ###############################################################################
-# Sparse MiniBatchKmeans
+# Do the actual clustering
+
+if opts.minibatch:
+    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
+                         init_size=1000,
+                         batch_size=1000, verbose=1)
+else:
+    km = KMeans(k=true_k, init='random', max_iter=100, n_init=1, verbose=1)
 
-mbkm = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
-                       init_size=1000,
-                       batch_size=1000, verbose=1)
-print "Clustering sparse data with %s" % mbkm
+print "Clustering sparse data with %s" % km
 t0 = time()
-mbkm.fit(X)
+km.fit(X)
 print "done in %0.3fs" % (time() - t0)
 print
 
-print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_)
-print "Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_)
-print "V-measure: %0.3f" % metrics.v_measure_score(labels, mbkm.labels_)
+print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)
+print "Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)
+print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
 print "Adjusted Rand-Index: %.3f" % \
-    metrics.adjusted_rand_score(labels, mbkm.labels_)
+    metrics.adjusted_rand_score(labels, km.labels_)
 print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
     X, labels, sample_size=1000)