Skip to content

Commit

Permalink
ENH demo sparse KMeans on 20news set (it's slow!)
Browse files Browse the repository at this point in the history
Conflicts:

	examples/document_clustering.py
  • Loading branch information
larsmans committed Dec 21, 2011
1 parent 0ce0f32 commit 216a012
Showing 1 changed file with 45 additions and 19 deletions.
64 changes: 45 additions & 19 deletions examples/document_clustering.py
@@ -1,33 +1,54 @@
"""
===============================================
Clustering text documents using MiniBatchKmeans
===============================================
=======================================
Clustering text documents using k-means
=======================================
This is an example showing how scikit-learn can be used to cluster
documents by topic using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.
Two algorithms are demoed: ordinary k-means and its faster cousin minibatch
k-means.
"""
print __doc__

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Lars Buitinck <L.J.Buitinck@uva.nl>
# License: Simplified BSD

from time import time
import logging
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import Vectorizer
from sklearn import metrics

from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np


# Display progress logs at INFO level and above, each prefixed with a
# timestamp and the level name. No stream is passed to basicConfig, so the
# handler it installs writes to stderr (the logging default), not stdout.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')

# parse commandline arguments
op = OptionParser()
op.add_option("--no-minibatch",
action="store_false", dest="minibatch", default=True,
help="Use ordinary k-means algorithm.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
op.error("this script takes no arguments.")
sys.exit(1)


###############################################################################
# Load some categories from the training set
categories = [
Expand Down Expand Up @@ -61,23 +82,28 @@
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Sparse MiniBatchKmeans
# Do the actual clustering

# Build the estimator: ordinary k-means when opts.minibatch is false,
# otherwise the mini-batch variant.
if not opts.minibatch:
    km = KMeans(k=true_k, init='random', max_iter=100, n_init=1, verbose=1)
else:
    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=1)

mbkm = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
init_size=1000,
batch_size=1000, verbose=1)
print "Clustering sparse data with %s" % mbkm
print "Clustering sparse data with %s" % km
t0 = time()
mbkm.fit(X)
km.fit(X)
print "done in %0.3fs" % (time() - t0)
print

print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_)
print "Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_)
print "V-measure: %0.3f" % metrics.v_measure_score(labels, mbkm.labels_)
print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)
print "Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand-Index: %.3f" % \
metrics.adjusted_rand_score(labels, mbkm.labels_)
metrics.adjusted_rand_score(labels, km.labels_)
print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
X, labels, sample_size=1000)

Expand Down

0 comments on commit 216a012

Please sign in to comment.