In [1]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

t0 = time()
print("Loading dataset and extracting TF-IDF features...")
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')
tfidf = vectorizer.fit_transform(dataset.data[:n_samples])
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()



Loading dataset and extracting TF-IDF features...
done in 12.937s.
Fitting the NMF model with n_samples=2000 and n_features=1000...
done in 14.685s.
Topic #0:
people think did time law government israel rights say case said make state true evidence don mr point gun let

Topic #1:
thanks know does mail advance hi info interested anybody email looking help appreciated card information list send post need video

Topic #2:
game team year games win play season players nhl toronto runs division flyers think goal hockey player won defense teams

Topic #3:
windows file dos using program use files window problem help os application running drivers version ms screen ftp available code

Topic #4:
edu soon com send university internet ftp mail mit information article pub cc mac hope email address contact blood program

Topic #5:
key chip clipper keys encryption government use public secure phone enforcement data nsa law doesn communications going security used encrypted

Topic #6:
car new 00 10 bi



In [2]:
tfidf

<2000x1000 sparse matrix of type '<type 'numpy.float64'>'
	with 51752 stored elements in Compressed Sparse Row format>

In [10]:
nmf.components_.shape

(10, 1000)