In [42]:
# From Sklearn website
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
# Sizing knobs shared by the cells below.
n_samples = 2000    # number of documents kept from the corpus
n_features = 1000   # passed as max_features to the vectorizers
n_topics = 10       # number of topics requested from NMF / LDA
n_top_words = 20    # number of words printed per topic

# Shuffle with a fixed seed (random_state) so the same documents are selected
# on every run, and strip headers, footers and quoted replies from each post.
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'),
                             random_state=1,
                             shuffle=True)
data_samples = dataset.data[0:n_samples]

In [20]:
# Display the first sampled document (the cell's last expression is echoed as output).
data_samples[0]

u"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [47]:
# Build the tf-idf matrix that is fed to NMF.
#   max_df=0.95 -> when building the vocabulary, ignore terms whose document
#                  frequency is strictly above the threshold (corpus-specific
#                  stop words). A float is a proportion of documents, an
#                  integer is an absolute document count.
#   min_df=2    -> the matching lower cut-off, given here as an absolute count.
#   max_features caps the vocabulary size at n_features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   max_features=n_features,
                                   min_df=2, max_df=0.95)
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [44]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    model: object whose ``components_`` attribute is iterated row by row;
        each row holds one weight per feature for a single topic.
    feature_names: sequence mapping a feature (column) index to its word.
    n_top_words: number of words to print for each topic.

    For every topic a header line ``Topic #<index>:`` is printed, followed by
    one line with the selected words separated by single spaces, strongest
    word first. Nothing is returned.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort yields indices in ascending weight order, so reverse it and
        # keep the leading n_top_words entries to get the strongest words.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[idx] for idx in strongest_first]
        print(" ".join(top_words))


In [48]:
print(("Fitting the NMF model with tf-idf features, "
       "n_samples=%d and n_features=%d...") % (n_samples, n_features))

# NMF hyper-parameters used below:
#   alpha    -- constant that multiplies the regularization terms
#               (the library default is zero, i.e. no regularization).
#   l1_ratio -- regularization mixing parameter, with 0 <= l1_ratio <= 1:
#                 l1_ratio = 0     -> elementwise L2 penalty (Frobenius norm)
#                 l1_ratio = 1     -> elementwise L1 penalty
#                 0 < l1_ratio < 1 -> combination of L1 and L2
#   init     -- not passed, so the library default is used. The original note
#               here gives it as 'nndsvdar' (NNDSVD with zeros filled with
#               small random values) when n_components < n_features and
#               random otherwise -- verify against the installed version.
# NOTE(review): the `alpha` keyword and `get_feature_names()` appear to exist
# only in older scikit-learn releases -- confirm the pinned version before
# re-running this cell.
nmf = NMF(n_components=n_topics, l1_ratio=.5, alpha=.1, random_state=1)
nmf.fit(tfidf)

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...

Topics in NMF model:
Topic #0:
just people don like think know good time make use way really right say ve want government did ll going
Topic #1:
windows file dos files program using use os problem help running drivers pc ftp ms version available screen software work
Topic #2:
god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3:
edu soon com send university internet mit ftp mail cc article pub information hope mac email blood home contact program
Topic #4:
thanks know does mail advance hi info interested email anybody card looking help like appreciated information video send list need
Topic #5:
drive drives hard disk floppy software mac scsi computer controller power apple mb rom pc problem card internal problems cable
Topic #6:
window manager application motif problem display graphics use standard time possible try us

In [27]:
# LDA is fitted on raw term counts (tf) rather than tf-idf weights, so build a
# separate count matrix with the same vocabulary filters as the tf-idf one.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(stop_words='english',
                                max_features=n_features,
                                min_df=2, max_df=0.95)
tf = tf_vectorizer.fit_transform(data_samples)

Extracting tf features for LDA...


In [29]:
print(("Fitting LDA models with tf features, "
       "n_samples=%d and n_features=%d...") % (n_samples, n_features))

# Priors left at their defaults in the call below:
#   doc_topic_prior  -- prior of the document-topic distribution theta (alpha);
#                       when None it defaults to 1 / n_topics.
#   topic_word_prior -- prior of the topic-word distribution beta (eta);
#                       when None it defaults to 1 / n_topics.
# Both control how sparse the corresponding distribution is: the smaller the
# value, the sparser the distribution.
# NOTE(review): the `n_topics` keyword and `get_feature_names()` appear to
# exist only in older scikit-learn releases -- confirm the pinned version
# before re-running this cell.
lda = LatentDirichletAllocation(n_topics=n_topics,
                                learning_method='online',
                                learning_offset=50.,
                                max_iter=5,
                                random_state=0).fit(tf)

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...

Topics in LDA model:
Topic #0:
edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1:
don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2:
christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3:
drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4:
hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5:
god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6:
55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7:
car year just cars new engine like bike good oil insurance better 