https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's per-term weights.
        feature_names: Indexable collection mapping a column index to the
            term it represents.
        no_top_words: Number of top-weighted terms to print per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic %d:" % topic_no)
        # argsort() is ascending: reverse it, then keep the leading entries
        # so the heaviest terms come first.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print(" ".join(top_terms))

# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, so the models see only the message bodies.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Vocabulary size cap shared by both vectorizers.
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (returns an array, which is indexed the same way).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
# The single `alpha` argument was removed in scikit-learn 1.2 in favour of
# `alpha_W` / `alpha_H`. The newer penalty terms are multiplied by n_features
# (for W) and n_samples (for H), so the original strength of 0.1 is divided by
# those counts to keep the same effective regularization.
nmf_alpha = .1
n_docs, n_terms = tfidf.shape
nmf = NMF(n_components=no_topics, random_state=1, alpha_W=nmf_alpha / n_terms, alpha_H=nmf_alpha / n_docs, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_topics` was renamed to `n_components` (old name removed in scikit-learn 0.21).
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

# Show the top terms per topic, first for NMF and then for LDA.
no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
people time right did good said say make way government point really years going course long believe state fact world
Topic 1:
window problem using server application screen display motif manager running widget program problems set error mouse work code fine run
Topic 2:
god jesus bible christ faith believe christian christians sin church lord hell truth life man love belief say christianity father
Topic 3:
game team year games season players play hockey win league player teams nhl good runs best better hit division points
Topic 4:
new 00 sale 10 price offer shipping condition 20 15 50 interested 12 asking 30 space 11 25 used sell
Topic 5:
thanks mail advance hi looking info help information address appreciated email post know anybody send interested appreciate need reply tell
Topic 6:
windows file files dos program version ftp ms directory running pc run using os software drivers disk graphics win programs
Topic 7:
edu soon cs university ftp internet article email pub david s

Comparar los grupos (newsgroups) con los tópicos obtenidos.
