In [39]:
import os
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import nltk

In [24]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic of a fitted model as its index plus its top terms.

    Args:
        model: Object exposing ``components_``; it is iterated one row per
            topic and each row must provide ``argsort()``.
        feature_names: Indexable collection mapping a column index to a term.
        no_top_words: Number of highest-weighted terms to list per topic.

    Returns:
        None. Two lines are written to stdout per topic: the topic index,
        then the selected terms joined by single spaces.
    """
    for topic_no, weights in enumerate(model.components_):
        print(topic_no)
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest_columns = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[column] for column in strongest_columns]
        print(" ".join(top_terms))

In [44]:
# Raw article texts, one string per successfully decoded file.
documents = []
# Paths of files that could not be decoded as UTF-8. They are recorded here
# rather than dropped without a trace, so the loss can be inspected.
skipped_files = []

for topic in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    # sorted() makes the corpus order independent of the filesystem's listing
    # order, so downstream fits see the documents in the same order every run.
    for file in sorted(glob(os.path.join('data/bbc', topic, "*"))):
        try:
            # The context manager closes the handle even if decoding fails.
            with open(file, encoding='utf8') as handle:
                documents.append(handle.read())
        except UnicodeDecodeError:
            # Some text files are not valid UTF-8; skip them but keep the path.
            skipped_files.append(file)
len(documents)

2224

In [48]:
# The 20 Newsgroups corpus is not modelled in this cell (the assignment below
# is disabled); the object is kept because a later cell reads dataset.data.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# documents = dataset.data

no_features = 100


def _vocabulary(vectorizer):
    """Return the fitted vectorizer's terms as a list ordered by column index.

    scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    get_feature_names(), so use whichever the installed release provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _vocabulary(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.85, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _vocabulary(tf_vectorizer)

no_topics = 5

# Run NMF
nmf_alpha = .1
nmf_params = dict(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd')
if 'alpha_W' in NMF().get_params():
    # Newer scikit-learn replaced `alpha` with alpha_W / alpha_H (and removed
    # `alpha` in 1.2). NOTE(review): per the NMF documentation the new
    # penalties are multiplied by n_features (W) and n_samples (H); dividing
    # by those sizes is meant to keep the original unscaled strength --
    # confirm against the installed release.
    n_docs, n_terms = tfidf.shape
    nmf_params.update(alpha_W=nmf_alpha / n_terms, alpha_H=nmf_alpha / n_docs)
else:
    # Older releases only know the single `alpha` parameter.
    nmf_params.update(alpha=nmf_alpha)
nmf = NMF(**nmf_params).fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10

In [49]:
# Top terms of each LDA topic, using the vocabulary of the raw-count vectorizer.
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

0
said year new mobile company people net market firm users
1
labour election said economy year government england tax growth country
2
people technology tv says use make said like bbc used
3
game music best film games said year time players win
4
said mr new government blair told uk people brown party


In [50]:
# Top terms of each NMF topic, using the vocabulary of the tf-idf vectorizer.
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=no_top_words)

0
game england win play players games time said half good
1
mr labour said blair election party government minister brown told
2
people said users technology music use mobile net new service
3
film best director won tv year british number million uk
4
said year company market 2004 growth firm economy sales new


In [12]:
# Peek at the second raw post of the fetched 20 Newsgroups data. The topic
# models above are fitted on `documents`, not on this corpus.
dataset.data[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"