In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

In [36]:
# Download the training split of the 20 Newsgroups corpus with headers,
# footers and quoted replies removed from each post.
data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# One row per post: the raw text and the integer id of its newsgroup.
# (The original referenced an undefined `train_data`; the fetch result is `data`.)
# Building from a dict keeps `source` as an integer column instead of object.
df = pd.DataFrame({'text': data.data, 'source': data.target})

In [37]:
# Plain list of post texts to feed the vectorizer.
# (The original read from an undefined `train`; the frame built above is `df`.)
df_text = df['text'].tolist()

In [38]:
# Build the tf-idf document-term matrix, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

text_tfidf = vectorizer.fit_transform(df_text)

# Vocabulary in the same order as the columns of `text_tfidf`.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
terms = vectorizer.get_feature_names_out()

In [44]:
# Number of topics. Set to 20, i.e. one per newsgroup in the dataset.
# (The newsgroups can also be grouped into 6 broader categories, which would
# be the alternative choice; the value below is what the models actually use.)
ntopics=20

# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    """Score every word on every topic.

    Parameters
    ----------
    tfidf : matrix-like with a ``.T`` attribute (documents x words)
        The document-term matrix; may be a scipy sparse matrix/array or a
        dense numpy array.
    solution : array-like (documents x topics)
        Per-document topic weights produced by a fitted model.
    wordlist : sequence
        One label per word, in the column order of ``tfidf``.

    Returns
    -------
    pandas.DataFrame
        Rows are the entries of ``wordlist``, columns are topic numbers and
        values are the matrix product ``tfidf.T @ solution``.
    """
    # `@` is always a matrix product. `*` only means that for scipy's
    # sparse *matrix* classes and is element-wise for dense or sparse arrays.
    words_by_topic = tfidf.T @ solution

    # A sparse `solution` yields a sparse product; densify it for the DataFrame.
    if hasattr(words_by_topic, 'toarray'):
        words_by_topic = words_by_topic.toarray()

    # Linking the loadings to the words in an easy-to-read way.
    return pd.DataFrame(np.asarray(words_by_topic), index=wordlist)

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    """Return the highest-loading words of every topic as "word loading" strings.

    Parameters
    ----------
    components : pandas.DataFrame
        Word-by-topic loadings: the index holds the words, each column is a topic.
    n_top_words : int
        Maximum number of words to keep per topic.

    Returns
    -------
    pandas.Series
        Object-dtype Series whose index is the topic's column position, repeated
        once per kept word, and whose values are strings of the form
        ``"<word> <loading rounded to 2 decimals>"``, ordered from highest
        to lowest loading within each topic.
    """
    topic_ids = []
    entries = []
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        ranked = components.iloc[:, column].sort_values(ascending=False)
        # Choose the N highest loadings (fewer if the vocabulary is smaller).
        chosen = ranked.iloc[:n_top_words]
        # Combine word and rounded loading into a single string.
        scores = chosen.round(2).map(str)
        entries.extend(
            "{} {}".format(word, score) for word, score in zip(chosen.index, scores)
        )
        # Repeat the topic id once per kept word so index and values stay aligned.
        topic_ids.extend([column] * len(chosen))

    # Build the Series in one go rather than filling a dtype-less empty Series
    # through .loc on a duplicated index.
    return pd.Series(entries, index=topic_ids, dtype=object)

# Number of highest-loading words to report for each topic
# (passed to top_words for every model below).
n_top_words = 10

# LSA

In [45]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# TruncatedSVD defaults to a randomized solver, so seed it like the LDA and
# NMF models below; otherwise the LSA topics change from run to run.
svd = TruncatedSVD(n_components=ntopics, random_state=0)

# LSA = truncated SVD followed by L2-normalising each document vector.
lsa = make_pipeline(svd, Normalizer(copy=False))

tfidf_lsa = lsa.fit_transform(text_tfidf)

components_lsa = word_topic(text_tfidf, tfidf_lsa, terms)

# Collects the top words of every model, one column per model.
topwords = pd.DataFrame()

topwords['LSA'] = top_words(components_lsa, n_top_words)

# LDA

In [46]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# NOTE(review): LDA is a generative model over word counts, and scikit-learn's
# own topic-extraction example feeds it raw term counts rather than tf-idf
# weights. Confirm that fitting on `text_tfidf` is intended before comparing models.
lda = LDA(
    n_components=ntopics,
    random_state=0,
    n_jobs=-1,                 # Use all available CPUs to speed up processing time.
    verbose=0,                 # Amount of output to give while iterating.
    # Priors
    doc_topic_prior=None,      # None lets scikit-learn use its default of 1 / n_components.
    topic_word_prior=1/ntopics,
    # Learning schedule (per the scikit-learn docs these two only matter for
    # the online learning method).
    learning_decay=0.7,        # Convergence rate.
    learning_offset=10.0,      # Causes earlier iterations to have less influence on the learning.
    # Stopping criteria
    max_iter=10,               # Stop after this many passes even if the model has not converged.
    evaluate_every=-1,         # Do not evaluate perplexity, as it slows training time.
    mean_change_tol=0.001,     # E-step stops updating the document topic distribution once mean change < tol.
    max_doc_update_iter=100,   # Cap on E-step updates when tol is not reached.
)

tfidf_lda = lda.fit_transform(text_tfidf)

components_lda = word_topic(text_tfidf, tfidf_lda, terms)

topwords['LDA'] = top_words(components_lda, n_top_words)

# NNMF

In [47]:
from sklearn.decomposition import NMF

# The `alpha` keyword was removed from NMF in scikit-learn 1.2 (replaced by
# alpha_W / alpha_H). The original passed alpha=0.0, i.e. no regularization,
# which is also the default, so it is left out to run on old and new versions.
nmf = NMF(n_components=ntopics,
          init='nndsvdar', # how starting values are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          random_state=0,
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # convergence tolerance of the stopping condition
          verbose=0 # amount of output to give while iterating
         )

tfidf_nmf = nmf.fit_transform(text_tfidf)

components_nmf = word_topic(text_tfidf, tfidf_nmf, terms)

topwords['NNMF'] = top_words(components_nmf, n_top_words)

In [48]:
# Show, topic by topic, the top words each model found side by side.
for topic in range(ntopics):
    heading = 'Topic {}:'.format(topic)
    rows_for_topic = topwords.loc[topic]
    print(heading, rows_for_topic, sep='\n')

Topic 0:
            LSA          LDA         NNMF
0    like 87.71    just 3.36    just 3.23
0     don 86.87    know 3.21     don 3.22
0    just 86.78    like 2.95   think 2.61
0    know 80.91     don 2.89    like 2.49
0  people 79.99    does 2.75  people 2.08
0   think 75.08  people 2.71    know 1.99
0    good 62.94   think 2.31    good 1.63
0    does 62.42     use 2.11      ve 1.54
0    time 61.86    good 2.11    time 1.51
0     use 58.63     edu 2.05     say 1.43
Topic 1:
              LSA          LDA           NNMF
1   windows 32.02    like 2.43        use 2.2
1    thanks 30.77    just 2.36       mac 1.42
1      card 20.64    does 2.31  software 1.26
1     drive 19.07    know 2.27     apple 0.97
1       dos 16.84     don 2.08      like 0.97
1  software 16.51   think 2.03      need 0.88
1       use 16.05  thanks 1.79   problem 0.87
1        pc 15.68     use 1.73     modem 0.86
1      file 15.48    time 1.64      used 0.84
1      mail 15.46    good 1.61    memory 0.82
Topic 2:
     

# Results
- NMF is by far the best.
- NMF topics are consistently related to the newsgroups.
- LDA is the worst.
- LDA topics cannot be interpreted.
- Many LDA topics are duplicates.
- LSA is only slightly better than LDA.