In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


* create variable 'no_features' and set its value to 100

In [3]:
no_features = 100

* create variable 'no_topics' and set its value to 100

In [4]:
no_topics = 100

# NMF

* instantiate TfidfVectorizer with following params:
* max_df=0.95, min_df=2, max_features=no_features, stop_words='english'

In [5]:
vectorizer = TfidfVectorizer(max_df = 0.95, min_df=2, max_features=no_features,stop_words='english')

* use fit_transform method of TfidfVectorizer to transform documents

In [7]:
docs_vectorized = vectorizer.fit_transform(documents)
docs_vectorized

<11314x100 sparse matrix of type '<class 'numpy.float64'>'
	with 83200 stored elements in Compressed Sparse Row format>

* get features names from TfidfVectorizer

In [9]:
feat_names = vectorizer.get_feature_names()
print(vectorizer.get_feature_names())

['00', '10', '12', '14', '15', '16', '20', '25', 'a86', 'available', 'ax', 'b8f', 'believe', 'best', 'better', 'bit', 'case', 'com', 'come', 'course', 'data', 'day', 'did', 'didn', 'different', 'does', 'doesn', 'don', 'drive', 'edu', 'fact', 'far', 'file', 'g9v', 'god', 'going', 'good', 'got', 'government', 'help', 'information', 'jesus', 'just', 'key', 'know', 'law', 'let', 'like', 'line', 'list', 'little', 'll', 'long', 'look', 'lot', 'mail', 'make', 'max', 'mr', 'need', 'new', 'number', 'people', 'point', 'power', 'probably', 'problem', 'program', 'question', 'read', 'really', 'right', 'run', 'said', 'say', 'second', 'set', 'software', 'space', 'state', 'sure', 'tell', 'thanks', 'thing', 'things', 'think', 'time', 'true', 'try', 'use', 'used', 'using', 've', 'want', 'way', 'windows', 'work', 'world', 'year', 'years']


* instantiate NMF and fit transformed data

In [22]:
nmf = NMF(n_components=no_topics)
nmf_tf = nmf.fit_transform(docs_vectorized)

# LDA

* instantiate CountVectorizer with following params:
* max_df=0.95, min_df=2, max_features=no_features, stop_words='english'

In [11]:
cvec = CountVectorizer(max_df=0.95,min_df=2,max_features=no_features,stop_words='english')

* use fit_transform method of CountVectorizer to transform documents

In [12]:
docs_vectorized2 = cvec.fit_transform(documents)

* get features names from TfidfVectorizer

In [13]:
feat_names2 = cvec.get_feature_names()
print(feat_names2)

['00', '10', '12', '14', '15', '16', '20', '25', 'a86', 'available', 'ax', 'b8f', 'believe', 'best', 'better', 'bit', 'case', 'com', 'come', 'course', 'data', 'day', 'did', 'didn', 'different', 'does', 'doesn', 'don', 'drive', 'edu', 'fact', 'far', 'file', 'g9v', 'god', 'going', 'good', 'got', 'government', 'help', 'information', 'jesus', 'just', 'key', 'know', 'law', 'let', 'like', 'line', 'list', 'little', 'll', 'long', 'look', 'lot', 'mail', 'make', 'max', 'mr', 'need', 'new', 'number', 'people', 'point', 'power', 'probably', 'problem', 'program', 'question', 'read', 'really', 'right', 'run', 'said', 'say', 'second', 'set', 'software', 'space', 'state', 'sure', 'tell', 'thanks', 'thing', 'things', 'think', 'time', 'true', 'try', 'use', 'used', 'using', 've', 'want', 'way', 'windows', 'work', 'world', 'year', 'years']


* instantiate LatentDirichletAllocation and fit transformed data 

In [14]:
lda_model = LatentDirichletAllocation(n_components=no_topics,max_iter=10,learning_method='online')
lda_nmz = lda_model.fit_transform(docs_vectorized2)

* create a function display_topics that is able to display the top words in a topic for different models

In [19]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        #print([(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[0:top_n]])

* display top 1o words from each topic from NMF model

In [24]:
print("NMF Model:")
print_topics(nmf, vectorizer, top_n=1)
print("=" * 20)

NMF Model:
Topic 0:
[('00', 0.0)]
Topic 1:
[('00', 0.0)]
Topic 2:
[('00', 0.0)]
Topic 3:
[('00', 0.0)]
Topic 4:
[('00', 0.0)]
Topic 5:
[('00', 0.0)]
Topic 6:
[('00', 0.0)]
Topic 7:
[('00', 0.0)]
Topic 8:
[('00', 0.0)]
Topic 9:
[('00', 0.0)]
Topic 10:
[('00', 0.0)]
Topic 11:
[('00', 0.0)]
Topic 12:
[('00', 0.0)]
Topic 13:
[('00', 0.0)]
Topic 14:
[('00', 0.0)]
Topic 15:
[('00', 0.0)]
Topic 16:
[('00', 0.0)]
Topic 17:
[('00', 0.0)]
Topic 18:
[('00', 0.0)]
Topic 19:
[('00', 0.0)]
Topic 20:
[('00', 0.0)]
Topic 21:
[('00', 0.0)]
Topic 22:
[('00', 0.0)]
Topic 23:
[('00', 0.0)]
Topic 24:
[('00', 0.0)]
Topic 25:
[('00', 0.0)]
Topic 26:
[('00', 0.0)]
Topic 27:
[('00', 0.0)]
Topic 28:
[('00', 0.0)]
Topic 29:
[('00', 0.0)]
Topic 30:
[('00', 0.0)]
Topic 31:
[('00', 0.0)]
Topic 32:
[('00', 0.0)]
Topic 33:
[('00', 0.0)]
Topic 34:
[('00', 0.0)]
Topic 35:
[('00', 0.0)]
Topic 36:
[('00', 0.0)]
Topic 37:
[('00', 0.0)]
Topic 38:
[('00', 0.0)]
Topic 39:
[('00', 0.0)]
Topic 40:
[('00', 0.0)]
Topic 41:
[('00

* display top 1o words from each topic from LDA model

In [21]:
print("LDA Model:")
print_topics(lda_model, cvec,top_n=1)
print("=" * 20)

LDA Model:
Topic 0:
[('space', 0.010000003026986485)]
Topic 1:
[('drive', 0.010000003272037716)]
Topic 2:
[('a86', 0.010000003402702063)]
Topic 3:
[('file', 0.010000003405676737)]
Topic 4:
[('best', 0.010000003193248187)]
Topic 5:
[('ax', 0.010000002881344876)]
Topic 6:
[('com', 0.010000003349943436)]
Topic 7:
[('com', 0.010000003298285086)]
Topic 8:
[('com', 0.010000003581115299)]
Topic 9:
[('ax', 0.010000003353988809)]
Topic 10:
[('15', 0.010000003062968893)]
Topic 11:
[('file', 0.010000003223393124)]
Topic 12:
[('windows', 0.010000003391136372)]
Topic 13:
[('file', 0.010000003377354091)]
Topic 14:
[('lot', 0.010000003364437769)]
Topic 15:
[('software', 0.010000003334393598)]
Topic 16:
[('state', 0.010000003440732229)]
Topic 17:
[('ax', 0.010000003461838674)]
Topic 18:
[('key', 0.010000003165455585)]
Topic 19:
[('better', 0.01000000328054747)]
Topic 20:
[('thanks', 0.010000003169070713)]
Topic 21:
[('g9v', 0.01000000312018099)]
Topic 22:
[('a86', 0.010000002831598442)]
Topic 23:
[('0