In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [7]:
len(documents)

11314

* create variable 'no_features' and set its value to 100

In [3]:
no_features = 100

* create variable 'no_topics' and set its value to 100

In [4]:
no_topics = 100

# NMF

* instantiate TfidfVectorizer with following params:
* max_df=0.95, min_df=2, max_features=no_features, stop_words='english'

In [42]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,max_features=no_features, stop_words='english')

* use fit_transform method of TfidfVectorizer to transform documents

In [43]:
data = vectorizer.fit_transform(documents)

* get features names from TfidfVectorizer

In [44]:
feat_names = vectorizer.get_feature_names()

* instantiate NMF and fit transformed data

In [45]:
nmf_model = NMF(n_components=no_topics)
nmf_res = nmf_model.fit_transform(data)

In [46]:
nmf_res.shape

(11314, 100)

# LDA

* instantiate CountVectorizer with following params:
* max_df=0.95, min_df=2, max_features=no_features, stop_words='english'

In [47]:
vectorizer_LDA = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

* use fit_transform method of CountVectorizer to transform documents

In [48]:
data_2 = vectorizer_LDA.fit_transform(documents)

* get features names from TfidfVectorizer

In [49]:
feat_names = vectorizer_LDA.get_feature_names()

* instantiate LatentDirichletAllocation and fit transformed data 

In [50]:
LDA = LatentDirichletAllocation()
LDA_res = LDA.fit_transform(data_2)

In [51]:
LDA_res.shape

(11314, 10)

* create a function display_topics that is able to display the top words in a topic for different models

In [33]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}")
        print([(vectorizer.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])

* display top 1o words from each topic from NMF model

In [34]:
print_topics(nmf_model, vectorizer)

Topic 0
[('did', 16.40089167562298), ('just', 2.712503284670342e-07), ('ll', 6.068010233491238e-09), ('data', 3.4930943660434774e-10), ('years', 0.0), ('going', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0), ('fact', 0.0)]
Topic 1
[('thanks', 10.587911306174993), ('14', 6.195771865345562e-06), ('file', 1.2616145626362217e-07), ('data', 3.2150624575724676e-10), ('years', 0.0), ('going', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0), ('fact', 0.0)]
Topic 2
[('does', 4.436783869191484), ('know', 8.159441789395825e-05), ('just', 2.3191431545248497e-07), ('ll', 4.134329829272207e-09), ('data', 2.829545097634216e-10), ('years', 0.0), ('god', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0)]
Topic 3
[('edu', 5.083671527080277), ('14', 1.5876118805079931e-06), ('file', 1.061012725885636e-06), ('just', 8.851142891757644e-08), ('ll', 5.013211141444986e-09), ('data', 2.0520602253504658e-10), ('line', 1.9070542953814787e-12), ('going', 0.0), ('don', 0.0), ('drive', 0.0)]
Topic 4
[('just', 3.

* display top 1o words from each topic from LDA model

In [35]:
print_topics(LDA, vectorizer)

Topic 0
[('people', 2715.6902781873864), ('said', 1446.9865325017447), ('right', 1165.5982296289963), ('did', 1036.3312416240676), ('just', 953.6844485586614), ('didn', 904.8211101814129), ('government', 887.6950097940019), ('got', 836.759033219838), ('time', 831.7080806481732), ('ve', 827.9452082297602)]
Topic 1
[('does', 1268.7759468420654), ('available', 892.7080247226344), ('bit', 857.7849649354754), ('software', 824.3420067712168), ('using', 801.1079738506231), ('use', 732.1827232252928), ('set', 570.3986535978464), ('know', 556.0882207033287), ('like', 427.119269443316), ('work', 425.3008552561025)]
Topic 2
[('key', 1219.625810129465), ('00', 1188.9213542874643), ('new', 804.6809924496454), ('used', 124.54554947006018), ('number', 120.63554381136491), ('bit', 107.85229007536347), ('20', 79.84134851796348), ('15', 67.7010566479492), ('10', 61.46155686259713), ('good', 58.176740838233485)]
Topic 3
[('don', 2616.190483060757), ('think', 2219.8495675135932), ('just', 2086.75422218534