In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation


In [3]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [4]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [21]:
import nltk
nltk.download('punkt')
with(open('/content/Walden.txt', 'r') as in_file):
    text = in_file.read()
    sents = nltk.sent_tokenize(text)

documents = sents




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
no_features = 1000

In [22]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [23]:
no_topics = 20

In [27]:
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
pond woods walden heard came winter fish far way lived
Topic 1:
man true civilized animal lost real called wise world value
Topic 2:
men mass say want little government said true wiser state
Topic 3:
life human things near true away merely necessary lived mean
Topic 4:
know things shall tell work winter history use state government
Topic 5:
like earth birds world heard music hills days fruits dinner
Topic 6:
does make work country west evil way hands stand food
Topic 7:
time long say way little saw work written morning greater
Topic 8:
day night morning work came spring year sun evening went
Topic 9:
good doing make say far bad called went god deal
Topic 10:
house better long door winter room ground got soon began
Topic 11:
nature human cause hard question spring knows lives thy wholly
Topic 12:
let come engine feel far government farmers till lives spend
Topic 13:
did state got bread wish stone said till simply came
Topic 14:
new old england clothes years world say place town

In [24]:
# Run LDA
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tfidf_feature_names, no_top_words)

Topic 0:
man shall soon better far told body commonly god clothes
Topic 1:
winter summer days work day trees hard right hear evening
Topic 2:
new good old years way year round perchance people going
Topic 3:
like nature heard things life head standing sun merely spring
Topic 4:
does say know long young life human make remember believe
Topic 5:
men did state little let think make live sense love
Topic 6:
pond house water woods ice shore long walden surface came
Topic 7:
day man great government world true night says cold open
Topic 8:
time come said town read thought respect place best food
Topic 9:
end course words fellow labor men change hoe moment ear
