In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation


In [2]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Fitted decomposition model; only ``model.components_`` is read,
            one row of word weights per topic.
        feature_names: Sequence mapping a column index to its word.
        no_top_words: How many words to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        # to get the indices of the largest weights first.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in ranked]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_words))

In [3]:
# Load 20 Newsgroups with headers, footers and quoted replies stripped;
# shuffling is seeded so the document order is reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

In [4]:
import nltk

# Download the Punkt tokenizer data (tokenizers/punkt) into the local
# nltk_data directory; the call's return value is the cell output.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# Optional alternative corpus: use the sentences of Walden as the documents.
# NOTE: running this cell replaces the `documents` list built from the
# 20 Newsgroups dataset, so every later cell will model Walden instead.
#
# The plain `with open(...) as f:` form is used because the parenthesised
# form is a syntax error on interpreters older than Python 3.9. The encoding
# is given explicitly so the read does not depend on the platform default.
with open('/content/Walden.txt', 'r', encoding='utf-8') as in_file:
    text = in_file.read()

# Tokenise after the file is closed; only the in-memory text is needed.
sents = nltk.sent_tokenize(text)

documents = sents




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Vocabulary cap passed as max_features to both vectorizers.
no_features = 1_000

In [6]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA needs raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# Vocabulary of the count matrix, so LDA topics can be labelled with the
# feature names of the vectorizer that actually produced `tf`.
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [9]:
# Number of topics to extract, and how many top words to show per topic.
no_topics, no_top_words = 10, 5

In [10]:
# Fit NMF on the tf-idf matrix, then list the strongest words of each topic.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
don just people think like
Topic 1:
card video monitor cards drivers
Topic 2:
god jesus bible believe christ
Topic 3:
game team year games season
Topic 4:
new car 00 10 sale
Topic 5:
thanks know does mail advance
Topic 6:
windows file use files window
Topic 7:
edu soon com university cs
Topic 8:
key chip encryption clipper keys
Topic 9:
drive scsi hard drives disk


In [11]:
# Run LDA on the raw term-count matrix.
# n_components is passed explicitly so the model follows `no_topics`
# instead of silently relying on the library default.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# Label the topics with the vocabulary of the vectorizer that produced `tf`;
# the column indices in lda.components_ refer to that vocabulary.
display_topics(lda, tf_vectorizer.get_feature_names_out(), no_top_words)

Topic 0:
people gun armenian armenians war
Topic 1:
government people law mr use
Topic 2:
space program output entry data
Topic 3:
key car chip used keys
Topic 4:
edu file com available mail
Topic 5:
god people does jesus say
Topic 6:
windows use drive thanks does
Topic 7:
ax max b8f g9v a86
Topic 8:
just don like think know
Topic 9:
10 00 25 15 12


In [23]:
def display_topics(model, feature_names, no_top_words, print_topics=False):
    """Collect (and optionally print) the top words of every topic.

    Note: this definition replaces the earlier print-only ``display_topics``
    in this notebook. With the default ``print_topics=False`` nothing is
    printed; the topics are only returned.

    Args:
        model: The trained topic model; only ``model.components_`` is read,
            one row of word weights per topic.
        feature_names: Sequence mapping a column index to its word.
        no_top_words: Number of top words to keep per topic.
        print_topics: Whether to also print each topic to the console.

    Returns:
        List of ``(topic_idx, top_features)`` tuples, where ``top_features``
        is the list of words ordered from highest to lowest weight.
    """
    topic_results = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed slice yields the indices of
        # the `no_top_words` largest weights, largest first. Computed once
        # and reused for both the return value and the optional printout.
        top_features = [feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_results.append((topic_idx, top_features))
        if print_topics:
            print("Topic %d:" % (topic_idx))
            print(" ".join(top_features))

    return topic_results




In [24]:
# Build a one-line-per-topic summary of the LDA model.
# The words come from the CountVectorizer vocabulary because `lda` was fitted
# on the count matrix produced by `tf_vectorizer`.
lda_feature_names = tf_vectorizer.get_feature_names_out()
output_string = "".join(
    f"Topic {topic_idx}: {', '.join(top_features)}\n"
    for topic_idx, top_features in display_topics(lda, lda_feature_names, 10)
)

print(output_string)

Topic 0: people, gun, armenian, armenians, war, turkish, states, israel, said, children
Topic 1: government, people, law, mr, use, president, don, think, right, public
Topic 2: space, program, output, entry, data, nasa, use, science, research, build
Topic 3: key, car, chip, used, keys, bike, use, bit, clipper, number
Topic 4: edu, file, com, available, mail, ftp, files, information, image, send
Topic 5: god, people, does, jesus, say, think, believe, don, know, just
Topic 6: windows, use, drive, thanks, does, problem, know, card, like, using
Topic 7: ax, max, b8f, g9v, a86, pl, 145, 1d9, 0t, 34u
Topic 8: just, don, like, think, know, good, time, ve, people, said
Topic 9: 10, 00, 25, 15, 12, 20, 11, 14, 17, 16




