In [2]:
import gensim.corpora as corpora
from gensim.models import LdaModel, Nmf
from gensim.utils import simple_preprocess
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import nltk

In [3]:
from sklearn.decomposition import NMF


# Make sure the NLTK stop-word corpus is present locally, then keep the
# English list as a set so membership checks during preprocessing are fast.
nltk.download('stopwords')
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

# Training split of 20 Newsgroups; headers, footers and quoted replies are
# stripped via the `remove` argument so only the message bodies remain.
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups.data


def preprocess_text(text):
    """Tokenize ``text`` and drop stop words.

    The text is tokenized with gensim's ``simple_preprocess``; every token
    that appears in the module-level ``stop_words`` set is discarded.

    Args:
        text: The raw document to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in simple_preprocess(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens


# Tokenize every newsgroup post, build the token <-> id mapping from the
# tokenized posts, then convert each post to its bag-of-words representation.
processed_documents = list(map(preprocess_text, documents))
dictionary = corpora.Dictionary(processed_documents)
corpus = list(map(dictionary.doc2bow, processed_documents))

def train_and_display_topics(model_type, num_topics, corpus, dictionary,
                             num_words=20, random_state=None):
    """Train a gensim topic model and print its topics.

    Args:
        model_type: ``"LDA"`` to train an ``LdaModel`` (with ``passes=10``)
            or ``"NMF"`` to train an ``Nmf`` model.
        num_topics: Number of topics to fit and to print.
        corpus: Bag-of-words corpus handed to the model as ``corpus=``.
        dictionary: Mapping handed to the model as ``id2word=``.
        num_words: How many top words to print per topic. Defaults to 20.
        random_state: Optional seed forwarded to the model constructor so a
            run can be reproduced. The default ``None`` leaves seeding to
            gensim, i.e. the same behaviour as not passing it at all.

    Raises:
        ValueError: If ``model_type`` is neither ``"LDA"`` nor ``"NMF"``.
    """
    if model_type == "LDA":
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=10,
            random_state=random_state,
        )
    elif model_type == "NMF":
        model = Nmf(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
    else:
        # Fail before printing anything; without this branch `model` would be
        # unbound below and the caller would see an UnboundLocalError instead.
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'LDA' or 'NMF'."
        )

    print(f"\n{model_type} Model with {num_topics} topics:")
    for idx, topic in model.print_topics(num_topics=num_topics, num_words=num_words):
        # Topic ids are 0-based; shift by one for display.
        print(f"Topic {idx+1}: {topic}")

[nltk_data] Downloading package stopwords to /Users/ajeyk/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# For each topic count, fit LDA first and then NMF on the newsgroups corpus.
for i in (10, 20, 30):
    for model_type in ("LDA", "NMF"):
        train_and_display_topics(model_type, i, corpus, dictionary)


LDA Model with 10 topics:
Topic 1: 0.008*"people" + 0.008*"government" + 0.006*"said" + 0.005*"mr" + 0.005*"gun" + 0.005*"us" + 0.005*"president" + 0.004*"israel" + 0.004*"state" + 0.004*"one" + 0.004*"would" + 0.003*"war" + 0.003*"armenian" + 0.003*"rights" + 0.003*"states" + 0.003*"going" + 0.003*"stephanopoulos" + 0.003*"law" + 0.003*"military" + 0.003*"first"
Topic 2: 0.013*"pts" + 0.012*"period" + 0.009*"la" + 0.008*"pp" + 0.008*"pt" + 0.006*"power" + 0.005*"play" + 0.005*"pens" + 0.004*"har" + 0.004*"hawks" + 0.004*"calgary" + 0.004*"st" + 0.004*"shots" + 0.004*"oilers" + 0.003*"scorer" + 0.003*"lemieux" + 0.003*"ny" + 0.003*"hartford" + 0.003*"saves" + 0.003*"finland"
Topic 3: 0.021*"key" + 0.010*"van" + 0.009*"det" + 0.008*"chip" + 0.008*"bos" + 0.008*"tor" + 0.008*"chi" + 0.007*"pit" + 0.007*"cal" + 0.007*"min" + 0.007*"la" + 0.007*"cubs" + 0.007*"stl" + 0.006*"encrypted" + 0.006*"que" + 0.006*"win" + 0.006*"buf" + 0.006*"clipper" + 0.006*"keys" + 0.006*"serial"
Topic 4: 0.01

In [5]:
# 50-topic run on the newsgroups corpus: LDA first, then NMF.
for model_type in ("LDA", "NMF"):
    train_and_display_topics(model_type, 50, corpus, dictionary)


LDA Model with 50 topics:
Topic 1: 0.032*"master" + 0.029*"pgp" + 0.016*"jumper" + 0.012*"planes" + 0.010*"sweden" + 0.010*"pins" + 0.009*"jumpers" + 0.009*"canada" + 0.008*"switzerland" + 0.008*"alomar" + 0.008*"finland" + 0.007*"emergency" + 0.007*"sw" + 0.007*"contacts" + 0.007*"settings" + 0.007*"usa" + 0.006*"explosive" + 0.006*"parking" + 0.006*"april" + 0.006*"inherent"
Topic 2: 0.063*"image" + 0.040*"db" + 0.028*"jpeg" + 0.027*"gif" + 0.025*"images" + 0.018*"format" + 0.014*"color" + 0.012*"graphics" + 0.012*"bit" + 0.010*"ray" + 0.010*"convert" + 0.009*"ecs" + 0.009*"formats" + 0.009*"si" + 0.008*"cs" + 0.008*"processing" + 0.007*"slot" + 0.007*"al" + 0.007*"files" + 0.007*"wires"
Topic 3: 0.020*"fuse" + 0.018*"hst" + 0.014*"bathroom" + 0.010*"ceiling" + 0.008*"fixtures" + 0.008*"explorer" + 0.007*"disasters" + 0.006*"dining" + 0.006*"reacting" + 0.006*"experiencing" + 0.004*"advisors" + 0.004*"apparantly" + 0.003*"serviced" + 0.003*"gyros" + 0.003*"arrays" + 0.003*"perniciou

In [7]:
%pip install datasets



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py39-none-any.whl (133 kB)
Downloading xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets
  Attempting uninstall: dill
    Found existing installation: dill 0.3.9
    Uninstalling dill-0.3.9:
      Successfully uninstalled dill-0.3.9
  Attempting uninstall: multiprocess
    Found existing installation: multiprocess 0.70.17
    Uninstalling multiprocess-0.70.17:
      Succ

In [8]:
from datasets import load_dataset


# Load the "raw" configuration of midas/duc2001 and keep only the
# 'document' column of its 'test' split.
duc_dataset = load_dataset("midas/duc2001", "raw")
duc_test_split = duc_dataset['test']
duc_documents = duc_test_split['document']


README.md:   0%|          | 0.00/4.32k [00:00<?, ?B/s]

duc2001.py:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/714k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

In [25]:
# Collapse each DUC document into a single space-separated string.
# presumably each raw document is a list of string pieces — confirm against
# the dataset card.
# The isinstance guard makes this cell safe to re-run: without it, a second
# execution would call " ".join on an already-joined str and insert a space
# between every character.
duc_documents = [
    doc if isinstance(doc, str) else " ".join(doc)
    for doc in duc_documents
]

In [27]:
# Sanity check: after joining, the first DUC document should be a plain str.
first_duc_document = duc_documents[0]
print(type(first_duc_document))

<class 'str'>


In [28]:
# Build the bag-of-words inputs for the DUC-2001 texts with the same
# preprocessing function used for the newsgroups posts.
processed_duc_documents = list(map(preprocess_text, duc_documents))
duc_dictionary = corpora.Dictionary(processed_duc_documents)
duc_corpus = list(map(duc_dictionary.doc2bow, processed_duc_documents))

# For each topic count, fit LDA first and then NMF on the DUC corpus.
for i in [10, 20, 30, 50]:
    for model_type in ("LDA", "NMF"):
        train_and_display_topics(
            model_type,
            num_topics=i,
            corpus=duc_corpus,
            dictionary=duc_dictionary,
        )



LDA Model with 10 topics:
Topic 1: 0.019*"said" + 0.012*"fire" + 0.006*"disease" + 0.006*"year" + 0.006*"forest" + 0.005*"national" + 0.005*"fires" + 0.004*"tuberculosis" + 0.004*"hurricane" + 0.004*"people" + 0.004*"new" + 0.004*"last" + 0.004*"one" + 0.003*"slovenia" + 0.003*"park" + 0.003*"officials" + 0.003*"would" + 0.003*"acres" + 0.003*"years" + 0.003*"service"
Topic 2: 0.020*"eclipse" + 0.018*"said" + 0.009*"air" + 0.009*"sun" + 0.008*"crash" + 0.006*"aircraft" + 0.005*"force" + 0.005*"base" + 0.005*"crashed" + 0.005*"miles" + 0.004*"moon" + 0.004*"one" + 0.004*"pilot" + 0.004*"solar" + 0.004*"military" + 0.004*"people" + 0.003*"jet" + 0.003*"two" + 0.003*"west" + 0.003*"hawaii"
Topic 3: 0.009*"said" + 0.005*"lrb" + 0.005*"rrb" + 0.005*"nra" + 0.005*"would" + 0.004*"year" + 0.004*"drought" + 0.004*"one" + 0.004*"two" + 0.003*"tunnel" + 0.003*"us" + 0.003*"last" + 0.003*"gun" + 0.003*"time" + 0.002*"new" + 0.002*"may" + 0.002*"house" + 0.002*"french" + 0.002*"also" + 0.002*"jun