In [5]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim import corpora
import nltk
from nltk.corpus import stopwords
import re

In [2]:
# Name of the model whose cached paper metadata is analysed below.
model = "jasper"
# Tab-separated metadata file, resolved relative to the notebook's directory.
meta_path = f"../cache/{model}_meta.tsv"

# One row per paper; later cells read the 'title', 'summary', 'authors' and 'year' columns.
meta = pd.read_csv(meta_path, delimiter="\t")

In [22]:
# Make sure the NLTK stop-word corpus is available locally before loading it.
nltk.download('stopwords')

# Standard English stop words, extended with boilerplate and domain terms that
# are too common in this corpus to help separate topics.
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use', 'neural', 'networks', 'network']

def preprocess(text):
    """Tokenise a document and drop stop words and very short tokens.

    Args:
        text: Raw document text. Anything that is not ``str``/``bytes``
            (for example ``None`` or the float NaN pandas uses for missing
            cells) is treated as an empty document.

    Returns:
        list[str]: Tokens produced by ``simple_preprocess`` that are longer
        than three characters and are not in ``stop_words``.
    """
    # Missing values would otherwise make simple_preprocess raise.
    if not isinstance(text, (str, bytes)):
        return []
    # Single pass: the cheap length check runs before the stop-word lookup.
    return [
        token
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]


# Build one token list per paper from its title and summary.
# A missing title or summary is NaN in pandas, and NaN + str is NaN, which
# would discard the other field too; fill the gaps with '' before joining.
meta['text'] = (
    meta['title'].fillna('') + ' ' + meta['summary'].fillna('')
).apply(preprocess)

# Vocabulary over all tokenised documents, pruned with the document-frequency
# bounds passed to filter_extremes (no_below=15, no_above=0.5).
dictionary = corpora.Dictionary(meta['text'])
dictionary.filter_extremes(no_above=0.5, no_below=15)

# Bag-of-words vector for every document, in the same order as the rows of `meta`.
corpus = [dictionary.doc2bow(tokens) for tokens in meta['text']]
# Number of latent topics to fit; also reused when reporting per-topic results.
num_topics = 9

# Fit the topic model on the bag-of-words corpus.
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    alpha='auto',
    eta='auto',
    per_word_topics=True,  # lda[bow] then also returns per-word topic assignments
    random_state=42,       # fixed seed for repeatable training
)

# Show the top terms of every topic (-1 asks for all topics).
print("Top Topics:")
for topic_idx, topic_terms in lda.print_topics(num_topics=-1):
    print(f'Topic {topic_idx}: {topic_terms}')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rongfei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top Topics:
Topic 0: 0.060*"language" + 0.031*"tasks" + 0.030*"training" + 0.026*"large" + 0.026*"performance" + 0.018*"tuning" + 0.017*"fine" + 0.016*"shot" + 0.016*"data" + 0.015*"zero"
Topic 1: 0.039*"language" + 0.030*"tasks" + 0.027*"word" + 0.024*"large" + 0.022*"code" + 0.020*"context" + 0.015*"method" + 0.015*"performance" + 0.014*"knowledge" + 0.014*"show"
Topic 2: 0.081*"text" + 0.048*"image" + 0.034*"audio" + 0.032*"speech" + 0.025*"generation" + 0.021*"translation" + 0.020*"diffusion" + 0.014*"synthesis" + 0.013*"high" + 0.013*"quality"
Topic 3: 0.040*"optimization" + 0.031*"machine" + 0.026*"performance" + 0.025*"algorithm" + 0.023*"show" + 0.022*"translation" + 0.020*"learning" + 0.018*"convolutional" + 0.017*"process" + 0.016*"problems"
Topic 4: 0.048*"learning" + 0.029*"detection" + 0.025*"time" + 0.020*"object" + 0.019*"real" + 0.018*"human" + 0.017*"reinforcement" + 0.017*"high" + 0.016*"approach" + 0.016*"using"
Topic 5: 0.046*"image" + 0.029*"diffusion" + 0.026*"ima

In [23]:
import pyLDAvis
try:
    # pyLDAvis >= 3.3 renamed the gensim adapter module to `gensim_models`.
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    # Older pyLDAvis releases still ship the adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensim_vis

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# sort_topics=False is passed so the panel is asked not to reorder topics.
vis = gensim_vis.prepare(lda, corpus, dictionary, sort_topics=False)
vis

In [48]:

# Inspect the model's raw output for the first document. The model was built
# with per_word_topics=True, so indexing returns a 3-tuple, as the cell output
# shows: (document-level (topic, probability) pairs,
#         per-word lists of topic ids,
#         per-word lists of (topic, weight) pairs).
lda[corpus[0]]

([(7, 0.9788595)],
 [(0, [7]),
  (1, [7]),
  (2, [7]),
  (3, [7]),
  (4, [7]),
  (5, [7]),
  (6, [7]),
  (7, [7]),
  (8, [7]),
  (9, [7]),
  (10, [7]),
  (11, [7]),
  (12, [7]),
  (13, [7]),
  (14, [7]),
  (15, [7])],
 [(0, [(7, 0.99987495)]),
  (1, [(7, 0.99985445)]),
  (2, [(7, 0.9998665)]),
  (3, [(7, 1.9999752)]),
  (4, [(7, 0.9999338)]),
  (5, [(7, 0.9999832)]),
  (6, [(7, 1.9999878)]),
  (7, [(7, 0.9998771)]),
  (8, [(7, 0.9999957)]),
  (9, [(7, 2.9999473)]),
  (10, [(7, 0.9998533)]),
  (11, [(7, 0.9999583)]),
  (12, [(7, 0.99990034)]),
  (13, [(7, 1.9999676)]),
  (14, [(7, 2.9999454)]),
  (15, [(7, 0.9999014)])])

In [56]:
def dominant_topic(doc_num):
    """Return the highest-probability topic for one document.

    Args:
        doc_num: Position of the document in ``corpus``.

    Returns:
        tuple: ``(topic_id, probability)`` of the most probable topic, or
        ``(0, 0)`` when the model reports no topics for the document.
    """
    # Element 0 of the model output is the document-level topic distribution.
    doc_topics = lda[corpus[doc_num]][0]
    # max() returns the first of any tied entries, like a stable descending sort.
    return max(doc_topics, key=lambda pair: pair[1], default=(0, 0))

# Run inference once per document and reuse the result for both columns.
# Calling dominant_topic() separately for each column doubles the inference
# cost, and because every call re-runs inference, the topic id and the
# probability could come from two slightly different estimates.
doc_dominant_topics = [dominant_topic(i) for i in range(len(meta))]
meta['dominant_topic'] = [topic_id for topic_id, _prob in doc_dominant_topics]
meta['topic_probability'] = [prob for _topic_id, prob in doc_dominant_topics]

# Get representative papers for each topic
def get_representative_papers(topic_id, n_samples=5):
    """Return the papers that most strongly belong to a topic.

    Args:
        topic_id: Topic whose papers are wanted.
        n_samples: Maximum number of papers to return.

    Returns:
        pandas.DataFrame: Up to ``n_samples`` rows of ``meta`` whose dominant
        topic is ``topic_id``, ordered by descending ``topic_probability`` and
        restricted to the title, authors, year and probability columns.
    """
    report_columns = ['title', 'authors', 'year', 'topic_probability']
    # Keep only the papers for which this topic is the dominant one.
    is_dominant = meta['dominant_topic'] == topic_id
    # Most representative (highest probability) papers first.
    ranked = meta[is_dominant].sort_values('topic_probability', ascending=False)
    return ranked.head(n_samples)[report_columns]

# Print each topic's keywords followed by its most representative papers.
for topic_id in range(num_topics):
    print(f"\n\n--- TOPIC {topic_id} ---")
    print("Keywords:", lda.print_topic(topic_id))
    print("\nRepresentative Papers:")

    sample_papers = get_representative_papers(topic_id)
    # A topic may not be dominant for any paper; say so and move on.
    if sample_papers.empty:
        print("No papers found with this as dominant topic.")
        continue

    for _row_label, paper in sample_papers.iterrows():
        print(f"- {paper['title']} (by {paper['authors']}, {paper['year']}) [Prob: {paper['topic_probability']:.3f}]")



--- TOPIC 0 ---
Keywords: 0.060*"language" + 0.031*"tasks" + 0.030*"training" + 0.026*"large" + 0.026*"performance" + 0.018*"tuning" + 0.017*"fine" + 0.016*"shot" + 0.016*"data" + 0.015*"zero"

Representative Papers:
- palm scaling language modeling with pathways (by aakanksha chowdhery et al, 2022) [Prob: 0.995]
- large language models encode clinical knowledge (by singhal et al, 2022) [Prob: 0.994]
- multitask prompted training enables zero shot task generalization (by victor sanh et al, 2022) [Prob: 0.993]
- qlora efficient finetuning of quantized llms (by tim dettmers et al, 2023) [Prob: 0.993]
- efficient streaming language models with attention sinks (by guangxuan xiao et al, 2023) [Prob: 0.992]


--- TOPIC 1 ---
Keywords: 0.039*"language" + 0.030*"tasks" + 0.027*"word" + 0.024*"large" + 0.022*"code" + 0.020*"context" + 0.015*"method" + 0.015*"performance" + 0.014*"knowledge" + 0.014*"show"

Representative Papers:
- do as i can not as i say grounding language in robotic afforda