In [1]:
import json
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from typing import Iterator

class CustomArxivLoader(ArxivLoader):
    """ArxivLoader variant whose documents carry the paper summary as content.

    Each document produced by the parent loader is re-emitted with its
    ``"Summary"`` metadata entry moved into ``page_content`` and a ``"Source"``
    metadata entry pointing at the PDF URL built from ``self.query``.
    """

    def __init__(self, **kwargs):
        # Keyword-only pass-through to the parent constructor.
        super().__init__(**kwargs)

    def lazy_load(self) -> Iterator[Document]:
        """Return an iterator of summary-only documents for this query."""
        return self._summaries_only(super().lazy_load())

    def _summaries_only(self, loaded: Iterator[Document]) -> Iterator[Document]:
        """Re-emit each loaded document with its summary as the page content.

        Raises:
            KeyError: if a loaded document has no ``"Summary"`` metadata entry.
        """
        # NOTE(review): whatever page_content the parent produced is discarded
        # here; if the parent fetches/parses the full paper, that work is
        # wasted. Confirm whether a summary-only API is available.
        for paper in loaded:
            # pop() both reads the summary and removes it from the metadata
            # of the parent's document.
            abstract = paper.metadata.pop("Summary")
            enriched_metadata = {
                **paper.metadata,
                # Assumes self.query is an arXiv identifier -- verify against callers.
                "Source": f"https://arxiv.org/pdf/{self.query}.pdf",
            }
            yield Document(page_content=abstract, metadata=enriched_metadata)

# Name used to select the crawler results file to read.
retriever = "large-language-models"

# Explicit encoding so the JSON file is decoded the same way on every platform.
with open(f"../assets/results/crawler_{retriever}.json", encoding="utf-8") as file:
    results = json.load(file)

arxiv_urls = results["collected_urls"]["arxiv.org"]

# Take the last path segment of each URL and drop a literal trailing ".pdf".
# removesuffix() is used instead of strip(".pdf"): strip() treats its argument
# as a set of characters and removes any of '.', 'p', 'd', 'f' from BOTH ends.
# A list (rather than a lazy map) keeps the ids reusable after the loaders are
# built below.
arxiv_ids = [url.split("/")[-1].removesuffix(".pdf") for url in arxiv_urls]

# One loader per arXiv id, combined into a single loader.
all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]

merged_loader = MergedDataLoader(loaders=all_loaders)

# Load every document from all loaders into a list.
documents = merged_loader.load()

MuPDF error: syntax error: could not parse color space (1138 0 R)



In [5]:
# Number of documents returned by the merged loader.
len(documents)

294

In [6]:
# Inspect the first loaded document (metadata and page content).
documents[0]

Document(metadata={'Published': '2022-07-04', 'Title': 'MetaFormer Is Actually What You Need for Vision', 'Authors': 'Weihao Yu, Mi Luo, Pan Zhou, Chenyang Si, Yichen Zhou, Xinchao Wang, Jiashi Feng, Shuicheng Yan', 'Source': 'https://arxiv.org/pdf/2111.11418.pdf'}, page_content='Transformers have shown great potential in computer vision tasks. A common\nbelief is their attention-based token mixer module contributes most to their\ncompetence. However, recent works show the attention-based module in\nTransformers can be replaced by spatial MLPs and the resulted models still\nperform quite well. Based on this observation, we hypothesize that the general\narchitecture of the Transformers, instead of the specific token mixer module,\nis more essential to the model\'s performance. To verify this, we deliberately\nreplace the attention module in Transformers with an embarrassingly simple\nspatial pooling operator to conduct only basic token mixing. Surprisingly, we\nobserve that the derived 

In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Downloading Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Collecting numba>=0.51.2 (from umap-learn>=0.5.0->bertopic)
  Downloading numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba>=0.51.2->umap-learn>=0.5.0->bertopic)
  Downloading llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [4]:
# Collect the text of every loaded document (the page_content field).
summaries = [doc.page_content for doc in documents]
# Show the first entry as a sanity check.
summaries[0]

'Transformers have shown great potential in computer vision tasks. A common\nbelief is their attention-based token mixer module contributes most to their\ncompetence. However, recent works show the attention-based module in\nTransformers can be replaced by spatial MLPs and the resulted models still\nperform quite well. Based on this observation, we hypothesize that the general\narchitecture of the Transformers, instead of the specific token mixer module,\nis more essential to the model\'s performance. To verify this, we deliberately\nreplace the attention module in Transformers with an embarrassingly simple\nspatial pooling operator to conduct only basic token mixing. Surprisingly, we\nobserve that the derived model, termed as PoolFormer, achieves competitive\nperformance on multiple computer vision tasks. For example, on ImageNet-1K,\nPoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned Vision\nTransformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with\n35%

In [11]:
# Spot-check another entry of the summaries list.
summaries[2]

'Transformer based language models (LMs) demonstrate increasing performance\nwith scale across a wide variety of tasks. Scale alone however cannot enable\nmodels to solve tasks that require access to ephemeral, changing, or private\ndata that was unavailable at training time. Many useful tasks may also benefit\nfrom LMs being able to access APIs that read or modify state. In this work, we\npresent Tool Augmented Language Models (TALM), combining a text-only approach\nto augment language models with non-differentiable tools, and an iterative\n"self-play" technique to bootstrap performance starting from few tool\ndemonstrations. TALM exhibits strong performance on both a knowledge-heavy QA\ntask and a reasoning oriented math task with simple tools. At a given model\nscale, TALM significantly outperforms non-augmented LMs. We further demonstrate\nthat TALM successfully performs out-of-distribution inferences on both QA and\nmath tasks, where non-augmented LMs fail. Our results suggest tha

In [29]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Representation model handed to BERTopic below.
representation_model = KeyBERTInspired()

# NOTE(review): no random seed is fixed anywhere in this cell; the topic
# numbering in the saved outputs of get_topic_info() and get_topics() differs,
# presumably because they come from separate runs -- confirm and consider
# pinning a seed.
topic_model = BERTopic(
    language="english",
    min_topic_size=10,
    nr_topics=None,
    seed_topic_list=[["BERT"], ["GPT"]],
    representation_model=representation_model,
    verbose=True,
)

# Fit the model on the summaries; returns the two values unpacked here.
topics, probs = topic_model.fit_transform(summaries)

2024-07-11 18:26:06,385 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 10/10 [00:21<00:00,  2.15s/it]
2024-07-11 18:26:28,288 - BERTopic - Embedding - Completed ✓
2024-07-11 18:26:28,290 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-11 18:26:31,139 - BERTopic - Dimensionality - Completed ✓
2024-07-11 18:26:31,141 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-11 18:26:31,150 - BERTopic - Cluster - Completed ✓
2024-07-11 18:26:31,153 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-11 18:26:32,271 - BERTopic - Representation - Completed ✓


In [30]:
# Per-topic overview table from the fitted model (displayed in the next cell).
freq = topic_model.get_topic_info()

In [31]:
# Show at most the first 10 rows of the topic overview.
freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,234,0_learning_attention_language_models,"[learning, attention, language, models, tasks,...",[Learning from human preferences is important ...
1,1,60,1_imagenet_multimodal_imagetext_visual,"[imagenet, multimodal, imagetext, visual, visi...",[Effective scaling and a flexible task interfa...


In [18]:
# NOTE(review): interactive help dump left over from exploration; consider
# removing this cell (and its long output) from the shared notebook.
help(topic_model)

Help on BERTopic in module bertopic._bertopic object:

class BERTopic(builtins.object)
 |  BERTopic(language: str = 'english', top_n_words: int = 10, n_gram_range: Tuple[int, int] = (1, 1), min_topic_size: int = 10, nr_topics: Union[int, str] = None, low_memory: bool = False, calculate_probabilities: bool = False, seed_topic_list: List[List[str]] = None, zeroshot_topic_list: List[str] = None, zeroshot_min_similarity: float = 0.7, embedding_model=None, umap_model: umap.umap_.UMAP = None, hdbscan_model: hdbscan.hdbscan_.HDBSCAN = None, vectorizer_model: sklearn.feature_extraction.text.CountVectorizer = None, ctfidf_model: sklearn.feature_extraction.text.TfidfTransformer = None, representation_model: bertopic.representation._base.BaseRepresentation = None, verbose: bool = False)
 |  
 |  BERTopic is a topic modeling technique that leverages BERT embeddings and
 |  c-TF-IDF to create dense clusters allowing for easily interpretable topics
 |  whilst keeping important words in the topic des

In [17]:
# Mapping of topic id to its list of (word, score) pairs.
topic_model.get_topics()

{0: [('imagenet', 0.47163364),
  ('multimodal', 0.4176013),
  ('imagetext', 0.37882277),
  ('visual', 0.37534803),
  ('visionlanguage', 0.36929408),
  ('learning', 0.3649571),
  ('trained', 0.3336314),
  ('neural', 0.33088285),
  ('generative', 0.3216042),
  ('images', 0.3159722)],
 1: [('learning', 0.39817607),
  ('attention', 0.39350474),
  ('language', 0.38959372),
  ('models', 0.3655308),
  ('tasks', 0.31999213),
  ('memory', 0.31300262),
  ('trained', 0.3098377),
  ('text', 0.2974574),
  ('knowledge', 0.2952758),
  ('task', 0.28992867)]}