In [None]:
import json
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from typing import Iterator

class CustomArxivLoader(ArxivLoader):
    """ArxivLoader variant that stamps every loaded document with its arXiv origin.

    Each document produced by ``lazy_load`` keeps the metadata supplied by the
    parent loader and gains two extra keys derived from ``self.query``:
    ``"ArxivId"`` (the query string itself) and ``"Source"`` (a PDF URL built
    from that query string).
    """

    def __init__(self, **kwargs):
        # Keyword-only pass-through to the parent constructor.
        super().__init__(**kwargs)

    def lazy_load(self) -> Iterator[Document]:
        """Return a generator of the parent's documents with enriched metadata."""
        # The parent iterator is obtained immediately; the documents themselves
        # are rebuilt one by one as the returned generator is consumed.
        parent_documents = super().lazy_load()

        return (
            Document(
                page_content=loaded.page_content,
                metadata={
                    **loaded.metadata,
                    "ArxivId": self.query,
                    "Source": f"https://arxiv.org/pdf/{self.query}.pdf",
                },
            )
            for loaded in parent_documents
        )

# Slug interpolated into the crawler results path below.
retriever = "large-language-models"

# Explicit UTF-8 so the JSON is decoded the same way on every platform.
with open(f"../assets/results/crawler_{retriever}.json", encoding="utf-8") as file:
    results = json.load(file)

arxiv_urls = results["collected_urls"]["arxiv.org"]


def _arxiv_id_from_url(url):
    """Return the last path segment of ``url`` without a trailing ``.pdf``."""
    last_segment = url.split("/")[-1]
    # str.strip(".pdf") would remove any of the characters '.', 'p', 'd', 'f'
    # from BOTH ends instead of the literal suffix, so cut the suffix explicitly.
    if last_segment.endswith(".pdf"):
        last_segment = last_segment[: -len(".pdf")]
    return last_segment


# A list rather than a one-shot map iterator, so the ids can be inspected or
# reused without silently turning up empty.
arxiv_ids = [_arxiv_id_from_url(url) for url in arxiv_urls]

# One loader per id, merged so that a single load() call collects everything.
all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]

merged_loader = MergedDataLoader(loaders=all_loaders)

documents = merged_loader.load()

MuPDF error: syntax error: could not parse color space (1138 0 R)



In [None]:
# Sanity check: how many Document objects merged_loader.load() returned.
len(documents)

In [None]:
!pip install bertopic

In [None]:
# Plain text of every loaded document, in the same order as `documents`.
articles = [document.page_content for document in documents]

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Representation model handed to BERTopic below.
representation_model = KeyBERTInspired()

# NOTE(review): no random seed is fixed in this cell, so topic assignments may
# differ between runs — confirm whether reproducibility matters here.
topic_model = BERTopic(
    language="english",
    min_topic_size=10,
    nr_topics=None,
    representation_model=representation_model,
    verbose=True,
)

# Fit on the article texts; `topics` is later zipped one-to-one with `documents`.
topics, probs = topic_model.fit_transform(articles)

In [None]:
# Per-topic summary from the fitted model; its first rows are inspected with .head(10).
freq = topic_model.get_topic_info()

In [None]:
# Show only the first 10 rows instead of dumping the whole table.
freq.head(10)

In [None]:
# Render BERTopic's bar-chart visualisation with its default arguments.
topic_model.visualize_barchart()

In [None]:
# Pass the texts the model was fitted on: `articles` is what fit_transform
# received, whereas `summaries` is not defined anywhere in the visible notebook
# and would raise NameError on a fresh kernel.
topic_model.visualize_documents(articles)

In [None]:
# NOTE(review): interactive exploration leftover — prints the help text for the
# fitted model object; consider removing before sharing the notebook.
help(topic_model)

In [None]:
# Build short labels from up to 3 words per topic, space-separated, without the
# topic-id prefix.
# NOTE(review): per the BERTopic docs the returned list is ordered by ascending
# topic id (starting at -1 when an outlier topic exists), so a list position is
# not necessarily equal to the topic id — confirm before indexing by topic id.
topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
topic_labels

In [None]:
# generate_topic_labels() yields one label per topic id in ascending id order,
# and the lowest id is -1 when the model produced an outlier topic. Indexing the
# list directly with a topic id therefore shifts every label by one (and gives
# outliers the last topic's label), so map ids to labels explicitly.
topic_ids = sorted(set(topic_model.topics_))
assert len(topic_ids) == len(topic_labels), "expected exactly one label per topic id"
label_by_topic = dict(zip(topic_ids, topic_labels))

# Group the loaded documents under the label of the topic they were assigned to.
topic_documents = {label: [] for label in topic_labels}

for doc, topic in zip(documents, topics):
    label = label_by_topic[topic]

    topic_documents[label].append(doc)

    print(doc.metadata['Title'], label)

In [None]:
topic_model.