<a href="https://colab.research.google.com/github/diegomrodrigues/llm/blob/main/RAG_de_Artigos_Arxiv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain_experimental langchain_huggingface sentence-transformers pypdf arxiv pymupdf faiss-gpu google-generativeai

Collecting langchain_experimental
  Downloading langchain_experimental-0.0.62-py3-none-any.whl (202 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m202.7/202.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [

In [2]:
!curl -o readme.md https://raw.githubusercontent.com/Hannibal046/Awesome-LLM/main/README.md

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 56129  100 56129    0     0   230k      0 --:--:-- --:--:-- --:--:--  230k


In [8]:
import re
import requests
from bs4 import BeautifulSoup

def extract_links_from_markdown(markdown_content):
    """Extract links from Markdown text, in document order.

    Args:
        markdown_content: The Markdown source as a string.

    Returns:
        A list whose items are either ``(text, url)`` tuples for
        ``[text](url)`` links, or plain URL strings for bare
        ``http(s)://`` URLs that are not directly preceded by ``(``.
    """
    # First alternative: [text](url) with two capture groups.
    # Second alternative: a bare URL (no capture groups).
    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)|(?<!\()(?:https?://\S+)'

    links = []
    # re.findall would return ('', '') for every bare-URL match because only
    # the capture groups are reported; iterate over match objects instead so
    # the bare URL itself is kept.
    for match in re.finditer(link_pattern, markdown_content):
        text, url = match.groups()
        if url is not None:
            links.append((text, url))
        else:
            links.append(match.group(0))
    return links

def scrape_links(markdown_file_path, timeout=30):
    """Collect PDF URLs referenced by a Markdown file.

    Every link returned by ``extract_links_from_markdown`` is printed and then
    classified:

    * links that do not start with "http", or that contain "manning", are
      skipped;
    * URLs ending in ".pdf" are kept unchanged;
    * other URLs containing "arxiv" are rewritten to
      ``https://arxiv.org/pdf/<id>.pdf``, where ``<id>`` is the last path
      segment of the URL;
    * any remaining URL containing neither "github" nor "img" is fetched, and
      every ``<a href>`` on that page that ends in ".pdf" and contains
      "arxiv" is kept.

    Args:
        markdown_file_path: Path of the UTF-8 Markdown file to scan.
        timeout: Seconds to wait for each fetched page before giving up, so a
            single unresponsive host cannot stall the whole scan.

    Returns:
        A list of unique PDF URLs in the order they were first seen.
    """
    # Read the Markdown file
    with open(markdown_file_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    # Extract links from the Markdown content
    links = extract_links_from_markdown(markdown_content)

    pdf_to_downloads = []

    # Process each link
    for link in links:
        if isinstance(link, tuple):
            # This is a [text](url) style link
            text, url = link
        else:
            # This is a bare URL
            url = link
            text = url

        print(f"Link text: {text}")
        print(f"URL: {url}")

        if not url.startswith("http") or "manning" in url:
            continue

        if url.endswith(".pdf"):
            pdf_to_downloads.append(url)

        elif "arxiv" in url:
            # Drop any fragment, query string and trailing slash before taking
            # the last path segment, so ".../abs/1234.5678/" or
            # ".../abs/1234.5678?context=cs" still yields "1234.5678".
            path = url.split("#")[0].split("?")[0].rstrip("/")
            arxiv_id = path.split("/")[-1]

            if arxiv_id:
                pdf_to_downloads.append(f"https://arxiv.org/pdf/{arxiv_id}.pdf")

        elif "github" not in url and "img" not in url:
            try:
                print(f"Fetching url {url}")
                # Fetch the linked page
                response = requests.get(url, timeout=timeout)
                soup = BeautifulSoup(response.text, 'html.parser')

                # Look through every anchor on the page for arXiv PDF links.
                # A separate loop variable keeps `url` pointing at the page,
                # so the error message below reports the right address.
                for anchor in soup.find_all('a'):
                    if anchor.has_attr('href') and anchor['href'].endswith('.pdf'):
                        pdf_url = anchor['href']

                        if "arxiv" in pdf_url:
                            print(f"Found PDF link: {pdf_url}")
                            pdf_to_downloads.append(pdf_url)
            except Exception as e:
                # Best effort: one unreachable page must not abort the scan.
                print(f"Error fetching {url}: {str(e)}")

        print("---")

    # De-duplicate while preserving first-seen order (a set would make the
    # order differ from run to run).
    pdf_to_downloads = list(dict.fromkeys(pdf_to_downloads))

    print("PDF Links:")
    for pdf_link in pdf_to_downloads:
        print(pdf_link)

    print("Done!")

    return pdf_to_downloads



In [2]:
from langchain.document_loaders import ArxivLoader
from langchain.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [7]:
# Embedding model shared by index construction and index loading below.
# It is configured to run on the CPU and to show a progress bar while encoding.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
    show_progress=True,
)

In [12]:
# Scrape the README for PDF links (prints its progress as it goes).
pdf_to_download = scrape_links("readme.md")

# Reduce every arXiv PDF URL to its bare identifier.
arxiv_ids = []

for pdf in pdf_to_download:
    if "arxiv" in pdf:
        parts = pdf.split("/")
        file_name = parts[-1]
        # Strip the ".pdf" suffix only when it is really there, instead of
        # blindly cutting the last four characters.
        arxiv_id = file_name[:-4] if file_name.endswith(".pdf") else file_name
        # The same paper can be linked through several URLs (different host or
        # scheme); keep each identifier once so it is not loaded and embedded
        # more than once.
        if arxiv_id and arxiv_id not in arxiv_ids:
            arxiv_ids.append(arxiv_id)

# One loader per paper.
docs_to_merge = []

for arxiv_id in arxiv_ids:
    loader = ArxivLoader(query=arxiv_id)
    docs_to_merge.append(loader)

print(f"Total documents: {len(docs_to_merge)}")

all_loaders = MergedDataLoader(loaders=docs_to_merge)

# Split the papers into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32)

all_chunks = all_loaders.load_and_split(text_splitter)

print(f"Total chunks: {len(all_chunks)}")

# Embed every chunk and index the vectors.
vectorstore = FAISS.from_documents(all_chunks, embeddings)

retriever = vectorstore.as_retriever()

Link text: ![Awesome
URL: https://awesome.re/badge.svg
Fetching url https://awesome.re/badge.svg
---
Link text: LibreChat
URL: https://github.com/danny-avila/LibreChat
---
Link text: Open-Sora
URL: https://github.com/hpcaitech/Open-Sora
---
Link text: LLM101n
URL: https://github.com/karpathy/LLM101n
---
Link text: Gemma 2
URL: https://blog.google/technology/developers/google-gemma-2/
Fetching url https://blog.google/technology/developers/google-gemma-2/
---
Link text: Awesome-LLM 
URL: #awesome-llm-
Link text: Milestone Papers
URL: #milestone-papers
Link text: Other Papers
URL: #other-papers
Link text: LLM Leaderboard
URL: #llm-leaderboard
Link text: Open LLM
URL: #open-llm
Link text: LLM Data
URL: #llm-data
Link text: LLM Evaluation
URL: #llm-evaluation
Link text: LLM Training Framework
URL: #llm-training-frameworks
Link text: LLM Deployment
URL: #llm-deployment
Link text: LLM Applications
URL: #llm-applications
Link text: LLM Books
URL: #llm-books
Link text: Great thoughts about LLM


KeyboardInterrupt: 

In [13]:
# Embed the chunks and build the FAISS index, then expose it as a retriever.
vectorstore = FAISS.from_documents(documents=all_chunks, embedding=embeddings)

retriever = vectorstore.as_retriever()

Batches:   0%|          | 0/755 [00:00<?, ?it/s]

In [21]:
# Persist the index to the ./arxiv_retriever directory so it can be reloaded
# later without re-embedding the chunks.
vectorstore.save_local("./arxiv_retriever")

In [22]:
# Bundle the saved index directory into a single zip archive.
!zip -r ./arxiv_retriever.zip ./arxiv_retriever

  adding: arxiv_retriever/ (stored 0%)
  adding: arxiv_retriever/index.faiss (deflated 7%)
  adding: arxiv_retriever/index.pkl (deflated 65%)


In [13]:
# Restore the saved index directory from the zip archive into the current directory.
!unzip ./arxiv_retriever.zip -d ./

Archive:  ./arxiv_retriever.zip
   creating: ./arxiv_retriever/
  inflating: ./arxiv_retriever/index.faiss  
  inflating: ./arxiv_retriever/index.pkl  


In [14]:
# Reload the persisted index using the same embedding model.
# NOTE(security): allow_dangerous_deserialization=True permits unpickling the
# saved index.pkl, which can execute arbitrary code. Only load an archive that
# you created yourself or otherwise trust.
vectorstore = FAISS.load_local("./arxiv_retriever", embeddings, allow_dangerous_deserialization=True)

In [21]:
def retrieve(query, retriever):
    """Run ``query`` through ``retriever`` and render each hit as a tagged string.

    Args:
        query: The search text passed to ``retriever.invoke``.
        retriever: Any object whose ``invoke(query)`` returns documents that
            expose ``metadata['Title']``, ``metadata['Summary']`` and
            ``page_content``.

    Returns:
        A list with one ``<Document ...>...</Document>`` string per hit, in
        the order returned by the retriever, numbered from 1.
    """
    def _render(position, hit):
        # Pieces are joined with no separator, so the result is one unbroken
        # line of tags per document.
        meta = hit.metadata
        pieces = [
            f"<Document index={position} title={meta['Title']}>",
            "<Sumary>",
            meta['Summary'],
            "</Sumary>",
            "<Content>",
            hit.page_content,
            "</Content>",
            "</Document>",
        ]
        return "".join(pieces)

    hits = retriever.invoke(query)
    return [_render(position, hit) for position, hit in enumerate(hits, start=1)]

def create_prompt_for_summary(query, k=15):
    """Assemble a retrieval-augmented prompt for ``query`` and display it.

    The prompt template is read from "Template para RAG Artigos.md". Chunks
    retrieved from the module-level ``vectorstore`` replace the template's
    ``<context></context>`` placeholder, and the query plus the answer cue are
    appended at the end.

    Args:
        query: The topic/question to retrieve context for; it is also appended
            to the prompt as ``X = <query>``.
        k: Number of chunks requested from the vector store.

    Returns:
        None. The finished prompt is shown with ``display`` only.
    """
    # Read the template as UTF-8 explicitly so the result does not depend on
    # the platform's default encoding.
    with open("Template para RAG Artigos.md", "r", encoding="utf-8") as f:
        template = f.read()

    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    documents = retrieve(query, retriever)

    context = "\n".join(documents)
    context = f"""<context>
    {context}
    </context>"""

    # If the template does not contain the exact placeholder, this replace is
    # a no-op and the prompt is produced without any retrieved context.
    template = template.replace("<context></context>", context)
    template = (
        template + "\n\n" +
        "X = " + query + "\n\n" +
        "Resposta em português:"
    )

    display(template)

# Example topic: build and display the prompt for it.
X = "Stability of training Large Language Models"

create_prompt_for_summary(X)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

"You are Perplexica, an AI model who is expert at searching the web and answering user's queries.\n\nGenerate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).\nYou must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.\nYou must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.\nYour responses should be long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use the template provided below in \\`template\\` section. Make sure the answer is not short and is informative.\nYou have to cite the answer using [number] notation. You must cite the sentences with their re