In [16]:
!pip install requests PyMuPDF sentence-transformers chromadb --quiet
from huggingface_hub import login
login(token="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")

import requests
import fitz
import os
import chromadb
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer


if torch.cuda.is_available():
  device=torch.device("cuda")
  print("GPU is available")
elif torch.backends.mps.is_available():
  device=torch.device("mps")
  print("MPS is available")
else:
  device=torch.device("cpu")
  print("CUDA is not available")

chroma_client=chromadb.PersistentClient(path="./chroma_db")
collection=chroma_client.get_or_create_collection(name="arxiv_papers")

embedding_model=SentenceTransformer("BAAI/bge-m3").to(device)

GPU is available


In [2]:

llama_model_name="meta-llama/Llama-3.2-1B"
tokenizer=AutoTokenizer.from_pretrained(llama_model_name)
model=AutoModelForCausalLM.from_pretrained(llama_model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
def extract_arxiv_id(abstract_url):
    return abstract_url.split("/")[-1]

In [4]:
def download_pdf(arxiv_id):
  pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
  response = requests.get(pdf_url)

  if response.status_code == 200:
    pdf_path = f"{arxiv_id}.pdf"
    with open(pdf_path,"wb") as f:
      f.write(response.content)
    return pdf_path
  else:
    raise Exception(f"Failed to download PDF: {pdf_url}")

In [5]:
def extract_text_from_pdf(pdf_path):
  doc = fitz.open(pdf_path)
  text = "\n".join([page.get_text("text") for page in doc])
  doc.close()
  return text

In [6]:
def chunk_text(text,chunk_size=512):
  words = text.split()
  return [" ".join(words[i:i+chunk_size]) for i in range(0,len(words),chunk_size)]

In [12]:
def store_in_chroma(arxiv_id,text):
  text_chunks = chunk_text(text)
  print(text_chunks)

  with torch.no_grad():
    embeddings = embedding_model.encode(text_chunks,normalize_embeddings=True, convert_to_tensor = True).cpu().tolist()

  for i,chunk in enumerate(text_chunks):
    collection.add(
        ids = [f"{arxiv_id}-{i}"],
        documents = [chunk],
        metadatas = [{"source": f"https://arxiv.org/{arxiv_id}", "chunk_id":i}],
        embeddings = [embeddings[i]]
    )

In [8]:
def process_arxiv_paper(abstract_url):
  arxiv_id = extract_arxiv_id(abstract_url)
  print(f"Processing arxiv paper: {arxiv_id}")
  pdf_path = download_pdf(arxiv_id)
  text = extract_text_from_pdf(pdf_path)
  print(f"Extracted {len(text)} characters of text")
  store_in_chroma(arxiv_id,text)
  os.remove(pdf_path)
  print("Processing complete")

In [9]:
def retrieve_relevant_chunks(query,top_k = 5):
  query_embedding = embedding_model.encode(query,normalize_embeddings=True).tolist()
  results = collection.query(
      query_embeddings = [query_embedding],
      n_results = top_k
  )

  if results["documents"]:
    return results["documents"][0]
  else:
    return []


In [19]:
def generate_response(query, max_new_tokens = 3000,top_k=3):
  retrieved_texts = retrieve_relevant_chunks(query,top_k=top_k)
  if not retrieved_texts:
    return "No relevant information found in the database."
  context = "\n\n".join(retrieved_texts)[:3000]
  prompt = f"Context:\n{context}\n\nQuestion:{query}\nAnswer:"
  inputs = tokenizer(prompt, return_tensors = "pt", truncation = True).to(device)
  with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens = max_new_tokens)
    answer = tokenizer.decode(outputs[0], skip_special_tokens = True)
  return answer

In [20]:
process_arxiv_paper("https://arxiv.org/abs/2503.01751")
print(generate_response("What does the conclusion of this paper say?"))

Processing arxiv paper: 2503.01751
Extracted 51153 characters of text
['SAKE: Steering Activations for Knowledge Editing Marco Scialanga*,1,2, Thibault Laugel*,1,3, Vincent Grari1,3, Marcin Detyniecki1,3,4 1AXA, Paris, France, 2EPFL, Lausanne, Switzerland, 3TRAIL, LIP6, Sorbonne Université, Paris, France 4Polish Academy of Science, IBS PAN, Warsaw, Poland *Equal contribution Correspondence: marco.scialanga@epfl.ch, thibault.laugel@axa.com Abstract As Large Langue Models have been shown to memorize real-world facts, the need to update this knowledge in a controlled and efficient manner arises. Designed with these constraints in mind, Knowledge Editing (KE) approaches propose to alter specific facts in pretrained mod- els. However, they have been shown to suffer from several limitations, including their lack of contextual robustness and their failure to gener- alize to logical implications related to the fact. To overcome these issues, we propose SAKE, a steering activation method that m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing complete
Context:
instance by Yao et al. (2023), are a type of logical implications that is not addressed, and cannot be easily integrated into our method. References Sawsan Alqahtani, Garima Lalwani, Yi Zhang, Salva- tore Romeo, and Saab Mansour. 2021. Using optimal transport as alignment objective for fine-tuning mul- tilingual contextualized embeddings. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 3904–3919. Andy Arditi, Oscar Obeso, Aaquib Syed, Daniel Paleka, Nina Panickssery, Wes Gurnee, and Neel Nanda. 2024. Refusal in language models is mediated by a single direction. arXiv preprint arXiv:2406.11717. Hoyeon Chang, Jinho Park, Seonghyeon Ye, Sohee Yang, Youngkyung Seo, Du-Seong Chang, and Min- joon Seo. 2024. How do large language models acquire factual knowledge during pretraining? In Proceedings of the 38th Conference on Neural Infor- mation Processing Systems (NeurIPS). Curran Asso- ciates, Inc. Roi Cohen, Eden Biran, Ori Yoran, A