### Loading PDF

In [24]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

path = "/home/cristian/projects/rag_pae/data/pdfs/amazonica/A4.pdf"

def load_pdf(path: str):
    loader = PyMuPDF4LLMLoader(path)
    doc = loader.load()
    return doc

doc = load_pdf(path)

In [25]:
for page in doc:
    print(f"Page {page}")
    print(page.page_content)
    print("\n" + "=" * 80 + "\n")

Page page_content='ISSN 0187-6961


Estudios Fronterizos, nueva época, vol. 16, núm. 31, enero-junio de 2015, pp. 39-64

# **El lado colombiano de la frontera colombo-brasilera. ** **Una aproximación desde la categoría de área sin ley [1]** **The Colombian side of the Colombian-Brazilian ** **border. An approach using the category of lawless area **

_Luis Fernando Trejos Rosero*_


Resumen

En este trabajo se realiza una aproximación
desde la categoría de _área sin ley_, a la compleja situación de seguridad generada por los
crecientes vínculos de la criminalidad local

con redes del crimen organizado transnacional en el lado colombiano de la frontera

colombo-brasilera. El establecimiento permanente de actores armados ilegales en este
espacio fronterizo se debe en gran medida
a la ausencia y débil presencia del Estado
colombiano en sus periferias. El abordaje
metodológico es de tipo bibliográfico y descriptivo.
_Palabras clave_ : Estado fallido, frontera, área
sin ley, guerrilla, narc

### Chunking

Semantic Chunking based on similarity

In [26]:
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_ollama import OllamaEmbeddings
# from langchain.vectorstores import Chroma

# embeddings = OllamaEmbeddings(model="nomic-embed-text")

# semantic_splitter = SemanticChunker(
#     embeddings=embeddings,
#     breakpoint_threshold_type="percentile",  
#     breakpoint_threshold_amount=95,  
#     number_of_chunks=None,  
#     buffer_size=1  
# )

# split_docs = semantic_splitter.split_documents(doc)
# vectorstore = Chroma.from_documents(documents=split_docs, embedding= embeddings)
# retriever = vectorstore.as_retriever()


Normal chunking

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma

embeddings = OllamaEmbeddings(model="nomic-embed-text")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len
)
split_docs = text_splitter.split_documents(doc)
vectorstore = Chroma.from_documents(documents=split_docs, embedding= embeddings)
retriever = vectorstore.as_retriever()

print(f"Number of chunks: {len(split_docs)}")

Number of chunks: 54


### Structured Output

In [28]:
from pydantic import BaseModel, Field
from typing import List, Optional

class AcademicPaper(BaseModel):
    title: str = Field(description="The complete and exact title of the article.")
    authors: List[str] = Field(description="The list of authors of the article, in the order they appear in the publication.")
    publication_year: int = Field(description="The year in which the article was published.")
    journal: str = Field(description="The name of the journal or conference proceedings where it was published.")
    abstract: Optional[str] = Field(description="The complete and accurate abstract as it appears in the article.")

In [29]:
from langchain_ollama import ChatOllama
llm = ChatOllama(model="llama3.2")
llm_structured = llm.with_structured_output(AcademicPaper)

### Extraction chain

In [30]:
system_prompt = (
    "You are an expert in extracting metadata from academic papers.\n"
    "Your task is to extract the following metadata from the provided text:\n"
    "Do not include any additional information or explanations.\n"
    "You need to be conservative in your extraction, if you are not sure about a field, leave it empty.\n"
    "Do not change the language of the metadata, keep it in the original language of the paper.\n"
    "Assume spanish as the primary language for metadata extraction.\n"
    "Do not take into account references or citations when extracting metadata.\n"

    "EDGE CASES:\n" \
    "- If there are multiple titles, extract the main article title.\n" \
    "- If the paper is in multiple languages, extract metadata in the primary language.\n" \
    "- Ignore bibliographic references when extracting metadata.\n" \
    "- If the year appears multiple times, use the publication year, not the submission or acceptance year.\n"
    "- If there are abstracts in multiple languages, extract the abstract in the primary language of the paper. You are focused on Spanish texts\n"

    "The metadata fields you need to extract are:\n"
    "- Title: The complete and exact title of the article, not section headings.\n"
    "- Authors: List all primary authors, not editors or reviewers.\n"
    "- Publication Year: The year of publication, not submission or acceptance.\n"
    "- Journal: The name of the journal or conference proceedings where it was published, not the publisher.\n"
    "- Abstract: The complete abstract content, excluding the word 'Abstract.'.\n"

    "You will be provided with the text of an academic paper. Based on the following academic paper text: {context}\n"
)



FIELD_QUERIES = {
    "title": ["title", "article title", "research title", "paper title", "titulo"],
    "authors": ["written by", "researchers", "corresponding  author", "author list", "authors", "autores", "autor"],
    "publication_year": ["published", "publication year", "copyright", "year"],
    "journal": ["journal", "published in", "proceedings", "conference", "publication venue"],
    "abstract": ["abstract", "summary", "resumen", "resumo"]
}

FIELD_PROMPTS = {
    "title": "Focus specifically on extracting the main title of this academic paper. Ignore section headings, references, or secondary titles.",
    "authors": "Focus on extracting all primary author names. Include only the main authors, not editors, reviewers, or cited authors.",
    "abstract": "Focus on extracting the complete abstract section. Include the full abstract content but exclude the word 'Abstract' itself.",
    "publication_year": "Focus on extracting the publication year of the article. Ignore submission or acceptance dates.",
    "journal": "Focus on extracting the name of the journal or conference proceedings where the article was published. Ignore publisher names."
}

In [31]:
import pprint

def extract_single_field(field_name, retriever, llm_structured):
    # 1. Obtener queries para este campo específico
    field_queries = FIELD_QUERIES[field_name]
    
    # 2. Hacer retrieval con cada query del campo
    retrieved_docs = []
    for query in field_queries:
        docs = retriever.get_relevant_documents(query)
        retrieved_docs.extend(docs)
    
    # 3. Combinar todos los chunks relevantes
    combined_text = "\n".join([doc.page_content for doc in retrieved_docs])
    
    # 4. Crear prompt específico para este campo
    field_instruction = FIELD_PROMPTS[field_name]
    full_prompt = system_prompt + "\n" + field_instruction + "\n"
    
    # 5. Extraer usando LLM estructurado
    result = llm_structured.invoke(full_prompt.format(context=combined_text))
    
    # 6. Devolver resultado
    return result

result = extract_single_field("title", retriever, llm_structured)
pprint.pprint(result.model_dump())

{'abstract': 'Este trabajo se realiza una aproximación desde la categoría de '
             'área sin ley, a la compleja situación de seguridad generada por '
             'los crecientes vínculos de la criminalidad local con redes del '
             'crimen organizado transnacional en el lado colombiano de la '
             'frontera colombo-brasilera. El establecimiento permanente de '
             'actores armados ilegales en este espacio fronterizo se debe en '
             'gran medida a la ausencia y debil presencia del Estado '
             'colombiano en sus periferias.',
 'authors': ['Luis Fernando Trejos Rosero'],
 'journal': 'Estudios Fronterizos',
 'publication_year': 2015,
 'title': 'El lado colombiano de la frontera colombo-brasilera. Una '
          'aproximacion desde la categoría de área sin ley'}


In [21]:
vectorstore.delete_collection()