### Loading PDF

In [124]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

# Location of the PDF processed by this notebook.
# NOTE(review): this is an absolute path under one user's home directory, so the
# notebook will not run elsewhere as-is — consider a configurable data directory.
path = "/home/cristian/projects/rag_pae/data/pdfs/amazonica/A20.pdf"

def load_pdf(path: str):
    """Load a PDF through ``PyMuPDF4LLMLoader``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        Whatever ``PyMuPDF4LLMLoader(path).load()`` returns; elsewhere in this
        notebook the result is iterated and each item exposes ``page_content``.
    """
    return PyMuPDF4LLMLoader(path).load()

# Load the PDF once; `doc` is the value returned by the loader's load() call.
doc = load_pdf(path)

In [125]:
# Dump the extracted text of every loaded item, separated by a ruler line.
# The header uses a 1-based counter: interpolating `page` itself would print the
# whole Document repr (content plus metadata) instead of a page number.
# NOTE(review): assumes the loader yields one item per PDF page — confirm.
for page_number, page in enumerate(doc, start=1):
    print(f"Page {page_number}")
    print(page.page_content)
    print("\n" + "=" * 80 + "\n")

Page page_content='# Conflictos socioambientales en la macrocuenca del Orinoco

**Aportes en identificación y**
**transformación de estos conflictos**

Oscar Andrés Prieto-Cruz [a] ; Diana Morales [a] y Omar Ruiz-Nieto [a]
## Reflexionar y poner a disposición el conocimiento sobre las características de los conflictos socioambientales, es una herramienta para fomentar espacios de diálogo que promuevan la comprensión mutua, el análisis de las controversias y la construcción de entendimientos.

En el año 2022 el Instituto Humboldt apoyó al
Ministerio de Ambiente y Desarrollo Sostenible
en la implementación de una de las instancias de
articulación, cooperación y participación para la
reflexión sobre alternativas de prevención y transformación positiva de los conflictos socioambientales
a escala regional, como lo es, el Centro Regional de
Diálogo Ambiental [1] de la macrocuenca Orinoco. Este
apoyo incluyó la identificación y caracterización de
conflictos socioambientales en los departament

### Chunking

Semantic chunking based on embedding similarity (currently disabled: the cell below is commented out)

In [126]:
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_ollama import OllamaEmbeddings
# from langchain.vectorstores import Chroma

# embeddings = OllamaEmbeddings(model="nomic-embed-text")

# semantic_splitter = SemanticChunker(
#     embeddings=embeddings,
#     breakpoint_threshold_type="percentile",  
#     breakpoint_threshold_amount=95,  
#     number_of_chunks=None,  
#     buffer_size=1  
# )

# split_docs = semantic_splitter.split_documents(doc)
# vectorstore = Chroma.from_documents(documents=split_docs, embedding= embeddings)
# retriever = vectorstore.as_retriever()


Standard chunking (recursive character splitting with a fixed chunk size and overlap)

In [127]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma

# Chunking parameters. Both are measured in characters because the splitter is
# configured with length_function=len.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# Embedding model served by Ollama; used to index the chunks below.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
split_docs = text_splitter.split_documents(doc)

# Index the chunks and expose them through a retriever with default settings.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# again to the store — verify, and clear the collection first if so.
vectorstore = Chroma.from_documents(documents=split_docs, embedding=embeddings)
retriever = vectorstore.as_retriever()

print(f"Number of chunks: {len(split_docs)}")

Number of chunks: 6


### Structured Output

In [128]:
from pydantic import BaseModel, Field
from typing import List, Optional

# Schema of the metadata record the structured-output LLM must return.
# Documented with comments rather than a class docstring so the schema that is
# generated from this model stays limited to the field descriptions below.
#
# `title` and `authors` are mandatory. Every Optional field carries an explicit
# `default=None`: without it the field is still *required* under Pydantic v2
# (it may be null but must be present), so a response that simply omits an
# unknown field would fail validation even though the prompt asks the model to
# leave uncertain fields empty.
class AcademicPaper(BaseModel):
    title: str = Field(description="The complete and exact title of the article.")
    authors: List[str] = Field(description="The list of authors of the article, in the order they appear in the publication.")
    publication_year: Optional[int] = Field(default=None, description="The year in which the article was published.")
    journal: Optional[str] = Field(default=None, description="The name of the journal or conference proceedings where it was published.")
    abstract: Optional[str] = Field(default=None, description="The complete and accurate abstract as it appears in the article.")
    keywords: Optional[List[str]] = Field(default=None, description="A list of keywords associated with the article, if available.")
    regions: Optional[List[str]] = Field(default=None, description="The list of Colombian geographical regions where the study is focused in order of relevance, if applicable.")

In [129]:
from langchain_ollama import ChatOllama
# Chat model used for extraction. temperature=0 requests greedy decoding so that
# repeated runs over the same text give the same metadata; extraction should copy
# what is in the paper, not sample creative variations.
llm = ChatOllama(model="llama3.2", temperature=0)

# Wrapper whose invoke() returns an AcademicPaper instance instead of free text.
llm_structured = llm.with_structured_output(AcademicPaper)

### Extraction chain

In [132]:
# Base instruction template shared by every field extraction. It contains a
# single `{context}` placeholder that is filled with the retrieved paper text.
# Built from one entry per prompt line; each line ends with a newline.
system_prompt = "\n".join(
    [
        # General rules
        "You are an expert in extracting metadata from academic papers.",
        "Your task is to extract the following metadata from the provided text:",
        "Do not include any additional information or explanations.",
        "You need to be conservative in your extraction, if you are not sure about a field, leave it empty.",
        "Do not change the language of the metadata, keep it in the original language of the paper.",
        "Assume spanish as the primary language for metadata extraction.",
        "Do not take into account references or citations when extracting metadata.",
        # Edge cases
        "EDGE CASES:",
        "- If there are multiple titles, extract the main article title.",
        "- If the paper is in multiple languages, extract metadata in the primary language.",
        "- Ignore bibliographic references when extracting metadata.",
        "- If the year appears multiple times, use the publication year, not the submission or acceptance year.",
        "- If there are abstracts in multiple languages, extract the abstract in the primary language of the paper. You are focused on Spanish texts",
        # Field definitions
        "The metadata fields you need to extract are:",
        "- Title: The complete and exact title of the article, not section headings.",
        "- Authors: List all primary authors, not editors or reviewers.",
        "- Publication Year: The year of publication, not submission or acceptance.",
        "- Journal: The name of the journal or conference proceedings where it was published, not the publisher.",
        "- Abstract: The complete abstract content, excluding the word 'Abstract.'.",
        "- Keywords: A list of keywords associated with the article, if available.",
        "- Regions: A list of Colombian geographical regions where the study is focused, if applicable. ",
        # Context slot
        "You will be provided with the text of an academic paper. Based on the following academic paper text: {context}",
    ]
) + "\n"



# Retrieval queries per metadata field. Each query is sent to the retriever on
# its own and the resulting chunks are pooled. Key order matters: it is the
# order in which fields are processed when the mapping is iterated.
FIELD_QUERIES = {
    "title": [
        "title", "article title", "research title", "paper title", "titulo",
    ],
    "authors": [
        "written by", "researchers", "corresponding  author", "author list",
        "authors", "autores", "autor",
    ],
    "publication_year": [
        "published", "publication year", "copyright", "year",
    ],
    "journal": [
        "journal", "published in", "proceedings", "conference", "publication venue",
    ],
    "abstract": [
        "abstract", "summary", "resumen", "resumo",
    ],
    "keywords": [
        "keywords", "key terms", "palabras clave", "termos-chave", "tags",
    ],
    "regions": [
        "region", "geographical area", "geographic region", "area of study",
        "geographical scope",
    ],
}

# Field-specific instruction appended to the base prompt when extracting that
# field. Keys mirror the metadata field names.
FIELD_PROMPTS = {
    "title": "Focus specifically on extracting the main title of this academic paper. Ignore section headings, references, or secondary titles.",
    "authors": "Focus on extracting all primary author names. Include only the main authors, not editors, reviewers, or cited authors.",
    "abstract": "Focus on extracting the complete abstract section. Include the full abstract content but exclude the word 'Abstract' itself.",
    "publication_year": "Focus on extracting the publication year of the article. Ignore submission or acceptance dates.",
    "journal": "Focus on extracting the name of the journal or conference proceedings where the article was published. Ignore publisher names.",
    "keywords": "Focus on extracting a list of keywords associated with the article, if available. If no keywords are provided, leave this field empty.",
    # Adjacent literals are concatenated with no separator, so every fragment
    # except the last must end with a space; otherwise sentences run together.
    "regions": (
        "Focus on extracting the Colombian geographical region where the study is focused. "
        "ONLY extract regions that are specifically Colombian regions such as: "
        "Amazonía, Andina, Atlántica, Insular, Orinoquía, Pacífica. "
        "If the study mentions other countries or non-Colombian regions, ignore them. "
        "If no Colombian region is mentioned, leave this field empty. "
        "If you are not sure about the region, leave it empty. "
        "Colombia is not a region, it is a country, do not include it in the regions field."
    ),
}

In [133]:
import pprint

def extract_single_field(field_name, retriever, llm_structured):
    """Run one retrieval-augmented extraction focused on a single metadata field.

    Args:
        field_name: Key into ``FIELD_QUERIES`` and ``FIELD_PROMPTS``.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        llm_structured: Structured-output model; its ``invoke(prompt)`` result
            is returned unchanged.

    Returns:
        The object produced by ``llm_structured.invoke`` for the assembled prompt.

    Raises:
        KeyError: If ``field_name`` has no entry in ``FIELD_QUERIES`` or
            ``FIELD_PROMPTS``.
    """
    # 1. Look up the retrieval queries configured for this field.
    field_queries = FIELD_QUERIES[field_name]

    # 2. Retrieve with every query. Different queries often return the same
    #    chunk, so keep only the first occurrence of each text (order preserved)
    #    to avoid feeding the model repeated context.
    retrieved_docs = []
    seen_texts = set()
    for query in field_queries:
        # `invoke` is the current retriever entry point; it replaces the
        # deprecated `get_relevant_documents`.
        for chunk in retriever.invoke(query):
            if chunk.page_content in seen_texts:
                continue
            seen_texts.add(chunk.page_content)
            retrieved_docs.append(chunk)

    # 3. Merge the unique chunks into a single context string.
    combined_text = "\n".join(chunk.page_content for chunk in retrieved_docs)

    # 4. Fill the context placeholder first, then append the field instruction,
    #    so braces inside an instruction can never be parsed as placeholders.
    field_instruction = FIELD_PROMPTS[field_name]
    full_prompt = system_prompt.format(context=combined_text) + "\n" + field_instruction + "\n"

    # 5. Extract with the structured LLM and hand the result back.
    return llm_structured.invoke(full_prompt)

def extract_metadata(retriever, llm_structured):
    """Collect every metadata field listed in ``FIELD_QUERIES`` into a dict.

    All fields except ``regions`` are read from the extraction that is driven
    by the ``"title"`` queries and prompt; ``regions`` uses its own. Each
    distinct extraction is executed once and reused, so the fields that share a
    source all come from the same model response instead of from several
    identical retrieval + LLM round trips.

    Args:
        retriever: Passed through to ``extract_single_field``.
        llm_structured: Passed through to ``extract_single_field``; its result
            must provide ``model_dump()``.

    Returns:
        Dict mapping each field name to the value extracted for it.
    """
    # NOTE(review): because of this routing, the FIELD_QUERIES / FIELD_PROMPTS
    # entries for these fields (other than "title") are never used — confirm
    # this is intentional.
    title_driven_fields = {"title", "authors", "publication_year", "journal", "abstract", "keywords"}

    dumps_by_source = {}  # routed field name -> model_dump() of its extraction
    metadata = {}
    for field in FIELD_QUERIES:
        print(f"Extracting field: {field}")
        pointing_field = "title" if field in title_driven_fields else field
        if pointing_field not in dumps_by_source:
            extraction = extract_single_field(pointing_field, retriever, llm_structured)
            dumps_by_source[pointing_field] = extraction.model_dump()
        metadata[field] = dumps_by_source[pointing_field][field]
    return metadata

# Run the extraction with the module-level retriever and structured LLM, then
# pretty-print the returned field -> value dict.
result = extract_metadata(retriever, llm_structured)
pprint.pprint(result)

Extracting field: title
Extracting field: authors
Extracting field: publication_year
Extracting field: journal
Extracting field: abstract
Extracting field: keywords
Extracting field: regions
{'abstract': 'La macrocuenca Orinoco es una región estratégica debido a su '
             'alta diversidad biológica, ecológica y cultural. En Colombia '
             'cuenta con un Ñrea de 34 720 832,5 ha distribuidas en 72 '
             'subcuencas de nueve zonas hidrográficas',
 'authors': ['Oscar Andrés Prieto-Cruz', 'Diana Morales', 'Omar Ruiz-Niùto'],
 'journal': 'Centros Regionales de Diálogo Ambiental (CRDA)',
 'keywords': ['conflictos socioambientales',
              'macrocuenca del Orinoco',
              'diversidad biológica',
              'ecología cultural'],
 'publication_year': 2022,
 'regions': ['Orinoquía', 'Amazonía'],
 'title': 'Conflictos socioambientales en la macrocuenca del Orinoco'}


In [123]:
# Cleanup step for the vector store.
# NOTE(review): presumably removes the collection behind `vectorstore` so that a
# re-run starts from an empty index — verify. This cell's execution count (123)
# is lower than the first cell's (124), i.e. it was run out of order; on a fresh
# kernel `vectorstore` only exists after the chunking cell has run.
vectorstore.delete_collection()