# Upsert documents

## Download documents

In [1]:
urls = [
    "https://bocyl.jcyl.es/boletines/2025/04/11/xml/BOCYL-D-11042025-15.xml",
    "https://bocyl.jcyl.es/boletines/2025/03/21/xml/BOCYL-D-21032025-21.xml",
    "https://bocyl.jcyl.es/boletines/2025/03/21/xml/BOCYL-D-21032025-26.xml",
    "https://bocyl.jcyl.es/boletines/2025/03/07/xml/BOCYL-D-07032025-16.xml",
    "https://bocyl.jcyl.es/boletines/2025/03/07/xml/BOCYL-D-07032025-17.xml",
    "https://bocyl.jcyl.es/boletines/2025/02/26/xml/BOCYL-D-26022025-18.xml",
    "https://bocyl.jcyl.es/boletines/2025/02/10/xml/BOCYL-D-10022025-2.xml",
    "https://bocyl.jcyl.es/boletines/2025/02/10/xml/BOCYL-D-10022025-11.xml",
    "https://bocyl.jcyl.es/boletines/2025/02/10/xml/BOCYL-D-10022025-12.xml",
    "https://bocyl.jcyl.es/boletines/2025/02/10/xml/BOCYL-D-10022025-13.xml",
]


In [2]:
from pathlib import Path
folder_documents = Path("/workspace/data/documents/BOCYL")

In [3]:
from modules.preprocessing import BOCYLMarkdownExporter
exporter = BOCYLMarkdownExporter(folder_documents)

In [4]:
paths = []

for url in urls:
    exporter.export(url)
    filename = url.split('/')[-1].split('.')[0]
    path = folder_documents / f"{filename}.md"
    paths.append(path)

paths

File /workspace/data/documents/BOCYL/BOCYL-D-11042025-15.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-21032025-21.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-21032025-26.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-07032025-16.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-07032025-17.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-26022025-18.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-10022025-2.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-10022025-11.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-10022025-12.md already exists. Skipping export.
File /workspace/data/documents/BOCYL/BOCYL-D-10022025-13.md already exists. Skipping export.


[PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-11042025-15.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-21032025-21.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-21032025-26.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-07032025-16.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-07032025-17.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-26022025-18.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-2.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-11.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-12.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-13.md')]

## 

### Instantiate client

In [5]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from pathlib import Path
import os

In [6]:
# Define the path where the database will be stored
db_path = "/workspace/data/vectordb/chromadb"

# Create directory if it doesn't exist
os.makedirs(db_path, exist_ok=True)

# Initialize the ChromaDB client with persistent storage
client = chromadb.PersistentClient(path=db_path)

### Create collection

In [7]:
# Create a new collection (or get existing one)
collection_name = "bocyl"
collection = client.get_or_create_collection(name=collection_name)

### Insert documents

#### Define functions to extract metadata

In [8]:
import re
from datetime import datetime

# Function to extract metadata from filename
def extract_metadata_from_filename(filename):
    # Pattern: BOCYL-D-DDMMYYYY-NN
    pattern = r"BOCYL-D-(\d{2})(\d{2})(\d{4})-(\d+)"
    match = re.match(pattern, filename)
    
    if match:
        day, month, year, doc_num = match.groups()
        # Create date object
        doc_date = datetime.strptime(f"{day}/{month}/{year}", "%d/%m/%Y").strftime("%Y-%m-%d")
        
        return {
            "doc_id": filename,
            "date": doc_date,
            "doc_number": int(doc_num),
            "source": "BOCYL",
            "type": "official_document",
            "area": "AGRARIA",
            "consejeria": "AGRICULTURA Y PESCA"
        }
        
    return {"doc_id": filename}

# Function to extract potential metadata from document content
def extract_metadata_from_content(content):
    # Example patterns to look for in document content
    title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
    title = title_match.group(1) if title_match else "Unknown Title"
    
    # Could extract more metadata based on document structure
    # This is a simplified example
    
    return {
        "title": title
    }

#### Define embedding model

In [9]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = HuggingFaceEmbeddings(
    model_name='BAAI/bge-small-en-v1.5',
    model_kwargs={'device': device}
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

#### Upsert documents

In [10]:
paths

[PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-11042025-15.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-21032025-21.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-21032025-26.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-07032025-16.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-07032025-17.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-26022025-18.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-2.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-11.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-12.md'),
 PosixPath('/workspace/data/documents/BOCYL/BOCYL-D-10022025-13.md')]

In [11]:
for path in paths:
    # Read document content
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    
    # Extract document ID and filename
    doc_id = path.stem
    
    # Check if document already exists (using the first chunk's ID as indicator)
    first_chunk_id = f"{doc_id}_0"
    existing = collection.get(ids=[first_chunk_id])
    
    if len(existing['ids']) > 0:
        print(f"Document {doc_id} already exists. Deleting existing chunks...")
        # Get all chunks for this document
        doc_chunks = collection.get(where={"doc_id": doc_id})
        if doc_chunks and len(doc_chunks['ids']) > 0:
            # Delete all existing chunks for this document
            collection.delete(ids=doc_chunks['ids'])
    
    # Extract metadata from filename
    filename_metadata = extract_metadata_from_filename(doc_id)
    
    # Extract metadata from content
    content_metadata = extract_metadata_from_content(content)
    
    # Combine metadata
    document_metadata = {**filename_metadata, **content_metadata}
    
    # Add doc_id to metadata
    document_metadata["doc_id"] = doc_id
    
    # Split document into chunks (simple approach)
    chunk_size = 1000
    chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
    
    # Process each chunk
    ids = []
    metadatas = []
    documents = []
    
    for i, chunk in enumerate(chunks):
        chunk_id = f"{doc_id}_{i}"
        ids.append(chunk_id)
        
        # Add chunk-specific metadata
        chunk_metadata = {
            **document_metadata,
            "chunk_id": i,
            "chunk_total": len(chunks)
        }
        metadatas.append(chunk_metadata)
        documents.append(chunk)
    
    # Instead of encoding each chunk individually in a loop
    embeddings = model.embed_documents(chunks)
    
    # Add to collection
    collection.add(
        ids=ids,
        embeddings=embeddings,
        metadatas=metadatas,
        documents=chunks
    )
    
    print(f"Added {doc_id} with {len(chunks)} chunks and metadata: {document_metadata}")

Added BOCYL-D-11042025-15 with 3 chunks and metadata: {'doc_id': 'BOCYL-D-11042025-15', 'date': '2025-04-11', 'doc_number': 15, 'source': 'BOCYL', 'type': 'official_document', 'area': 'AGRARIA', 'consejeria': 'AGRICULTURA Y PESCA', 'title': 'EXTRACTO de la Orden de 8 de abril de 2025, de la Consejería de Agricultura, Ganadería y Desarrollo Rural, por la que se convocan las ayudas a la cosecha en verde de viñedos en la Comunidad de Castilla y León para la vendimia 2025.'}
Added BOCYL-D-21032025-21 with 6 chunks and metadata: {'doc_id': 'BOCYL-D-21032025-21', 'date': '2025-03-21', 'doc_number': 21, 'source': 'BOCYL', 'type': 'official_document', 'area': 'AGRARIA', 'consejeria': 'AGRICULTURA Y PESCA', 'title': 'ORDEN AGR/263/2025, de 11 de marzo, por la que se resuelve la aprobación del plan de obras de la infraestructura rural de la zona regable del Canal de San José-Sector I (Zamora).'}
Added BOCYL-D-21032025-26 with 9 chunks and metadata: {'doc_id': 'BOCYL-D-21032025-26', 'date': '2025

### Count documents inserted

In [12]:
# Count items in collection
count = collection.count()
print(f"Total number of chunks in collection: {count}")

# Get unique document IDs
results = collection.get(
    where={"chunk_id": 0}  # Get only first chunk of each document
)
print(f"Number of documents: {len(results['ids'])}")

Total number of chunks in collection: 122
Number of documents: 10


## Query the vector db

### Define query

In [13]:
query = "Dame un resumen de la normativa agraria reciente publicada en BOCYL"

### Encode query with embedding model

In [14]:
query_embedding = model.embed_query(query)
query_embedding

[-0.046658698469400406,
 0.034835949540138245,
 0.025681229308247566,
 -0.01039701048284769,
 -0.03776011988520622,
 0.006033893674612045,
 0.052861299365758896,
 0.06641411781311035,
 0.02814403921365738,
 -0.06875647604465485,
 -0.051330920308828354,
 -0.05179449915885925,
 -0.01772548444569111,
 -0.0028604811523109674,
 0.025533495470881462,
 -0.004578458610922098,
 0.07741551101207733,
 0.03160997852683067,
 -0.06160251796245575,
 0.013882341794669628,
 0.04177721217274666,
 -0.033684927970170975,
 0.008339233696460724,
 0.029327671974897385,
 -0.01139218918979168,
 -0.01367481891065836,
 -0.04429325833916664,
 0.020326945930719376,
 -0.025521716102957726,
 -0.15243107080459595,
 0.018130943179130554,
 -0.006615903694182634,
 -0.01787617802619934,
 -0.01325325295329094,
 0.014333341270685196,
 -0.022217022255063057,
 -0.01836596429347992,
 -0.007624195888638496,
 -0.01365339383482933,
 0.026709960773587227,
 -0.0020161056891083717,
 -0.024781016632914543,
 0.03289389610290527,
 -0.

### Run the query

In [15]:
# Run the query - include_distances=True is key for getting scores
search_results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3,
)

# Display results with scores
for i, (id, document, metadata, distance) in enumerate(zip(
        search_results["ids"][0],
        search_results["documents"][0],
        search_results["metadatas"][0],
        search_results["distances"][0]  # This contains the similarity scores
    )):
    # Convert distance to similarity score (closer to 1 is better)
    # ChromaDB typically returns distances, where smaller is better
    # For cosine similarity, you can convert: similarity = 1 - distance
    similarity_score = 1 - distance  # For cosine distance
    
    print(f"Result {i+1}:")
    print(f"  Document: {metadata['doc_id']}")
    print(f"  Chunk: {metadata['chunk_id']}")
    print(f"  Similarity Score: {similarity_score:.4f}")
    print(f"  Text: {document}...")
    print()

Result 1:
  Document: BOCYL-D-10022025-2
  Chunk: 11
  Similarity Score: 0.4825
  Text: ades y respecto de la solicitud de expedición del certificado de competencia, se apuesta por su tramitación electrónica a través de entidades de formación.

Igualmente se ajusta al principio de seguridad jurídica puesto que este proyecto se adecúa a la normativa europea, estatal y autonómica en esta materia. Con relación al principio de coherencia, la orden es compatible con el resto de las actuaciones y objetivos de las políticas públicas en materia de formación en materia de bienestar animal y bioseguridad, higiene y sanidad animal de los animales de producción.

Asimismo, se cumple con el principio de eficiencia puesto que la norma impone las obligaciones indispensables a los destinatarios, con las mínimas cargas posibles, y en todo caso las obligaciones que se establecen para las entidades y el alumnado resultan justificadas de acuerdo con las funciones perseguidas. Con respecto al gasto público

## Refactor the code

In [16]:
from modules.vectordb.bocyl import BOCYLVectorDB
vectordb = BOCYLVectorDB()

ValueError: An instance of Chroma already exists for /workspace/data/vectordb/chromadb with different settings

## Multiple queries

In [None]:
result1 = vectordb.query(
    "¿Qué medidas de condicionalidad reforzada se mencionan en la normativa agraria?",
    n_results=3
)

TypeError: VectorStore.similarity_search_with_relevance_scores() missing 1 required positional argument: 'query'

In [None]:
result1

{'query': '¿Qué medidas de condicionalidad reforzada se mencionan en la normativa agraria?',
 'results': [{'document_id': 'BOCYL-D-02012024-12',
   'chunk_id': 4,
   'metadata': {'title': 'ORDEN AGR/1488/2023, de 22 de diciembre, por la que se determinan en Castilla y León las obligaciones de la condicionalidad reforzada y el procedimiento para las penalizaciones por incumplimiento de la condicionalidad social, que han de cumplir las personas beneficiarias de ayudas de la Política Agraria Común que perciban pagos directos y determinados pagos anuales por superficies y animales para el desarrollo rural, en el marco del Plan Estratégico Nacional de la PAC 2023-2027 de España.',
    'date': '2024-01-02',
    'type': 'official_document',
    'chunk_id': 4,
    'chunk_total': 104,
    'source': 'BOCYL',
    'doc_id': 'BOCYL-D-02012024-12',
    'doc_number': 12},
   'text': 'as competentes por razón de las materias relacionadas con la condicionalidad reforzada, que dispongan de datos e infor

In [None]:
result3 = querier.query(
    "Requisitos para beneficiarios de fondos FEAGA",
    n_results=5
)

Result 1:
  Document: BOCYL-D-18072023-10
  Chunk: 119
  Similarity Score: 0.5955
  Text:  cuenta los plazos fijados respectivamente para el mantenimiento del empleo y para el mantenimiento de la inversión. Esta responsabilidad deberá ser a...

Result 2:
  Document: BOCYL-D-18072023-10
  Chunk: 33
  Similarity Score: 0.5909
  Text: iterios de sostenibilidad establecidos en el artículo 29 de la Directiva (UE) 2018/2001 del Parlamento Europeo y del Consejo.

Base cuarta.  Entidades...

Result 3:
  Document: BOCYL-D-18072023-10
  Chunk: 75
  Similarity Score: 0.5840
  Text: haga constar la fecha y el órgano o dependencia en que fueron presentados o, en su caso, emitidos, y no hayan transcurrido más de cinco años desde la ...

Result 4:
  Document: BOCYL-D-02012024-12
  Chunk: 2
  Similarity Score: 0.5748
  Text:  en relación con la ayuda a los planes estratégicos que deben elaborar los Estados miembros en el marco de la política agrícola común (PAC) financiada...

Result 5:
  Document: BO

## Refactor vectordb instance

In [None]:
from modules.vectordb import get_vectordb_bocyl