In [1]:
from google.colab import files
uploaded = files.upload()

In [2]:
import os
os.makedirs("docs", exist_ok=True)

In [3]:
os.makedirs("output", exist_ok=True)

In [3]:

!pip uninstall -y torch torchvision torchaudio transformers bitsandbytes accelerate langchain langchain_community chromadb sentence-transformers

Found existing installation: torch 2.9.0+cu128
Uninstalling torch-2.9.0+cu128:
  Successfully uninstalled torch-2.9.0+cu128
Found existing installation: torchvision 0.24.0+cu128
Uninstalling torchvision-0.24.0+cu128:
  Successfully uninstalled torchvision-0.24.0+cu128
Found existing installation: torchaudio 2.9.0+cu128
Uninstalling torchaudio-2.9.0+cu128:
  Successfully uninstalled torchaudio-2.9.0+cu128
Found existing installation: transformers 4.57.6
Uninstalling transformers-4.57.6:
  Successfully uninstalled transformers-4.57.6
Found existing installation: bitsandbytes 0.39.0
Uninstalling bitsandbytes-0.39.0:
  Successfully uninstalled bitsandbytes-0.39.0
Found existing installation: accelerate 0.21.0
Uninstalling accelerate-0.21.0:
  Successfully uninstalled accelerate-0.21.0
Found existing installation: langchain 1.2.10
Uninstalling langchain-1.2.10:
  Successfully uninstalled langchain-1.2.10
Found existing installation: langchain-community 0.4.1
Uninstalling langchain-community

In [1]:
!pip install transformers accelerate bitsandbytes torch langchain langchain_community chromadb sentence-transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Intentamos 4-bit (menos memoria)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config
    )
    print("Modelo cargado en 4-bit ✅")
except:
    # Si falla por memoria, usar FP16
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    print("Modelo cargado en FP16 ✅")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

Modelo cargado en 4-bit ✅


In [3]:
def extract_structured_info(query, chunks):

    context = "\n\n".join([c['content'] for c in chunks])

    prompt = f"""
    Eres un sistema de extracción de información.

    Extrae del siguiente texto la información relacionada con:
    "{query}"

    Devuelve EXCLUSIVAMENTE un JSON válido.
    Si no encuentras la información, usa null.
    Tu respuesta debe empezar con {{ y terminar con }}.

    Texto:
    {context}
    """

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        temperature=0
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extraer solo el JSON
    import re
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match:
        return match.group()
    else:
        return "{}"

In [5]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.7.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.7.1-py3-none-any.whl (331 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.0/331.0 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.7.1


In [6]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
import chromadb

import json
import os

docs_path = "docs/"
output_path = "output/"

loader = PyPDFDirectoryLoader(docs_path)
docs = loader.load() # returns a list of `document` objects, which refer to the pages of the docs

# -- SPLITTING --
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, chunk_overlap = 200, add_start_index = True
)

chunks = splitter.split_documents(docs) # list of documents (just smaller size -> chunks)
chunks = [c for c in chunks if "Código seguro de Verificación" not in c.page_content]

# -- VECTOR STORE --
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
client = chromadb.EphemeralClient()
try:
    client.delete_collection('langchain') # fresh restart
except Exception:
    pass
vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings, client=client)

# -- OBTAINING INFO AND SAVING TO JSON --
queries = [
    "beneficiarios y destinatarios de las ayudas",
    "requisitos y condiciones para solicitar la ayuda",
    "cuantía e importe de las ayudas",
    "plazo y periodo de solicitud",
    "documentación necesaria para la solicitud",
    "criterios de selección y baremación",
    "modalidades y tipos de ayuda",
    "obligaciones de los beneficiarios",
    "incompatibilidades con otras ayudas",
    "procedimiento de resolución y concesión",
    "forma de pago y abono de las ayudas",
    "causas de denegación o revocación",
]

sources = os.listdir(docs_path)

output = {}
for query in queries:
    output[query] = {}
    for source in sources:
        results = vector_store.similarity_search(
            query=query,
            k=3,
            filter={'source': f'docs/{source}'}
        )
        chunks_for_llm = [
            {
                'content': doc.page_content,
                'page': doc.metadata['page']
            } for doc in results
        ]

        structured_json = extract_structured_info(query, chunks_for_llm)

        output[query][source] = structured_json


with open(output_path + 'info.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

  embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for ope