In [1]:
import time
import inspect
from functools import wraps

def timer(func):
    """Decorator that logs the arguments and duration of every call to ``func``.

    Each call to the wrapped function appends one line to
    ``function_executions.txt`` (resolved against the current working
    directory), for example::

        Function 'add' with arguments a=1, b=2, c=3 took 0.00000123 seconds to execute.

    Args:
        func: The callable to wrap. The names of its positional parameters
            are read with ``inspect.getfullargspec`` and paired with the
            positional values in order; positional values beyond the named
            parameters (``*args``) are not logged.

    Returns:
        A wrapper (metadata preserved via ``functools.wraps``) that returns
        whatever ``func`` returns. If ``func`` raises, the exception
        propagates and no line is written.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()

        # Call the original function
        retval = func(*args, **kwargs)

        # Calculate how long the function took
        time_taken = time.perf_counter() - start_time

        # Pair positional values with their parameter names, then add the
        # keyword arguments in the order they were passed.
        arg_names = inspect.getfullargspec(func).args
        described = [f"{name}={value}" for name, value in zip(arg_names, args)]
        described.extend(f"{name}={value}" for name, value in kwargs.items())

        # Build the whole line up front and join with ", " instead of writing
        # a trailing separator and seeking back: in append mode every write
        # goes to the end of the file, so a seek cannot erase earlier output.
        with open("function_executions.txt", "a") as f:
            f.write(
                f"Function '{func.__name__}' with arguments "
                f"{', '.join(described)}"
                f" took {time_taken:.8f} seconds to execute.\n"
            )

        # Return the original function's return value
        return retval
    return wrapper


In [13]:
# Configure the shared llama_index `Settings` object: text splitter,
# embedding model and LLM.
from llama_index.core import Settings

# Customize the text splitter
from llama_index.core.node_parser import SentenceSplitter

# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# tokens — confirm against the SentenceSplitter documentation.
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)
Settings.text_splitter = text_splitter

# Customize the embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# Customize the LLM: the "phi" model through the Ollama client.
# NOTE(review): request_timeout is presumably in seconds — confirm.
from llama_index.llms.ollama import Ollama

Settings.llm = Ollama(model="phi", request_timeout=200.0)

In [3]:
from llama_index.core import SimpleDirectoryReader

# Load everything under ./xml. Despite the singular name, `document` is a
# list of Document objects (see the printed output).
document = SimpleDirectoryReader("./xml").load_data()
# NOTE(review): printing the whole list dumps every document's text and
# metadata, including absolute local file paths; consider showing only a
# summary (e.g. the count and the first item) before sharing the notebook.
print(document)

[Document(id_='5d86e028-9500-4439-9660-beebbb3b1387', embedding=None, metadata={'file_path': '/Users/danielmoreno/NLP/boe/xml/sumario-20230101.xml', 'file_name': 'sumario-20230101.xml', 'file_type': 'application/xml', 'file_size': 105, 'creation_date': '2024-03-22', 'last_modified_date': '2024-03-22'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='<?xml version="1.0"?>\n<error><descripcion>No se encontr&#xF3; el sumario original.</descripcion></error>\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='d19059d1-284e-4134-8a4b-c157d459dc32', embedding=None, metadata={'file_path': '/Users/danielmoreno/NLP/boe/xml/sumario-20230103.xml', 'file_n

In [4]:
from llama_index.core import VectorStoreIndex
@timer
def create_index_from_documents(documents):
    """Build and return a ``VectorStoreIndex`` from ``documents``.

    Thin wrapper around ``VectorStoreIndex.from_documents``; it exists so the
    ``timer`` decorator can log how long index construction takes.
    """
    index = VectorStoreIndex.from_documents(documents)
    return index


In [5]:
# Build the vector index over the loaded documents. The helper is wrapped by
# `timer`, so this call's duration is appended to function_executions.txt.
index2 = create_index_from_documents(document)

In [8]:
# Save the index: persist its storage context to "xml_little"
# (presumably a directory relative to the working directory — confirm).
index2.storage_context.persist("xml_little")

In [15]:
# Create a query engine over the index, explicitly passing the LLM that was
# configured on Settings.
query_engine = index2.as_query_engine(llm=Settings.llm)

In [11]:
# Load the Spanish-language description of the BOE XML tags that is later
# prepended to the query. The text contains non-ASCII characters, so decode
# it explicitly as UTF-8 instead of relying on the platform default encoding.
with open("entender_xml.txt", "r", encoding="utf-8") as f:
    explain = f.read()
print(explain)

Voy a explicarte cada etiqueta, tambien llamada label en ingles, de los archivos xml del BOE.

0 <sumario> etiqueta general.
1 <meta> metadatos
1.1 <pub> publicacion
1.2 <anno> año de publicacion del BOE
1.3 <fecha> fecha exacta de publicacion del BOE, formato dd/mm/aaaa
1.4 <fechaInv> fecha exacta de publicacion del BOE, formato aaaa/mm/dd
1.5 <fechaAnt> fecha exacta de publicacion del BOE anterior, formato dd/mm/aaaa
1.6 <fechaAntAnt> fecha exacta de publicacion del BOE anterior al anterior, formato dd/mm/aaaa
1.7 <fechaSig> fecha exacta de publicacion del BOE siguiente, formato dd/mm/aaaa
1.7 <fechaPub> fecha exacta de publicacion del BOE, formato dia dd de mes de aaaa
1.8 <pubDate> timestamp

2 <diario nbo="numero del boletin"> información de las disposiciones que conforman cada uno de los boletines publicados en una determinada fecha
2.1 <sumario_nbo id="identificador unico del BOE">  Información referente al documento pdf que contiene el sumario correspondiente al boletín.
2.1.1 

In [16]:
# Question (Spanish): "What is the URL for the BOE PDF?"
prompt = "Cuál es la url para el pdf del BOE?"
# Separate the tag explanation from the question with a blank line so the
# question is not glued onto the last line of the explanation text.
query = f"{explain}\n\n{prompt}"
response = query_engine.query(query)
# Llama 2 took more than 200 seconds on this query (the Ollama client is
# configured with request_timeout=200.0).

In [17]:
# Display the answer text of the query above.
response.response

' Based on the context information provided, there is no prior knowledge available to answer this question accurately. However, using a search engine or web scraping tools, it may be possible to find the URL for the BOE PDF document mentioned in the query.\n'

In [36]:
# Second attempt: an explicit English instruction that names the <urlPdf> tag
# and the URL root. The tag explanation (`explain`) is not part of this query.
prompt = "is <urlPdf> in the text? If yes find what is inside"
# The triple-quoted f-string starts and ends with a newline; {prompt} is
# interpolated under "Now the task:".
query = f"""
You must find in a xml file the url of a pdf. The root of the URL is https://boe.es.
The xml has a tag <urlPdf> in which the rest of the url is contained.
Now the task:
{prompt}

YOU MUST ONLY ANSWER WITH THE URL, NO OTHER THINGS
"""
response = query_engine.query(query)

In [37]:
# Display the answer text of the <urlPdf> query above.
response.response

' Yes, you can find the answer by searching for "<urlPdf>" and extracting the value of the szBytes attribute. In this case, it would be "161173". Therefore, the URL is https://boe.es/dias/2023/01/03/pdfs/.\n'