In [1]:
from langchain import hub
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

In [2]:
local_llm = "llama3.2:latest"
url_llm = "http://localhost:11434"

from langchain_community.llms import Ollama
llm = Ollama(model=local_llm, base_url=url_llm, temperature=0) 

  llm = Ollama(model=local_llm, base_url=url_llm, temperature=0)


In [3]:
template = """You are an assistant for question-answering tasks and an expert in research projects funded by the European Union under the Horizon 2020 programme.
Use the following context from Horizon 2020 projects to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:

"""

In [4]:
query_1 = "What is the objective of the project with grant agreement 740934?"
query_2 = "What is the total cost of the project with the acronym HYPERGRYD?"
query_3 = "How much funding was allocated for the project titled Transforming Research through Innovative Practices for Linked interdisciplinary Exploration?"
query_4 = "Which organisation played the role of coordinator in the grant agreement 777998?"
query_5 = "What topic does the project with the acronym INTERRFACE belong to?"
query_6 = "What legal basis was the project titled European Joint Programme on Radioactive Waste Management framed within?"
query_7 = "What type of proposal was the grant agreement 814416?"
query_8 = "To which master call was the project with the acronym G9NIGHT submitted?"
query_9 = "To which sub call was the project titled Electron Nanocrystallography submitted?"
query_10 = "Provide the grant agreement of 1 project which objective is related to artificial intelligence."
query_11 = "Provide the acronym of 1 project which objective is related to robotics."
query_12 = "Provide the title of 1 project which objective is related to geolocation."
query_13 = "Provide the objective of 1 project related to digital twin."
query_14 = "Provide the objective of 3 different projects related to corrosion."
query_15 = "Provide the title of 3 different projects which objective is related to offshore structures."
query_16 = "Provide the acronym of 3 different projects which objective is related to materials engineering."
query_17 = "Provide the grant agreement of 3 different projects which objective is related to nanocomposites."
query_18 = "Provide the name of an organisation that has participated in projects which objective is related to artificial intelligence."
query_19 = "Provide the name of an organisation which activity type is PRC and that has participated in projects which objective is related to robotics."
query_20 = "Provide the PIC of an organisation that is a small or medium enterprise and has participated in projects which objective is related to geolocation."
query_21 = "Provide the name of an organisation that has played the role of coordinator in projects which objective is related to digital twin."
query_22 = "Provide the PIC of a Spanish organisation that has participated in projects which objective is related to corrosion."
query_23 = "Provide the name of an european organisation that has participated in projects which objective is related to offshore structures."
query_24 = "Provide the PIC of an european small or medium enterprise that has participated in projects which objective is related to materials engineering."
query_25 = "Provide the name of an european small or medium enterprise that has played the role of coordinator in projects which objective is related to nanocomposites."

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"} # Al no tener NVIDIA es necesario cambiarlo model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {"normalize_embeddings": True}

emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)


In [6]:
from langchain.vectorstores import FAISS

vectorstore_7_txt = FAISS.load_local("faiss_index_proyectos_7_txt", embeddings=emb, 
                                 allow_dangerous_deserialization=True)

In [9]:
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from typing import List

retriever_multi_7_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_7_txt.as_retriever(),
    llm=llm,
    prompt=prompt_5best
)

In [8]:
prompt_5best = PromptTemplate.from_template("""
You are an expert at rewriting queries. Given the original query, 
generate 10 variations that would return the most relevant documents.

Original: {question}

Your 5 best query variations:
1.""")

filter_prompt = PromptTemplate.from_template("""
Pregunta:
{question}

Contenido del documento:
{document}

¿Este documento responde (al menos parcialmente) a la pregunta?
Responde solo "Sí" o "No".
""")
filter_chain = LLMChain(llm=llm, prompt=filter_prompt)

  filter_chain = LLMChain(llm=llm, prompt=filter_prompt)


## RunnableLambda

In [31]:
def filtrar_docs(docs: List):
    docs_filtrados = []
    for doc in docs:
        respuesta = filter_chain.run({"question": current_query, "document": doc.page_content})
        if respuesta.strip().lower().startswith(("sí", "yes", "Sí", "Yes")):
            docs_filtrados.append(doc)
    return docs_filtrados

RunnableLambda se usa con el fin de que una función de Python pueda encajar dentro de un pipeline de LangChain que espera.

RunnablePassthrough se usa para pasar el valor de entrada al realizar el invoke.

In [36]:
format_docs = RunnableLambda(lambda docs: "\n\n".join(doc.page_content for doc in docs))

rag_chain_multi_7_txt = (
    {
        "context": retriever_multi_7_txt 
                    | filtrado_llm 
                    | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [38]:
import time

resultados = {}

for i in range(1,2):  
    inicio = time.time()
    
    current_query = eval(f"query_{i}")
    answer = rag_chain_multi_7_txt.invoke(current_query)
    
    fin = time.time()
    tiempo_ejecucion = fin - inicio
    
    resultados[f"query_{i}"] = answer
    
    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s \n")
    print(f"La respuesta es: {answer} \n")
    print("\n\n---\n\n")

Query 1: Tiempo = 738.3345 s 

La respuesta es: I don't know the answer to your question about the grant agreement 740934. 



---




## Filtro de manera más manual

In [9]:
filtered_docs = []

for doc in docs:
    response = filter_chain.run({
        "question": query_1,
        "document": doc.page_content
    })
    print(response)
    if response.strip().lower().startswith("sí") or response.strip().lower().startswith("Sí") or response.strip().lower().startswith("Yes"):  
        filtered_docs.append(doc)


NameError: name 'docs' is not defined

In [28]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_multi_7_txt = (
    {"context": retriever_multi_7_txt | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [29]:
r

I don't know the specific grant agreement number you are referring to. However, I can tell you that CLASP (Cultural Landscape of Anglo-Saxon Poetry) has a grant agreement number 740934 and its objective is to create an online and interactive consolidated library of Anglo-Saxon poetry. The project aims to produce a comprehensive digital tool for the study of Anglo-Saxon verse, marking up over 60,000 lines of poetry in TEI P5 XML.


## FILTRO MANUAL EN EL CODIGO

In [10]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(template)

In [15]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    # Recuperar documentos
    #raw_docs = retriever_multi_7_txt.invoke(current_query)
    
    def format_docs(docs):
        filtered_docs = []
        for doc in docs #raw_docs:
            response = filter_chain.run({
                "question": current_query,
                "document": doc.page_content
            })
            if response.strip().lower().startswith(("sí", "yes", "Sí")):
                filtered_docs.append(doc)

        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_7_txt = (
        {"context": retriever_multi_7_txt | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_7_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: {current_query} \n Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")

Query 1: Tiempo = 658.6796 s
La respuesta es: I don't know the objective of the project with grant agreement 740934.

The objective of the SPOTVIEW project is to develop and demonstrate innovative, sustainable and efficient processes and technology components, in order to optimize the use of natural resources, especially water, in three industrial sectors (Dairy, Pulp and Paper and Steel).

I don't know the objective of the MiLC project.

---


Query 2: Tiempo = 635.0576 s
La respuesta es: I don't know the specific details about the HYPERGRYD project, including its total cost. However, I can suggest checking the official Horizon 2020 website or contacting the European Commission for more information on this project. If available, the budget and costs of the project might be listed in the project's deliverables or reports.

---


Query 3: Tiempo = 783.2616 s
La respuesta es: I couldn't find any information on a Horizon 2020 project with the title "Transforming Research through Innovativ

Query 19: Tiempo = 744.7253 s
La respuesta es: I'm not aware of a specific organisation with the exact combination of characteristics you've described, including being under Horizon 2020 and having a project related to robotics. However, I can suggest some EU-funded organisations that have participated in robotics-related projects under Horizon 2020, such as the European Robotics Association (ERA) or the Robotics for All initiative. If you provide more information, I may be able to help further.

---


Query 20: Tiempo = 579.8024 s
La respuesta es: I don't have enough information to provide a specific PIC (Personal Identification Code) for an organization. However, I can suggest some EU-funded Horizon 2020 projects related to geolocation, such as the "GeoSolutions" project or the "Geospatial Intelligence" project. These projects involved SMEs in various European countries and might have provided information on their participation.

---


Query 21: Tiempo = 893.5346 s
La respuesta es: I

## NOTAS

In [13]:
filter_prompt = PromptTemplate.from_template("""
Pregunta:
{question}

Contenido del documento:
{document}

Del 0 al 10, ¿qué tan bien responde este documento a la pregunta?
Responde solo con un número entero del 0 al 10.
""")
filter_chain = LLMChain(llm=llm, prompt=filter_prompt)

  filter_chain = LLMChain(llm=llm, prompt=filter_prompt)


In [14]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(template)

In [16]:
filter_chain = LLMChain(llm=llm, prompt=filter_prompt)
import time
resultados={}
for i in range(1, 3):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    # Recuperar documentos
    raw_docs = retriever_multi_7_txt.invoke(current_query)
    
    def format_docs(docs):
        filtered_docs = []
        for doc in docs:
            response = filter_chain.run({
                "question": current_query,
                "document": doc.page_content
            })
            print(f"Filtro LLM: {response}")
            try:
                score = int(response.strip())
                if score > 7:
                    filtered_docs.append(doc)
            except ValueError:
                response = filter_chain.run({
                "question": current_query,
                "document": doc.page_content
                })
                print(f"Filtro LLM: {response}")
                try:
                    score = int(response.strip())
                    if score > 7:
                        filtered_docs.append(doc)

                except ValueError:
                    continue
        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_7_txt = (
        {"context": retriever_multi_7_txt | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_7_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")

  response = filter_chain.run({


Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Query 1: Tiempo = 596.8999 s
La respuesta es: I don't have information on a specific Horizon 2020 project with grant agreement 740934. I can try to search for more context or details about the project, but without further information, I couldn't find any relevant data. If you provide more context or details about the project, I'll do my best to help.

---


Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM: 7
Filtro LLM

## EXP 8

In [None]:
from langchain.vectorstores import FAISS

vectorstore_8_txt = FAISS.load_local("faiss_index_proyectos_8_txt", embeddings=emb, 
                                 allow_dangerous_deserialization=True)

In [None]:
retriever_multi_8_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_8_txt.as_retriever(),
    llm=llm,
    prompt=prompt_5best
)

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(template)

In [None]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    # Recuperar documentos
    raw_docs = retriever_multi_8_txt.invoke(current_query)
    
    def format_docs(docs):
        filtered_docs = []
        for doc in raw_docs:
            response = filter_chain.run({
                "question": current_query,
                "document": doc.page_content
            })
            if response.strip().lower().startswith(("sí", "yes", "Sí")):
                filtered_docs.append(doc)

        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_8_txt = (
        {"context": retriever_multi_8_txt | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_8_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")

## EXP 9

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name_9 = "intfloat/e5-small-v2"
model_kwargs_9 = {"device": "cpu"}
encode_kwargs_9={"normalize_embeddings": True, "batch_size": 32}

emb_9 = HuggingFaceBgeEmbeddings(model_name=model_name_9,model_kwargs=model_kwargs_9,encode_kwargs=encode_kwargs_9)

In [None]:
from langchain.vectorstores import FAISS

vectorstore_9_txt = FAISS.load_local("faiss_index_proyectos_9_txt", embeddings=emb_9, 
                                 allow_dangerous_deserialization=True)

In [None]:
retriever_multi_9_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_9_txt.as_retriever(),
    llm=llm,
    prompt=prompt_5best
)

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(template)

In [None]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    # Recuperar documentos
    raw_docs = retriever_multi_8_txt.invoke(current_query)
    
    def format_docs(docs):
        filtered_docs = []
        for doc in raw_docs:
            response = filter_chain.run({
                "question": current_query,
                "document": doc.page_content
            })
            if response.strip().lower().startswith(("sí", "yes", "Sí")):
                filtered_docs.append(doc)

        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_8_txt = (
        {"context": retriever_multi_8_txt | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_8_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")