In [1]:
from langchain import hub
from langchain.schema import StrOutputParser, Document
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_ollama import OllamaLLM
from langchain.embeddings import HuggingFaceBgeEmbeddings
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import time

In [2]:
phi= "phi3:mini"
llama = "llama3.2:latest"
url_llm = "http://localhost:11434"

In [3]:
phi_llm = OllamaLLM(model=phi, base_url=url_llm, temperature=0)
llama_llm = OllamaLLM(model=llama, base_url=url_llm, temperature=0)

In [4]:
template = """
You are an assistant for question-answering tasks and an expert in research projects funded by the European Union under the Horizon 2020 programme.
Use the following context from Horizon 2020 projects to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question}

Context: {context}

Answer:
"""


filter_prompt = PromptTemplate.from_template("""
You are an assistant that determines whether a document provides relevant information to answer a question.

If the document contains any information that helps partially or fully answer the question, answer "Yes". 
If it is irrelevant, answer "No".

ONLY answer with "Yes" or "No" — no other words or punctuation.

Question: {question}
Document: {document}
Answer:
""")

prompt = ChatPromptTemplate.from_template(template)
filter_chain = LLMChain(llm=llama_llm, prompt=filter_prompt)

  filter_chain = LLMChain(llm=llama_llm, prompt=filter_prompt)


In [5]:
query_1 = "What is the objective of the project with grant agreement 740934?"
query_2 = "What is the total cost of the project with the acronym HYPERGRYD?"
query_3 = "How much funding was allocated for the project titled Transforming Research through Innovative Practices for Linked interdisciplinary Exploration?"
query_4 = "Which organisation played the role of coordinator in the grant agreement 777998?"
query_5 = "What topic does the project with the acronym INTERRFACE belong to?"
query_6 = "What legal basis was the project titled European Joint Programme on Radioactive Waste Management framed within?"
query_7 = "What type of proposal was the grant agreement 814416?"
query_8 = "To which master call was the project with the acronym G9NIGHT submitted?"
query_9 = "To which sub call was the project titled Electron Nanocrystallography submitted?"
query_10 = "Provide the grant agreement of 1 project which objective is related to artificial intelligence."
query_11 = "Provide the acronym of 1 project which objective is related to robotics."
query_12 = "Provide the title of 1 project which objective is related to geolocation."
query_13 = "Provide the objective of 1 project related to digital twin."
query_14 = "Provide the objective of 3 different projects related to corrosion."
query_15 = "Provide the title of 3 different projects which objective is related to offshore structures."
query_16 = "Provide the acronym of 3 different projects which objective is related to materials engineering."
query_17 = "Provide the grant agreement of 3 different projects which objective is related to nanocomposites."
query_18 = "Provide the name of an organisation that has participated in projects which objective is related to artificial intelligence."
query_19 = "Provide the name of an organisation which activity type is PRC and that has participated in projects which objective is related to robotics."
query_20 = "Provide the PIC of an organisation that is a small or medium enterprise and has participated in projects which objective is related to geolocation."
query_21 = "Provide the name of an organisation that has played the role of coordinator in projects which objective is related to digital twin."
query_22 = "Provide the PIC of a Spanish organisation that has participated in projects which objective is related to corrosion."
query_23 = "Provide the name of an european organisation that has participated in projects which objective is related to offshore structures."
query_24 = "Provide the PIC of an european small or medium enterprise that has participated in projects which objective is related to materials engineering."
query_25 = "Provide the name of an european small or medium enterprise that has played the role of coordinator in projects which objective is related to nanocomposites."

## MODELO sentence-transformers/paraphrase-MiniLM-L6-v2

In [6]:
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)


## EXPERIMENTO 7

In [7]:
from langchain.vectorstores import FAISS

vectorstore_7_txt = FAISS.load_local("faiss_index_proyectos_7_txt", embeddings=emb, 
                                 allow_dangerous_deserialization=True)

In [8]:
retriever_multi_7_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_7_txt.as_retriever(search_kwargs={"k": 3}),
    llm=phi_llm,
    include_original=True
)

In [10]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    def format_docs(docs):
        filtered_docs = []
        for doc in docs:
            response = filter_chain.invoke({
                "question": current_query,
                "document": doc.page_content
            })
            respuesta_texto = response["text"]
            print(f"\n--------\n {respuesta_texto} \n--------\n")
            if respuesta_texto.startswith("Sí") or respuesta_texto.startswith("Yes") or respuesta_texto.startswith("sí") or respuesta_texto.startswith("yes"):
                filtered_docs.append(doc)
        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_7_txt = (
        {
            "context": retriever_multi_7_txt | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | phi_llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_7_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")


--------
 No 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 No 
--------


--------
 No 
--------


--------
 Yes 
--------

Query 1: Tiempo = 281.2076 s
La respuesta es: The objective of grant agreement number 740934 under Horizon 2020 was not provided in your context. Therefore, I cannot provide an answer to this question based on the given information. If you have access to more details about project 740934 or would like assistance with a different query related to Horizon 2020 projects, please let me know!

---



--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 Yes 
--------

Query 2: Tiempo = 259.8443 s
La respuesta es: The total funded co


--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 No. 
--------


--------
 Yes. 
--------


--------
 Yes 
--------


--------
 No 
--------

Query 14: Tiempo = 202.6529 s
La respuesta es: 1. The ADDPRIME project aims to create an innovative solution that significantly increases metal's resistance to corrosion by using encapsulated inhibitors released through specific triggers, potentially reducing the annual costs of dealing with corrosion-related issues in Europe alone.

2. U-CROSS seeks to develop and validate ultrasonic sensors for real-time detection of early localized corrosion stages as well as monitoring damage progression when paint degrades, which could improve the aircraft industry's ability to detect both initiation and propagation of corrosion.

3. The objective of gran agreement 864905 is not explicitly stated 


--------
 Yes 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 No 
--------


--------
 Yes 
--------


--------
 Yes 
--------


--------
 No 
--------

Query 25: Tiempo = 173.1460 s
La respuesta es: The name of an European small or medium enterprise that has played the role of coordinator in projects related to nanocomposites under Horizon 2020 is ZENIT. They have been involved with CO-PILOT and other initiatives aimed at enhancing innovation management capacities for SMEs, particularly focusing on nanotechnology applications like nanocomposites.

---




### 3,3 minutos de media ha tardado cada consulta

## EXPERIMENTO 8

In [11]:
vectorstore_8_txt = FAISS.load_local("faiss_index_proyectos_8_txt", embeddings=emb, 
                                 allow_dangerous_deserialization=True)

In [12]:
retriever_multi_8_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_8_txt.as_retriever(search_kwargs={"k": 3}),
    llm=phi_llm,
    include_original=True
)

In [13]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    def format_docs(docs):
        filtered_docs = []
        for doc in docs:
            response = filter_chain.invoke({
                "question": current_query,
                "document": doc.page_content
            })
            respuesta_texto = response["text"]
            print(f"\n--------\n {respuesta_texto} \n--------\n")
            if respuesta_texto.startswith("Sí") or respuesta_texto.startswith("Yes") or respuesta_texto.startswith("sí") or respuesta_texto.startswith("yes"):
                filtered_docs.append(doc)
        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_8_txt = (
        {
            "context": retriever_multi_8_txt | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | phi_llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_8_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")

NameError: name 'raw_docs' is not defined

## MODELO intfloat/e5-small-v2

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name_9 = "intfloat/e5-small-v2"
model_kwargs_9 = {"device": "cpu"}
encode_kwargs_9={"normalize_embeddings": True, "batch_size": 32}

emb_9 = HuggingFaceBgeEmbeddings(model_name=model_name_9,model_kwargs=model_kwargs_9,encode_kwargs=encode_kwargs_9)

### EXPERIMENTO 9

In [None]:
vectorstore_9_txt = FAISS.load_local("faiss_index_proyectos_9_txt", embeddings=emb_9, 
                                 allow_dangerous_deserialization=True)

In [None]:
retriever_multi_9_txt = MultiQueryRetriever.from_llm(
    retriever=vectorstore_9_txt.as_retriever(search_kwargs={"k": 3}),
    llm=phi_llm,
    include_original=True
)

In [None]:
import time
resultados={}
for i in range(1, 26):
    inicio = time.time()
    current_query = eval(f"query_{i}")

    def format_docs(docs):
        filtered_docs = []
        for doc in docs:
            response = filter_chain.invoke({
                "question": current_query,
                "document": doc.page_content
            })
            respuesta_texto = response["text"]
            print(f"\n--------\n {respuesta_texto} \n--------\n")
            if respuesta_texto.startswith("Sí") or respuesta_texto.startswith("Yes") or respuesta_texto.startswith("sí") or respuesta_texto.startswith("yes"):
                filtered_docs.append(doc)
        return "\n\n".join(doc.page_content for doc in filtered_docs)

    rag_chain_multi_9_txt = (
        {
            "context": retriever_multi_9_txt | format_docs,
            "question": RunnablePassthrough()
        }
        | prompt
        | phi_llm
        | StrOutputParser()
    )
    # Ejecutar RAG con los docs filtrados
    answer = rag_chain_multi_9_txt.invoke(current_query)
    fin = time.time()
    tiempo_ejecucion = fin - inicio

    resultados[f"query_{i}"] = answer

    print(f"Query {i}: Tiempo = {tiempo_ejecucion:.4f} s")
    print(f"La respuesta es: {answer}\n\n---\n\n")