In [1]:
from langchain import hub
from langchain.schema import StrOutputParser, Document
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_ollama import OllamaLLM
from langchain.embeddings import HuggingFaceBgeEmbeddings
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from typing import List, Optional
from functools import lru_cache
from langchain.storage import InMemoryStore
import time
import re
import os
import pandas as pd

In [2]:
phi= "phi3:mini"
llama = "llama3.2:latest"
url_llm = "http://localhost:11434"

In [3]:
phi_llm = OllamaLLM(model=phi, base_url=url_llm, temperature=0)
llama_llm = OllamaLLM(model=llama, base_url=url_llm, temperature=0)

In [4]:
prompt_evaluation_response = """
You are a rigorous evaluator. You will compare a system-generated answer to the known correct answer and evaluate its accuracy.

INSTRUCTIONS:
    1. Analyze the question, the correct answer, and the generated answer.
    2. Evaluate EXCLUSIVELY the match between the generated answer and the correct answer.
    3. Assigns a score from 1 to 5 based on this scale:
    
- Quality of the answer:
    1 = Completely incorrect (no match in any respect)
    2 = Mostly incorrect (minimal match)
    3 = Partially correct (some elements match)
    4 = Mostly correct (only minor errors)
    5 = Completely correct (exact match or equivalent).

DATA:
Question: {question}
Correct answer: {answer_correct}
Response generated: {answer_generated}

STRICT OUTPUT FORMAT (no deviations):
Justification: [1-2 sentence concise explanation of differences/similarities]
Score: [Integer from 1 to 5]


You can only get the output in the format indicated.
"""

prompt_evaluacion_respuesta = """
Usted es un evaluador riguroso. Compararás una respuesta generada por el sistema con la respuesta correcta conocida y evaluarás su precisión.

INSTRUCCIONES:
    1. Analiza la pregunta, la respuesta correcta y la respuesta generada.
    2. Evalúe EXCLUSIVAMENTE la coincidencia entre la respuesta generada y la respuesta correcta.
    3. Asigna una puntuación de 1 a 5 en base a esta escala:
    
- Calidad de la respuesta:
    1 = Completamente incorrecta (no hay coincidencia en ningún aspecto)
 2 = Mayormente incorrecta (coincidencia mínima)
 3 = Parcialmente correcta (coincidencia de algunos elementos)
 4 = Mayormente correcta (sólo errores menores)
 5 = Completamente correcta (coincidencia exacta o equivalente).

DATOS:
Pregunta: {pregunta}
Respuesta_correcta: {respuesta_correcta}
Respuesta generada: {respuesta_generada}

FORMATO DE SALIDA ESTRICTO (sin desviaciones):
Justificación: [Explicación concisa de 1-2 frases de las diferencias/similitudes]
Puntuación: [Entero de 1 a 5]


Sólo se puede obtener la salida en el formato indicado. Con una breve explicación en la justificación y en el apartado de PUNTUACIÓN un valor númerico del 1 al 5 *UNICAMENTE* 
"""


prompt_evaluation_context = """
You are an advanced evaluator. You must evaluate:

1. Faithfulness: Is the generated answer fully supported by the retrieved context? (Score 1–5)
2. Coverage: Does the retrieved context contain enough information to correctly answer the question? (Score 1–5)
3. Fluency: Is the generated answer clear, coherent, and well-written? (Score 1–5)

INSTRUCTIONS:
- Read the question, the correct answer, the generated answer, and the context.
- Assign three scores between 1 and 5.

DATA:
Question: {question}
Correct answer: {answer_correct}
Generated answer: {answer_generated}
Context: {context}

STRICT OUTPUT FORMAT:
Justification: [Concise explanation]
Score_faithfulness: [1–5]
Score_coverage: [1–5]
Score_fluency: [1–5]

The output has to be ONLY THE STRICT OUTPUT FORMAT.
"""

prompt_evaluacion_contexto = """
Eres un evaluador avanzado. Debes evaluar:

1. Fidelidad: ¿La respuesta generada está totalmente respaldada por el contexto recuperado? (Puntuación 1-5)
2. Cobertura: ¿El contexto recuperado contiene suficiente información para responder correctamente a la pregunta? (Puntuación 1-5)
3. Fluidez: ¿La respuesta generada es clara, coherente y está bien redactada? (Puntuación 1-5)

INSTRUCCIONES:
- Lee la pregunta, la respuesta correcta, la respuesta generada y el contexto.
- Asigna tres puntuaciones entre 1 y 5.

DATOS:
Pregunta: {question}
Respuesta correcta: {answer_correct}
Respuesta generada: {answer_generated}
Contexto: {context}

FORMATO DE SALIDA ESTRICTO:
Justificación: [Explicación concisa]
Puntuación_fidelidad: [1-5]
Puntuación_cobertura: [1-5]
Puntuación_fluidez: [1-5]

La salida debe tener ÚNICAMENTE EL FORMATO DE SALIDA ESTRICTO.
"""
prompt_evaluacion_respuesta = ChatPromptTemplate.from_template(prompt_evaluacion_respuesta)
prompt_evaluation_response = ChatPromptTemplate.from_template(prompt_evaluation_response)
prompt_evaluation_context = ChatPromptTemplate.from_template(prompt_evaluation_context)
prompt_evaluacion_contexto = ChatPromptTemplate.from_template(prompt_evaluacion_contexto)

In [5]:
answers_correct = ["The objective of the project with grant agreement 740934 is to combat violent extremism by analyzing its root causes, developing both preventive and repressive measures, and countering extremist narratives through collaboration with civil society and law enforcement agencies (LEAs), all while upholding fundamental rights.",
                  "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
                  "The project titled “Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), with grant agreement 863420, received a total EU contribution of €5,626,548.75 under Horizon 2020, within the “EXCELLENT SCIENCE – Research Infrastructures” programme.",
                  "The organisation that played the role of coordinator in the grant agreement 777998 was UNIVERSIDADE NOVA DE LISBOA (Participant Identification Code: 960782479).",
                  "The project with the acronym INTERRFACE (grant agreement 824330) belongs to the topic LC-SC3-ES-5-2018-2020: TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation.",
                  "The project titled European Joint Programme on Radioactive Waste Management (grant agreement 847593) was framed within the legal basis H2020-Euratom.",
                  "The grant agreement 814416 corresponded to a Research and Innovation Action (RIA) type of proposal.",
                  "The project with the acronym G9NIGHT (grant agreement 101036041) was submitted under the master call H2020-MSCA-NIGHT-2020bis.",
                  "The project titled Electron Nanocrystallography (grant agreement 956099) was submitted under the sub call H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
                  ]

In [6]:
answers_correct = [
    "The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.",
    "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
    "Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), identified by grant agreement 863420, received a total EU contribution of € 5,626,548.75. This funding was allocated as part of Horizon 2020 under the “EXCELLENT SCIENCE – Research Infrastructures",
    """the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998.
The organisation with Participant Identification Code (PIC) 960782479 participated in the grant agreement 777998. The name of this organisation is UNIVERSIDADE NOVA DE LISBOA. The organisation with PIC 960782479 is not a small or medium-sized enterprise. The organisation with PIC 960782479 develops an activity of type HES. The organisation with PIC 960782479 is based in the country PT, codified under ISO 3166. The organisation with PIC 960782479 played the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998 was 409500.0 euros. The total amount funded to the organization with PIC 960782479 in the grant agreement 777998 was 409500.0 euros.""",
    "The grant agreement 824330 was framed within the topic LC-SC3-ES-5-2018-2020TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation. The grant agreement 824330 was framed within the master call H2020-LC-SC3-2018-2019-2020. The grant agreement 824330 was framed within the subcall H2020-LC-SC3-2018-ES-SCC.",
    "The grant agreement 847593 was framed within the legal basis H2020-EuratomEuratom.",
    "The grant agreement 814416 was a Research and Innovation Action (RIA) proposal.",
    "The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020bis.",
    "The grant agreement 956099 was framed within the legal basis H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]

In [7]:
answers_correct=["The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.",
                 "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
                 "Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), identified by grant agreement 863420, received a total EU contribution of € 5,626,548.75. This funding was allocated as part of Horizon 2020 under the “EXCELLENT SCIENCE – Research Infrastructures",
                 """the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998.
The organisation with Participant Identification Code (PIC) 960782479 participated in the grant agreement 777998. The name of this organisation is UNIVERSIDADE NOVA DE LISBOA. The organisation with PIC 960782479 is not a small or medium-sized enterprise. The organisation with PIC 960782479 develops an activity of type HES. The organisation with PIC 960782479 is based in the country PT, codified under ISO 3166. The organisation with PIC 960782479 played the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998 was 409500.0 euros. The total amount funded to the organization with PIC 960782479 in the grant agreement 777998 was 409500.0 euros.""",
                 "The grant agreement 824330 was framed within the topic LC-SC3-ES-5-2018-2020TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation. The grant agreement 824330 was framed within the master call H2020-LC-SC3-2018-2019-2020. The grant agreement 824330 was framed within the subcall H2020-LC-SC3-2018-ES-SCC.",
                 "The grant agreement 847593 was framed within the legal basis H2020-EuratomEuratom.",
                 "The grant agreement 814416 was a Research and Innovation Action (RIA) proposal.",
                 "The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020.",
                 "The grant agreement 956099 was framed within the legal basis H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
                 "The objective described corresponds to the ILIAD project, which aims at establishing an interoperable, data-intensive, and cost-effective Digital Twin of the Ocean (DTO). The project integrates Earth observation data, immersive technologies, and semantic frameworks to support sustainable ocean management and contributes to the European Green Deal and the UN Ocean Decade objectives. OR The objective described corresponds to the TwinECS project, which focuses on developing an efficient and accurate model for simulating an e-ECS system under the Dymola/Modelica framework. It involves the creation of physical and surrogate models for thermo-fluid and electrical components but does not explicitly refer to the concept of digital twin.",
                 """These are the 12 projects whose objective is related to maritime structures. The correct answer must coincide of these responses:
1. Elastopoli (AquaComp)
Objetivo:
Replicar comercialmente en el sector automotriz un nanocompuesto de nanocelulosa (AquaComp), validado previamente en instrumentos musicales, mediante la ampliación de su producción a escala industrial, con el fin de sustituir compuestos poliméricos no renovables por materiales más ligeros, sostenibles y con mejores propiedades mecánicas, acústicas y hápticas.

2. SPNano (Nanodispersión con proteína SP1)
Objetivo:
Implementar la proteína SP1 en procesos industriales para optimizar la dispersión de nanopartículas (como grafeno o nanotubos de carbono) en materiales compuestos, logrando nanocompuestos con un 50% más de resistencia y propiedades mejoradas, superando los problemas de aglomeración y escalabilidad.

3. NANOLEAP (Nanocompuestos para construcción)
Objetivo:
Establecer una red europea de líneas piloto especializadas en nanocompuestos para aplicaciones en infraestructura y construcción, facilitando la escalabilidad industrial de recubrimientos anti-corrosivos, materiales multifuncionales (autolimpiantes, térmicos) y elementos prefabricados no estructurales.

4. CO-PILOT (Producción abierta de nanocompuestos)
Objetivo:
Crear una infraestructura de acceso abierto para que pymes produzcan nanocompuestos funcionales (anti-llama, aislantes, UV-protectores) a escala piloto (20-100 kg), integrando monitorización en línea de la calidad de dispersión de nanopartículas y procesos automatizados.

5. Carbo4Power (Palas de turbinas offshore)
Objetivo:
Desarrollar materiales multimodales ligeros y nanoingenierizados (compuestos dinámicos, recubrimientos multifuncionales) para palas de turbinas eólicas y mareomotrices, mejorando su rendimiento, durabilidad y reciclabilidad (hasta 95%), reduciendo el coste de la energía renovable.

6. ACHIEF (Materiales para industrias intensivas en energía)
Objetivo:
Desarrollar aleaciones de alta entropía (HEAs) y recubrimientos cerámicos avanzados para equipos industriales expuestos a condiciones extremas, mejorando su resistencia a la corrosión, fatiga térmica y creep, con sensores integrados para monitoreo en tiempo real.

7. OptiNanoPro (Nanomateriales en packaging y energía)
Objetivo:
Integrar nanotecnologías en líneas de producción industriales para desarrollar materiales con propiedades avanzadas (barrera, autolimpiantes, UV-resistentes) en sectores como envases, automoción y paneles solares, optimizando procesos mediante sistemas de monitorización en línea.

8. NANO2DAY (MXenes para electrónica y aerospacio)
Objetivo:
Explorar el potencial de los MXenes (nuevos nanomateriales 2D) en composites poliméricos para aplicaciones en electrónica vestible y componentes aeroespaciales, comparando su eficacia frente a compuestos con grafeno y escalando su síntesis a nivel cuasi-industrial.

9. PANG (Grafeno contra patógenos resistentes)
Objetivo:
Investigar nanocompuestos basados en grafeno como alternativa a los antibióticos para combatir infecciones bacterianas multirresistentes, estudiando su mecanismo de acción y desarrollando terapias no invasivas en colaboración con socios académicos e industriales.

10. RECOPHARMA (Tratamiento de aguas con nanomateriales)
Objetivo:
Desarrollar un sistema innovador basado en nanocompuestos y oxidación avanzada para eliminar fármacos recalcitrantes (como citostáticos) de aguas residuales, combinando polímeros impresos molecularmente y procesos de recuperación en modo continuo.

11. FAST (Impresión 3D híbrida para ingeniería de tejidos)
Objetivo:
Desarrollar una tecnología híbrida de impresión 3D que combine nanocompuestos funcionalizados, plasma atmosférico y control de gradientes para fabricar andamios tisulares personalizados con propiedades mecánicas, bioactivas y superficiales optimizadas para regeneración ósea.

12. MSCA-RISE (Nanocompuestos ópticos anisotrópicos)
Objetivo:
Diseñar materiales nanoestructurados con anisotropía controlada para aplicaciones en óptica y sub-THz, combinando cristalografía, nanoingeniería y modelado avanzado, con el fin de mejorar la eficiencia energética en celdas ópticas y dispositivos fotónicos."""
    ]

In [8]:
print(len(answers_correct))

11


In [9]:
df = pd.read_excel('resultados_8.xlsx', engine='openpyxl')
primeras_9 = df.iloc[:9]
filas_13_14 = df.iloc[[12, 13]]

df_seleccionado = pd.concat([primeras_9, filas_13_14])

## EVALUACIÓN INGLES

In [10]:
resultados = []

for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])

    prompt = prompt_evaluation_response.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated
    )

    response = llama_llm.invoke(prompt)

    match = re.search(r"Score:\s*(\d+)", response)
    puntuacion = int(match.group(1)) if match else None

    resultados.append({
        'Consulta': question,
        'Respuesta_generada': answer_generated,
        'Justificación_LLM': response,
        'Puntuación': puntuacion
    })

df_resultados = pd.DataFrame(resultados)

In [11]:
print(df_resultados)

                                             Consulta  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
9   Provide the objective of 1 project related to ...   
10  Provide the objective of 3 different projects ...   

                                   Respuesta_generada  \
0   The Terrorism Prevention via Radicalisation Co...   
1   The total cost of the project HYPERGRYD under ...   
2   The project titled Transforming Research throu...   
3   The European Commission (EC) acted as the proj...   
4   The INTERRFACE project is 

In [12]:
def extract_score(response_text, keyword):
    pattern = rf"(?i)(?:score[_ ]*)?{re.escape(keyword)}\s*[:=]\s*([0-9]+)"
    match = re.search(pattern, response_text)
    if match:
        return int(match.group(1))
    else:
        raise AttributeError(f"Score for '{keyword}' not found in text.")

In [13]:
evaluaciones = {}
for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])
    
    # Unimos fragmentos para el contexto
    fragmentos = []
    for col in df.columns:
        if col.startswith('Fragment_') and pd.notna(row[col]):
            fragmentos.append(str(row[col]))
    context = "\n".join(fragmentos)

    # Formateamos el prompt
    prompt = prompt_evaluation_context.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated,
        context=context
    )

    response = llama_llm.invoke(prompt)
    
    print(f"\n--- Evaluación extendida para Query {idx} ---")
    print(f"Pregunta: {question}")
    print(f"Respuesta correcta: {answer_correct}")
    print(f"Respuesta generada: {answer_generated}")
    print(f"Evaluación del LLM:\n{response}")
    print("---\n")
    
    try:
        score_faithfulness = extract_score(response, "faithfulness")
        score_coverage = extract_score(response, "coverage")
        score_fluency = extract_score(response, "fluency")
    except AttributeError:
        print(f"Error: No se encontraron scores en la respuesta para Query {idx}")
        score_faithfulness = None
        score_coverage = None
        score_fluency = None
    
    evaluaciones[f"query_{idx}"] = {
        "score_faithfulness": score_faithfulness,
        "score_coverage": score_coverage,
        "score_fluency": score_fluency
    }
    
    print("Score faithfulness:", score_faithfulness)
    print("Score coverage:", score_coverage)
    print("Score fluency:", score_fluency)
    print("\n")


--- Evaluación extendida para Query 0 ---
Pregunta: What is the objective of the project with grant agreement 740934?
Respuesta correcta: The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.
Respuesta generada: The Terrorism Prevention via Radicalisation Counter-Narrative - Narrative approach aims to develop strategies for preventing violent radicalization in Europe by creating comprehensive approaches from early understanding to improving protection. It seeks to explore the impact of narratives on counteracting extremist ideologies and promoting social cohesion within EU societies.
Evaluación del LLM:
Justification: The generated answer is partially supported by the retrieved context, as it mentions the project's objective related to combating violent extremism. However, the specif


--- Evaluación extendida para Query 7 ---
Pregunta: To which master call was the project with the acronym G9NIGHT submitted?
Respuesta correcta: The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020.
Respuesta generada: The project G9NIGHT was submitted under master call H2020-MSCA-NIGHT-2020bis.
Evaluación del LLM:
Justification: The generated answer partially deviates from the retrieved context, as it incorrectly includes "European Researchers' Night" in the master call name.

Score_faithfulness: 3
Score_coverage: 4
Score_fluency: 5
---

Score faithfulness: 3
Score coverage: 4
Score fluency: 5



--- Evaluación extendida para Query 8 ---
Pregunta: To which sub call was the project titled Electron Nanocrystallography submitted?
Respuesta correcta: The grant agreement 956099 was framed within the legal basis H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.
Respuesta generada: The question cannot be answered based on the provided context 

In [15]:
scores_faithfulness = []
scores_coverage = []
scores_fluency = []

for i in range(len(df_resultados)):
    eval_result = evaluaciones.get(f"query_{i}", {})
    scores_faithfulness.append(eval_result.get("score_faithfulness"))
    scores_coverage.append(eval_result.get("score_coverage"))
    scores_fluency.append(eval_result.get("score_fluency"))

df_resultados['Puntuación_faithfulness'] = scores_faithfulness
df_resultados['Puntuación_coverage'] = scores_coverage
df_resultados['Puntuación_fluency'] = scores_fluency

In [16]:
df_resultados.to_excel('evaluacion_ingles_9.xlsx', index=False)

## EVALUACIÓN ESPAÑOL

In [10]:
resultados_evaluacion = []

for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])

    fragmentos = []
    for col in df.columns:
        if col.startswith('Fragment_') and pd.notna(row[col]):
            fragmentos.append(str(row[col]))
    context = "\n".join(fragmentos)

    prompt = prompt_evaluacion_respuesta.format(
        pregunta=question,
        respuesta_correcta=answer_correct,
        respuesta_generada=answer_generated
    )

    response = llama_llm.invoke(prompt)
    print(f"\n-----\n {response} \n-----\n")

    match = re.search(
        r"\**\s*Puntuación\s*\**\s*(?:[:=])\s*\**\s*(\d+)\s*\**",
        response,
        re.IGNORECASE
    )
    puntuacion = int(match.group(1)) if match else None

    resultados_evaluacion.append({
        "Consulta": question,
        "Respuesta_generada": answer_generated,
        "Respuesta_correcta": answer_correct,
        "Evaluación_LLM": response,
        "Puntuación_global": puntuacion
    })


df_eval_global = pd.DataFrame(resultados_evaluacion)


print(df_eval_global.head())



-----
 Justificación: La respuesta generada no coincide con la respuesta correcta en términos de objetivo del proyecto, ya que se centra en mejorar las características de búsqueda en motores de búsqueda, mientras que la respuesta correcta aborda el combate al extremismo violento.

Puntuación: 2 
-----


-----
 Justificación:
La respuesta generada coincide con la respuesta correcta en cuanto a la mención del proyecto HYPERGRYD, pero presenta una diferencia significativa en el cálculo del costo total. La respuesta correcta proporciona un valor específico de €5,987,875.00, mientras que la respuesta generada utiliza un formato numérico diferente y no proporciona el contexto completo.

Puntuación: 4 
-----


-----
 Justificación:
La respuesta generada no menciona el nombre completo del proyecto "Transforming Research through Innovative Practices for Linked interdisciplinary Exploration" (TRIPLE) como se indica en la pregunta, y también omite la mención de la EU contribution. 

Puntuación: 

In [11]:
def extract_score(text, keyword):
    cleaned_text = re.sub(r'\*\*', '', text)
    cleaned_text = cleaned_text.lower()     
    keyword = keyword.lower()               

    pattern = rf"(puntuación[_ ]*)?{keyword}\s*[:=]\s*([0-9]+)"
    match = re.search(pattern, cleaned_text)
    
    if match:
        return int(match.group(2))
    return None

In [12]:
evaluaciones = {}
for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])
    
    fragmentos = []
    for col in df.columns:
        if col.startswith('Fragment_') and pd.notna(row[col]):
            fragmentos.append(str(row[col]))
    context = "\n".join(fragmentos)

    prompt = prompt_evaluacion_contexto.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated,
        context=context
    )

    response = llama_llm.invoke(prompt)
    
    print(f"\n--- Evaluación extendida para Query {idx} ---")
    print(f"Pregunta: {question}")
    print(f"Respuesta correcta: {answer_correct}")
    print(f"Respuesta generada: {answer_generated}")
    print(f"Evaluación del LLM:\n{response}")
    print("---\n")
    
    try:
        score_faithfulness = extract_score(response, "fidelidad")
        score_coverage = extract_score(response, "cobertura")
        score_fluency = extract_score(response, "fluidez")
    except AttributeError:
        print(f"Error: No se encontraron scores en la respuesta para Query {idx}")
        score_faithfulness = None
        score_coverage = None
        score_fluency = None
    
    evaluaciones[f"query_{idx}"] = {
        "score_faithfulness": score_faithfulness,
        "score_coverage": score_coverage,
        "score_fluency": score_fluency
    }
    
    print("Score faithfulness:", score_faithfulness)
    print("Score coverage:", score_coverage)
    print("Score fluency:", score_fluency)
    print("\n")



--- Evaluación extendida para Query 0 ---
Pregunta: What is the objective of the project with grant agreement 740934?
Respuesta correcta: The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.
Respuesta generada: The grant agreement 740934 aimed to improve query suggestion features in web search engines. It focused on helping users express their information needs more precisely through enhanced suggestions based on user intent and contextual understanding. The project sought to refine the accuracy of these suggestions, thereby improving overall search experience for end-users.
Evaluación del LLM:
Fidelidad: 0
Cobertura: 2
Fluidez: 3
---

Score faithfulness: 0
Score coverage: 2
Score fluency: 3



--- Evaluación extendida para Query 1 ---
Pregunta: What is the total cost of the project


--- Evaluación extendida para Query 7 ---
Pregunta: To which master call was the project with the acronym G9NIGHT submitted?
Respuesta correcta: The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020.
Respuesta generada: The project G9NIGHT does not appear to be associated with any Horizon 2020 master call based on the provided context, as it is a fictional acronym and no relevant information about such a project exists within this document. Therefore, I do not have data regarding its submission or association with a specific Horizon 2020 master call.
Evaluación del LLM:
Justificación:
La respuesta generada no está totalmente respaldada por el contexto recuperado, ya que no se menciona explícitamente la relación entre el proyecto G9NIGHT y el master call H2020-MSCA-NIGHT-2020. La respuesta también es demasiado negativa y no proporciona una solución o una posibilidad de respuesta correcta.

Puntuación_fidelidad: 2
Puntuación_cobertura: 4
Puntuación_fluide

In [13]:
scores_faithfulness = []
scores_coverage = []
scores_fluency = []

for i in range(len(df_eval_global)):
    eval_result = evaluaciones.get(f"query_{i}", {})
    scores_faithfulness.append(eval_result.get("score_faithfulness"))
    scores_coverage.append(eval_result.get("score_coverage"))
    scores_fluency.append(eval_result.get("score_fluency"))

# Añadir las columnas al DataFrame
df_eval_global['Puntuación_faithfulness'] = scores_faithfulness
df_eval_global['Puntuación_coverage'] = scores_coverage
df_eval_global['Puntuación_fluency'] = scores_fluency

In [14]:
print(df_eval_global)

                                             Consulta  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
9   Provide the objective of 1 project related to ...   
10  Provide the objective of 3 different projects ...   

                                   Respuesta_generada  \
0   The grant agreement 740934 aimed to improve qu...   
1   The total funded cost of the HYPERGRYD project...   
2   The Transforming Research through Innovative P...   
3   The European Commission (EC) acted as the proj...   
4   INTERRFACE is a project th

In [15]:
df_eval_global.to_excel('evaluacion_español_8.xlsx', index=False)