In [1]:
from langchain import hub
from langchain.schema import StrOutputParser, Document
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_ollama import OllamaLLM
from langchain.embeddings import HuggingFaceBgeEmbeddings
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from typing import List, Optional
from functools import lru_cache
from langchain.storage import InMemoryStore
import time
import re
import os
import pandas as pd

In [2]:
# Ollama server URL and the model tags handed to the OllamaLLM clients.
url_llm = "http://localhost:11434"
phi = "phi3:mini"
llama = "llama3.2:latest"

In [3]:
# One client per model; both point at the same server and use temperature=0.
phi_llm = OllamaLLM(
    model=phi,
    base_url=url_llm,
    temperature=0,
)
llama_llm = OllamaLLM(
    model=llama,
    base_url=url_llm,
    temperature=0,
)

# Prompts de evaluación — escala 1-5 (inglés)

In [4]:
# English judge prompts on a 1–5 scale.
# `prompt_evaluation_response` compares a generated answer with the reference answer
# (placeholders: question, answer_correct, answer_generated) and asks for a single "Score:".
# `prompt_evaluation_context` additionally receives the retrieved context and asks for
# Score_faithfulness / Score_coverage / Score_fluency.
# NOTE(review): the following cell assigns these same two names with 1–3-scale prompts, so
# whichever of the two cells ran last decides the scale used by the evaluation loops.
prompt_evaluation_response = """
You are a careful but flexible evaluator. You will compare a system-generated answer to the known correct answer and assess its overall correctness based on meaning and relevance, not exact wording.

INSTRUCTIONS:
1. Read the question, the correct answer, and the generated answer.
2. Focus on whether the generated answer conveys the same key meaning or achieves the same communicative goal as the correct answer.
3. Ignore minor differences in grammar, structure, or terminology unless they cause confusion or change the meaning.
4. Assign a score from 1 to 5 based on the following scale:

    SCORE DEFINITIONS:
    1 = Completely incorrect — The answer is unrelated, factually wrong, or contradicts the correct answer.
    2 = Mostly incorrect — The answer contains minor relevance or surface similarity, but lacks essential meaning.
    3 = Partially correct — The answer includes some key information but misses or distorts important parts.
    4 = Mostly correct — The answer preserves most of the intended meaning with only minor omissions or inaccuracies.
    5 = Fully correct — The answer accurately conveys the essential meaning and intent, even if reworded or slightly reformulated.

DATA:
Question: {question}
Correct answer: {answer_correct}
Response generated: {answer_generated}

STRICT OUTPUT FORMAT (no deviations):
Justification: [1-2 sentence concise explanation of differences/similarities]
Score: [Integer from 1 to 5]
"""

prompt_evaluation_context = """ 
You are an advanced evaluator. You must evaluate:

1. Faithfulness: Is the generated answer fully supported by the retrieved context? (Score 1–5)
2. Coverage: Does the retrieved context contain enough information to correctly answer the question? (Score 1–5)
3. Fluency: Is the generated answer clear, coherent, and well-written? (Score 1–5)

INSTRUCTIONS:
- Read the question, the correct answer, the generated answer, and the context.
- Assign three scores between 1 and 5 using the following scale:

SCORE DEFINITIONS:
    1 = Very Poor — Major issues or severe deficiencies (e.g. completely unsupported claims, missing or irrelevant context, incoherent writing)
    2 = Poor — Significant problems that undermine quality or accuracy
    3 = Fair — Noticeable issues but partially adequate or acceptable
    4 = Good — Minor issues, generally correct and well-formed
    5 = Excellent — Fully accurate, comprehensive, and well-written with no issues

DATA:
Question: {question}
Correct answer: {answer_correct}
Generated answer: {answer_generated}
Context: {context}

STRICT OUTPUT FORMAT:
Justification: [Concise explanation]
Score_faithfulness: [1–5]
Score_coverage: [1–5]
Score_fluency: [1–5]

The output has to be ONLY THE STRICT OUTPUT FORMAT.
"""


# Rebind both names from the raw template text to ChatPromptTemplate objects.
prompt_evaluation_response = ChatPromptTemplate.from_template(prompt_evaluation_response)
prompt_evaluation_context = ChatPromptTemplate.from_template(prompt_evaluation_context)

# Prompts de evaluación — escala 1-3 (inglés)

In [4]:
# English judge prompts on a 1–3 scale.
# Same placeholders and output labels as the 1–5 prompts defined in the previous cell.
# NOTE(review): this cell reuses the names `prompt_evaluation_response` and
# `prompt_evaluation_context`, so running it replaces the 1–5 versions.
prompt_evaluation_response = """
You are a careful but flexible evaluator. You will compare a system-generated answer to the known correct answer and assess its overall correctness based on meaning and relevance, not exact wording.

INSTRUCTIONS:
1. Read the question, the correct answer, and the generated answer.
2. Focus on whether the generated answer conveys the same key meaning or achieves the same communicative goal as the correct answer.
3. Ignore minor differences in grammar, structure, or terminology unless they cause confusion or change the meaning.
4. Assign a score from 1 to 3 based on the following scale:

SCORE DEFINITIONS:
    1 = Incorrect — The answer misses the main point or conveys incorrect or unrelated information.
    2 = Partially correct — The answer captures part of the intended meaning but omits or distorts important elements.
    3 = Correct — The answer preserves the essential meaning and intent of the correct answer, even if phrased differently.

DATA:
Question: {question}
Correct answer: {answer_correct}
Response generated: {answer_generated}

STRICT OUTPUT FORMAT (no deviations):
Justification: [1-2 sentence concise explanation of differences/similarities]
Score: [Integer from 1 to 3]
"""
prompt_evaluation_context = """
You are an advanced evaluator. You must evaluate:

1. Faithfulness: Is the generated answer fully supported by the retrieved context? (Score 1–3)
2. Coverage: Does the retrieved context contain enough information to correctly answer the question? (Score 1–3)
3. Fluency: Is the generated answer clear, coherent, and well-written? (Score 1–3)

INSTRUCTIONS:
- Read the question, the correct answer, the generated answer, and the context.
- Assign three scores between 1 and 3 using the following scale:

SCORE DEFINITIONS:
    1 = Poor — Major issues or gaps in this dimension (e.g. unsupported claims, insufficient context, or poor writing)
    2 = Fair — Some issues present but partially adequate
    3 = Good — Fully satisfactory with no significant issues

DATA:
Question: {question}
Correct answer: {answer_correct}
Generated answer: {answer_generated}
Context: {context}

STRICT OUTPUT FORMAT:
Justification: [Concise explanation]
Score_faithfulness: [1–3]
Score_coverage: [1–3]
Score_fluency: [1–3]

The output has to be ONLY THE STRICT OUTPUT FORMAT.
"""
# Rebind both names from the raw template text to ChatPromptTemplate objects.
prompt_evaluation_context = ChatPromptTemplate.from_template(prompt_evaluation_context)
prompt_evaluation_response = ChatPromptTemplate.from_template(prompt_evaluation_response)

# Reference answers, one per evaluated query, matched to rows by position.
# NOTE(review): this first list is overwritten by the assignment right below it before
# anything reads it, so it currently has no effect at runtime.
answers_correct = ["The objective of the project with grant agreement 740934 is to combat violent extremism by analyzing its root causes, developing both preventive and repressive measures, and countering extremist narratives through collaboration with civil society and law enforcement agencies (LEAs), all while upholding fundamental rights.",
                  "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
                  "The project titled “Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), with grant agreement 863420, received a total EU contribution of €5,626,548.75 under Horizon 2020, within the “EXCELLENT SCIENCE – Research Infrastructures” programme.",
                  "The organisation that played the role of coordinator in the grant agreement 777998 was UNIVERSIDADE NOVA DE LISBOA (Participant Identification Code: 960782479).",
                  "The project with the acronym INTERRFACE (grant agreement 824330) belongs to the topic LC-SC3-ES-5-2018-2020: TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation.",
                  "The project titled European Joint Programme on Radioactive Waste Management (grant agreement 847593) was framed within the legal basis H2020-Euratom.",
                  "The grant agreement 814416 corresponded to a Research and Innovation Action (RIA) type of proposal.",
                  "The project with the acronym G9NIGHT (grant agreement 101036041) was submitted under the master call H2020-MSCA-NIGHT-2020bis.",
                  "The project titled Electron Nanocrystallography (grant agreement 956099) was submitted under the sub call H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
                  ]

# Second variant of the same nine answers, followed by empty-string placeholders.
# NOTE(review): the next cell assigns `answers_correct` again, so this list is only the
# active one if that cell is not executed afterwards.
answers_correct = [
    "The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.",
    "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
    "Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), identified by grant agreement 863420, received a total EU contribution of € 5,626,548.75. This funding was allocated as part of Horizon 2020 under the “EXCELLENT SCIENCE – Research Infrastructures",
    """the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998.
The organisation with Participant Identification Code (PIC) 960782479 participated in the grant agreement 777998. The name of this organisation is UNIVERSIDADE NOVA DE LISBOA. The organisation with PIC 960782479 is not a small or medium-sized enterprise. The organisation with PIC 960782479 develops an activity of type HES. The organisation with PIC 960782479 is based in the country PT, codified under ISO 3166. The organisation with PIC 960782479 played the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998 was 409500.0 euros. The total amount funded to the organization with PIC 960782479 in the grant agreement 777998 was 409500.0 euros.""",
    "The grant agreement 824330 was framed within the topic LC-SC3-ES-5-2018-2020TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation. The grant agreement 824330 was framed within the master call H2020-LC-SC3-2018-2019-2020. The grant agreement 824330 was framed within the subcall H2020-LC-SC3-2018-ES-SCC.",
    "The grant agreement 847593 was framed within the legal basis H2020-EuratomEuratom.",
    "The grant agreement 814416 was a Research and Innovation Action (RIA) proposal.",
    "The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020bis.",
    "The grant agreement 956099 was framed within the legal basis H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]

# Prompts de evaluación — escala 1-5 (español)

In [4]:
# Spanish judge prompts on a 1–5 scale (Spanish counterparts of the English prompts).
# `prompt_evaluacion_respuesta` asks for a single "Puntuación:" line;
# `prompt_evaluacion_contexto` asks for Puntuación_fidelidad / _cobertura / _fluidez.
# NOTE(review): unlike the English context prompts, the Spanish context prompt has no closing
# "output only the strict format" instruction.
prompt_evaluacion_respuesta = """
Eres un evaluador cuidadoso pero flexible. Compararás una respuesta generada por el sistema con la respuesta correcta conocida y evaluarás su corrección general en función del significado y la relevancia, no de la redacción exacta.

INSTRUCCIONES:
1. Lee la pregunta, la respuesta correcta y la respuesta generada.
2. Concéntrate en si la respuesta generada transmite el mismo significado clave o cumple el mismo objetivo comunicativo que la respuesta correcta.
3. Ignora diferencias menores en gramática, estructura o terminología, a menos que causen confusión o cambien el significado.
4. Asigna una puntuación del 1 al 5 según la siguiente escala:

    DEFINICIÓN DE PUNTUACIONES:
    1 = Completamente incorrecta — La respuesta no está relacionada, es incorrecta o contradice la respuesta correcta.
    2 = Mayormente incorrecta — La respuesta contiene cierta relevancia superficial, pero carece del significado esencial.
    3 = Parcialmente correcta — La respuesta incluye información clave pero omite o distorsiona partes importantes.
    4 = Mayormente correcta — La respuesta conserva la mayor parte del significado previsto con solo omisiones o inexactitudes menores.
    5 = Totalmente correcta — La respuesta transmite con precisión el significado e intención esenciales, incluso si está reformulada o expresada de manera diferente.

DATOS:
Pregunta: {question}
Respuesta correcta: {answer_correct}
Respuesta generada: {answer_generated}

FORMATO DE SALIDA ESTRICTO (sin desviaciones):
Justificación: [Explicación concisa de 1 a 2 frases sobre las diferencias/similitudes]
Puntuación: [Entero del 1 al 5]
"""

prompt_evaluacion_contexto = """ 
Eres un evaluador avanzado. Debes evaluar:

1. Fidelidad: ¿La respuesta generada está completamente respaldada por el contexto recuperado? (Puntuación 1–5)
2. Cobertura: ¿El contexto recuperado contiene suficiente información para responder correctamente a la pregunta? (Puntuación 1–5)
3. Fluidez: ¿La respuesta generada es clara, coherente y está bien redactada? (Puntuación 1–5)

INSTRUCCIONES:
- Lee la pregunta, la respuesta correcta, la respuesta generada y el contexto.
- Asigna tres puntuaciones entre 1 y 5 usando la siguiente escala:

DEFINICIÓN DE PUNTUACIONES:
    1 = Muy deficiente — Problemas importantes o deficiencias graves (por ejemplo, afirmaciones sin respaldo, contexto irrelevante o ausente, redacción incoherente)
    2 = Deficiente — Problemas significativos que afectan la calidad o exactitud
    3 = Aceptable — Problemas notables pero parcialmente adecuada
    4 = Buena — Problemas menores, en general correcta y bien formulada
    5 = Excelente — Totalmente precisa, completa y bien redactada sin problemas

DATOS:
Pregunta: {question}
Respuesta correcta: {answer_correct}
Respuesta generada: {answer_generated}
Contexto: {context}

FORMATO DE SALIDA ESTRICTO:
Justificación: [Explicación concisa]
Puntuación_fidelidad: [1–5]
Puntuación_cobertura: [1–5]
Puntuación_fluidez: [1–5]
"""

# Rebind both names from the raw template text to ChatPromptTemplate objects.
prompt_evaluacion_respuesta = ChatPromptTemplate.from_template(prompt_evaluacion_respuesta)
prompt_evaluacion_contexto = ChatPromptTemplate.from_template(prompt_evaluacion_contexto)


In [5]:
# Reference ("gold") answers used by the judge prompts, one per selected query and matched
# to the rows of the selected frame by position (11 entries for 11 rows).
# Fix: entry 8 now names the master call "H2020-MSCA-NIGHT-2020bis", in agreement with the
# two earlier drafts of this list for the same grant agreement (101036041); the truncated
# "...-2020" value would make the judge penalise a correct generated answer.
answers_correct=["The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.",
                 "The total cost of the project with the acronym HYPERGRYD (grant agreement 101036656) was €5,987,875.00.",
                 "Transforming Research through Innovative Practices for Linked interdisciplinary Exploration” (TRIPLE), identified by grant agreement 863420, received a total EU contribution of € 5,626,548.75. This funding was allocated as part of Horizon 2020 under the “EXCELLENT SCIENCE – Research Infrastructures",
                 """the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998.
The organisation with Participant Identification Code (PIC) 960782479 participated in the grant agreement 777998. The name of this organisation is UNIVERSIDADE NOVA DE LISBOA. The organisation with PIC 960782479 is not a small or medium-sized enterprise. The organisation with PIC 960782479 develops an activity of type HES. The organisation with PIC 960782479 is based in the country PT, codified under ISO 3166. The organisation with PIC 960782479 played the role of coordinator in the grant agreement 777998. The participation cost of the organisation with PIC 960782479 in the grant agreement 777998 was 409500.0 euros. The total amount funded to the organization with PIC 960782479 in the grant agreement 777998 was 409500.0 euros.""",
                 "The grant agreement 824330 was framed within the topic LC-SC3-ES-5-2018-2020TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation. The grant agreement 824330 was framed within the master call H2020-LC-SC3-2018-2019-2020. The grant agreement 824330 was framed within the subcall H2020-LC-SC3-2018-ES-SCC.",
                 "The grant agreement 847593 was framed within the legal basis H2020-EuratomEuratom.",
                 "The grant agreement 814416 was a Research and Innovation Action (RIA) proposal.",
                 "The grant agreement 101036041 was framed within the master call H2020-MSCA-NIGHT-2020bis.",
                 "The grant agreement 956099 was framed within the legal basis H2020-EU.1.3.EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions.",
                 "The objective described corresponds to the ILIAD project, which aims at establishing an interoperable, data-intensive, and cost-effective Digital Twin of the Ocean (DTO). The project integrates Earth observation data, immersive technologies, and semantic frameworks to support sustainable ocean management and contributes to the European Green Deal and the UN Ocean Decade objectives. OR The objective described corresponds to the TwinECS project, which focuses on developing an efficient and accurate model for simulating an e-ECS system under the Dymola/Modelica framework. It involves the creation of physical and surrogate models for thermo-fluid and electrical components but does not explicitly refer to the concept of digital twin.",
                 # NOTE(review): the header below says "maritime structures", while the query paired
                 # with this entry asks about corrosion and the listed projects describe
                 # nanocomposites — confirm which topic/list is intended before trusting this score.
                 """These are the 12 projects whose objective is related to maritime structures. The correct answer must coincide of these responses:
1. Elastopoli (AquaComp)
Objetivo:
Replicar comercialmente en el sector automotriz un nanocompuesto de nanocelulosa (AquaComp), validado previamente en instrumentos musicales, mediante la ampliación de su producción a escala industrial, con el fin de sustituir compuestos poliméricos no renovables por materiales más ligeros, sostenibles y con mejores propiedades mecánicas, acústicas y hápticas.

2. SPNano (Nanodispersión con proteína SP1)
Objetivo:
Implementar la proteína SP1 en procesos industriales para optimizar la dispersión de nanopartículas (como grafeno o nanotubos de carbono) en materiales compuestos, logrando nanocompuestos con un 50% más de resistencia y propiedades mejoradas, superando los problemas de aglomeración y escalabilidad.

3. NANOLEAP (Nanocompuestos para construcción)
Objetivo:
Establecer una red europea de líneas piloto especializadas en nanocompuestos para aplicaciones en infraestructura y construcción, facilitando la escalabilidad industrial de recubrimientos anti-corrosivos, materiales multifuncionales (autolimpiantes, térmicos) y elementos prefabricados no estructurales.

4. CO-PILOT (Producción abierta de nanocompuestos)
Objetivo:
Crear una infraestructura de acceso abierto para que pymes produzcan nanocompuestos funcionales (anti-llama, aislantes, UV-protectores) a escala piloto (20-100 kg), integrando monitorización en línea de la calidad de dispersión de nanopartículas y procesos automatizados.

5. Carbo4Power (Palas de turbinas offshore)
Objetivo:
Desarrollar materiales multimodales ligeros y nanoingenierizados (compuestos dinámicos, recubrimientos multifuncionales) para palas de turbinas eólicas y mareomotrices, mejorando su rendimiento, durabilidad y reciclabilidad (hasta 95%), reduciendo el coste de la energía renovable.

6. ACHIEF (Materiales para industrias intensivas en energía)
Objetivo:
Desarrollar aleaciones de alta entropía (HEAs) y recubrimientos cerámicos avanzados para equipos industriales expuestos a condiciones extremas, mejorando su resistencia a la corrosión, fatiga térmica y creep, con sensores integrados para monitoreo en tiempo real.

7. OptiNanoPro (Nanomateriales en packaging y energía)
Objetivo:
Integrar nanotecnologías en líneas de producción industriales para desarrollar materiales con propiedades avanzadas (barrera, autolimpiantes, UV-resistentes) en sectores como envases, automoción y paneles solares, optimizando procesos mediante sistemas de monitorización en línea.

8. NANO2DAY (MXenes para electrónica y aerospacio)
Objetivo:
Explorar el potencial de los MXenes (nuevos nanomateriales 2D) en composites poliméricos para aplicaciones en electrónica vestible y componentes aeroespaciales, comparando su eficacia frente a compuestos con grafeno y escalando su síntesis a nivel cuasi-industrial.

9. PANG (Grafeno contra patógenos resistentes)
Objetivo:
Investigar nanocompuestos basados en grafeno como alternativa a los antibióticos para combatir infecciones bacterianas multirresistentes, estudiando su mecanismo de acción y desarrollando terapias no invasivas en colaboración con socios académicos e industriales.

10. RECOPHARMA (Tratamiento de aguas con nanomateriales)
Objetivo:
Desarrollar un sistema innovador basado en nanocompuestos y oxidación avanzada para eliminar fármacos recalcitrantes (como citostáticos) de aguas residuales, combinando polímeros impresos molecularmente y procesos de recuperación en modo continuo.

11. FAST (Impresión 3D híbrida para ingeniería de tejidos)
Objetivo:
Desarrollar una tecnología híbrida de impresión 3D que combine nanocompuestos funcionalizados, plasma atmosférico y control de gradientes para fabricar andamios tisulares personalizados con propiedades mecánicas, bioactivas y superficiales optimizadas para regeneración ósea.

12. MSCA-RISE (Nanocompuestos ópticos anisotrópicos)
Objetivo:
Diseñar materiales nanoestructurados con anisotropía controlada para aplicaciones en óptica y sub-THz, combinando cristalografía, nanoingeniería y modelado avanzado, con el fin de mejorar la eficiencia energética en celdas ópticas y dispositivos fotónicos."""
    ]

In [6]:
# Sanity check: the evaluation loops index answers_correct by row position, so this count
# has to cover every selected row.
print(len(answers_correct))

11


In [32]:
# Load the experiment sheet and keep only the evaluated queries:
# the first nine rows plus the rows at positions 12 and 13.
df = pd.read_excel('exps 6-9/Experimento_9.xlsx', engine='openpyxl')

primeras_9 = df.iloc[:9]
filas_13_14 = df.iloc[[12, 13]]

# Original row labels are kept (no index reset); the "Consulta" column is exposed as "Query".
df_seleccionado = pd.concat([primeras_9, filas_13_14]).rename(columns={"Consulta": "Query"})


In [33]:
# Plain-text dump of the selected rows for visual inspection.
print(df_seleccionado)

                                                Query  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
12  Provide the objective of 1 project related to ...   
13  Provide the objective of 3 different projects ...   

                                  Retrieved_context_1  \
0   .3.7.Secure societies - Protecting freedom and...   
1   . The grant agreement 814881 had a total cost ...   
2   . The grant agreement 709443 was framed within...   
3   Information about the organisations that parti...   
4   The grant agreement 654248

## EVALUACIÓN INGLÉS

In [34]:
# Correctness evaluation with the English judge prompt: one LLM call per selected row.
# Reference answers are paired with rows by position, so verify up front that there are
# enough of them instead of failing with an IndexError after several LLM calls.
if len(answers_correct) < len(df_seleccionado):
    raise ValueError(
        f"answers_correct has {len(answers_correct)} entries but "
        f"df_seleccionado has {len(df_seleccionado)} rows"
    )

# Accepts "Score: 4" as well as the variants "**Score:** 4", "**Score: 4**", "Score = 4"
# and "Score: [4]" (the judge sometimes echoes the markdown/brackets of the format spec).
score_pattern = re.compile(r"Score\s*\**\s*[:=]\s*\**\s*\[?\s*(\d+)")

resultados = []

for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])

    prompt = prompt_evaluation_response.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated
    )

    response = llama_llm.invoke(prompt)

    # None marks a reply in which no score could be located.
    match = score_pattern.search(response)
    puntuacion = int(match.group(1)) if match else None

    resultados.append({
        'Consulta': question,
        'Respuesta_correcta': answer_correct,
        'Respuesta_generada': answer_generated,
        'Justificación_LLM': response,
        'Puntuación': puntuacion
    })

df_resultados = pd.DataFrame(resultados)

NameError: name 'prompt_evaluation_response' is not defined

In [10]:
# Plain-text dump of the per-query judge results collected in df_resultados.
print(df_resultados)

                                             Consulta  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
9   Provide the objective of 1 project related to ...   
10  Provide the objective of 3 different projects ...   

                                   Respuesta_correcta  \
0   The project’s objective is to combat violent e...   
1   The total cost of the project with the acronym...   
2   Transforming Research through Innovative Pract...   
3   the role of coordinator in the grant agreement...   
4   The grant agreement 824330

In [11]:
def extract_score(response_text, keyword):
    """Return the integer score that the judge reply assigns to ``keyword``.

    Matching is case-insensitive and accepts an optional ``Score_`` / ``Score `` prefix, ``:``
    or ``=`` as separator, markdown emphasis around the separator (``**Score_x:** 4``) and a
    bracketed value (``Score_x: [4]``). A range such as ``[1–5]`` — i.e. the format
    specification echoed back verbatim — is deliberately not accepted as a score.

    Args:
        response_text: Raw text returned by the judge LLM; ``None`` is treated as empty text.
        keyword: Metric name to look for, e.g. ``"faithfulness"``.

    Returns:
        The first matching score as an ``int``.

    Raises:
        AttributeError: If no score for ``keyword`` is found (the exception type that the
            evaluation loop catches).
    """
    pattern = (
        rf"(?i)(?:score[_ ]*)?{re.escape(keyword)}"
        r"[\s*]*[:=][\s*]*"          # separator, tolerating whitespace and '*' around it
        r"\[?\s*([0-9]+)(?![0-9])"   # the value itself, optionally bracketed
        r"(?!\s*[-–—]\s*[0-9])"      # ...but not the first number of a range like 1–5
    )
    match = re.search(pattern, response_text or "")
    if match:
        return int(match.group(1))
    raise AttributeError(f"Score for '{keyword}' not found in text.")

In [12]:
# Context-aware evaluation with the English judge prompt: faithfulness, coverage and fluency
# are scored for every selected row and stored in `evaluaciones` under "query_<position>".
if len(answers_correct) < len(df_seleccionado):
    # Answers are paired with rows by position; fail before spending any LLM call.
    raise ValueError(
        f"answers_correct has {len(answers_correct)} entries but "
        f"df_seleccionado has {len(df_seleccionado)} rows"
    )

# Keyword searched in the judge reply -> key stored in `evaluaciones`.
metric_keywords = {
    "faithfulness": "score_faithfulness",
    "coverage": "score_coverage",
    "fluency": "score_fluency",
}

evaluaciones = {}
for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])

    # Gather the non-empty retrieved chunks from the row's own labels (not from the global
    # `df`, which is a different frame than the one being iterated).
    # An earlier variant of this filter used the 'Fragment_' prefix — presumably for result
    # sheets produced with a different column naming; adjust the prefix for those.
    fragmentos = [
        str(row[col])
        for col in row.index
        if isinstance(col, str) and col.startswith('Retrieved_context_') and pd.notna(row[col])
    ]
    context = "\n".join(fragmentos)

    prompt = prompt_evaluation_context.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated,
        context=context
    )

    response = llama_llm.invoke(prompt)

    print(f"\n--- Evaluación extendida para Query {idx} ---")
    print(f"Pregunta: {question}")
    print(f"Respuesta correcta: {answer_correct}")
    print(f"Respuesta generada: {answer_generated}")
    print(f"Evaluación del LLM:\n{response}")
    print("---\n")

    # Parse each metric independently so one missing label does not discard the others.
    # Both extract_score variants in this notebook are supported: the one that raises
    # AttributeError and the one that returns None.
    scores = {}
    for keyword, score_key in metric_keywords.items():
        try:
            scores[score_key] = extract_score(response, keyword)
        except AttributeError:
            scores[score_key] = None

    missing = [score_key for score_key, value in scores.items() if value is None]
    if missing:
        print(f"Error: No se encontraron scores en la respuesta para Query {idx}: {', '.join(missing)}")

    score_faithfulness = scores["score_faithfulness"]
    score_coverage = scores["score_coverage"]
    score_fluency = scores["score_fluency"]

    evaluaciones[f"query_{idx}"] = dict(scores)

    print("Score faithfulness:", score_faithfulness)
    print("Score coverage:", score_coverage)
    print("Score fluency:", score_fluency)
    print("\n")


--- Evaluación extendida para Query 0 ---
Pregunta: What is the objective of the project with grant agreement 740934?
Respuesta correcta: The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.
Respuesta generada: The objective of the project with grant agreement 740934, also known as TRIVALENT, is to develop a comprehensive approach to violent radicalization in the EU, combining repressive measures with preventive measures, in collaboration with actors of civil society and communities of reference, while respecting fundamental rights, promoting integration, cultural dialogue, and fighting discrimination.
Evaluación del LLM:
Justification: The generated answer is mostly faithful to the retrieved context, but it introduces some minor variations in wording and phrasing.

Score_faithfulne


--- Evaluación extendida para Query 10 ---
Pregunta: Provide the objective of 3 different projects related to corrosion.
Respuesta correcta: These are the 12 projects whose objective is related to maritime structures. The correct answer must coincide of these responses:
1. Elastopoli (AquaComp)
Objetivo:
Replicar comercialmente en el sector automotriz un nanocompuesto de nanocelulosa (AquaComp), validado previamente en instrumentos musicales, mediante la ampliación de su producción a escala industrial, con el fin de sustituir compuestos poliméricos no renovables por materiales más ligeros, sostenibles y con mejores propiedades mecánicas, acústicas y hápticas.

2. SPNano (Nanodispersión con proteína SP1)
Objetivo:
Implementar la proteína SP1 en procesos industriales para optimizar la dispersión de nanopartículas (como grafeno o nanotubos de carbono) en materiales compuestos, logrando nanocompuestos con un 50% más de resistencia y propiedades mejoradas, superando los problemas de aglome

In [13]:
# Attach the three context-evaluation scores to the results table, row by row position.
# Positions without an entry in `evaluaciones` (or without a given score) become None.
query_evals = [evaluaciones.get(f"query_{pos}", {}) for pos in range(len(df_resultados))]

scores_faithfulness = [entry.get("score_faithfulness") for entry in query_evals]
scores_coverage = [entry.get("score_coverage") for entry in query_evals]
scores_fluency = [entry.get("score_fluency") for entry in query_evals]

df_resultados['Puntuación_faithfulness'] = scores_faithfulness
df_resultados['Puntuación_coverage'] = scores_coverage
df_resultados['Puntuación_fluency'] = scores_fluency
print(df_resultados)

                                             Consulta  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
9   Provide the objective of 1 project related to ...   
10  Provide the objective of 3 different projects ...   

                                   Respuesta_correcta  \
0   The project’s objective is to combat violent e...   
1   The total cost of the project with the acronym...   
2   Transforming Research through Innovative Pract...   
3   the role of coordinator in the grant agreement...   
4   The grant agreement 824330

In [14]:
# Export the results table to Excel (without the index column).
# NOTE(review): the file name suggests the 1–3-scale English run of experiment 9 — confirm
# that the matching prompt cell was the one executed before this export.
df_resultados.to_excel('exps 6-9/evaluacion_1_3_ingles_coherencia_9.xlsx', index=False)

## EVALUACIÓN ESPAÑOL

In [35]:
# Correctness evaluation with the Spanish judge prompt: one LLM call per selected row.
# The prompt used here has no {context} placeholder, so no retrieved context is assembled.
if len(answers_correct) < len(df_seleccionado):
    # Answers are paired with rows by position; fail before spending any LLM call.
    raise ValueError(
        f"answers_correct has {len(answers_correct)} entries but "
        f"df_seleccionado has {len(df_seleccionado)} rows"
    )

# Accepts "Puntuación: 4" with optional markdown emphasis ("**Puntuación: 4**",
# "**Puntuación:** 4"), "=" as separator, a bracketed value ("Puntuación: [4]") and the
# unaccented spelling "Puntuacion".
score_pattern = re.compile(
    r"\**\s*Puntuaci[oó]n\s*\**\s*(?:[:=])\s*\**\s*\[?\s*(\d+)\s*\**",
    re.IGNORECASE
)

resultados_evaluacion = []

for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    answer_correct = str(answers_correct[idx])

    prompt = prompt_evaluacion_respuesta.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated
    )

    response = llama_llm.invoke(prompt)
    print(f"\n-----\n {response} \n-----\n")

    # None marks a reply in which no score could be located.
    match = score_pattern.search(response)
    puntuacion = int(match.group(1)) if match else None

    resultados_evaluacion.append({
        "Consulta": question,
        "Respuesta_generada": answer_generated,
        "Respuesta_correcta": answer_correct,
        "Evaluación_LLM": response,
        "Puntuación": puntuacion
    })


df_eval_global = pd.DataFrame(resultados_evaluacion)


print(df_eval_global.head())


-----
 **Justificación**

La respuesta generada transmite un significado similar al de la respuesta correcta, pero con algunas diferencias en el lenguaje y la estructura. La respuesta correcta se centra específicamente en combatir la violencia extrema analizando sus causas raíz, desarrollando medidas preventivas y represivas, y contrarrestando las narrativas extremistas a través de la colaboración con la sociedad civil y las LEAs, mientras se respetan los derechos fundamentales. La respuesta generada utiliza un lenguaje más amplio y abarca conceptos adicionales como la integración, el diálogo cultural y la lucha contra la discriminación.

**Puntuación: 4**

La respuesta generada es mayormente correcta, ya que conserva la mayor parte del significado previsto con solo omisiones o inexactitudes menores. 
-----


-----
 **Justificación**

La respuesta generada no transmite el mismo significado clave que la respuesta correcta, ya que no proporciona información sobre el costo total del proy

In [36]:
def extract_score(text, keyword):
    """Extract the integer score that follows ``keyword`` in an evaluation text.

    Markdown bold markers (``**``) are removed and matching is
    case-insensitive. The keyword may be preceded by an optional
    ``puntuación`` prefix (followed by spaces or underscores) and must be
    followed by ``:`` or ``=`` and then the digits, e.g. ``**Fidelidad**: 4``
    or ``Puntuación_cobertura = 3``.

    Args:
        text: Evaluation text to search (typically the raw LLM reply).
        keyword: Criterion label to look for, e.g. ``"fidelidad"``. It is
            matched literally, so regex metacharacters in it are safe.

    Returns:
        The score as an ``int``, or ``None`` when ``text`` is not a string or
        no score for ``keyword`` is present.
    """
    # A missing reply (None, NaN, ...) simply has no score; do not crash.
    if not isinstance(text, str):
        return None

    cleaned_text = text.replace("**", "").lower()

    # re.escape keeps characters such as "(", "+" or "." in the keyword from
    # being interpreted as regex syntax (which could also shift group indexes).
    escaped_keyword = re.escape(keyword.lower())
    pattern = rf"(?:puntuación[_ ]*)?{escaped_keyword}\s*[:=]\s*([0-9]+)"

    match = re.search(pattern, cleaned_text)
    return int(match.group(1)) if match else None

In [37]:
# Extended evaluation: for every selected query, ask the LLM judge to assess
# the generated answer against the reference answer and the retrieved context,
# then parse the per-criterion scores out of its reply.
evaluaciones = {}
for idx, (_, row) in enumerate(df_seleccionado.iterrows()):
    question = str(row['Query'])
    answer_generated = str(row['Answer'])
    # Reference answers are looked up by position, so answers_correct must be
    # in the same order as the rows of df_seleccionado.
    answer_correct = str(answers_correct[idx])

    # Gather the retrieved context from the row's own columns. Iterating
    # row.index (instead of another frame's columns) guarantees that every
    # column looked up actually exists in this row.
    fragmentos = [
        str(row[col])
        for col in row.index
        if isinstance(col, str)
        and col.startswith('Retrieved_context_')
        and pd.notna(row[col])
    ]
    context = "\n".join(fragmentos)

    prompt = prompt_evaluacion_contexto.format(
        question=question,
        answer_correct=answer_correct,
        answer_generated=answer_generated,
        context=context
    )

    response = llama_llm.invoke(prompt)

    print(f"\n--- Evaluación extendida para Query {idx} ---")
    print(f"Pregunta: {question}")
    print(f"Respuesta correcta: {answer_correct}")
    print(f"Respuesta generada: {answer_generated}")
    print(f"Evaluación del LLM:\n{response}")
    print("---\n")

    # extract_score returns None (it does not raise) when a criterion is not
    # found, so missing scores have to be detected explicitly.
    score_faithfulness = extract_score(response, "fidelidad")
    score_coverage = extract_score(response, "cobertura")
    score_fluency = extract_score(response, "fluidez")
    if None in (score_faithfulness, score_coverage, score_fluency):
        print(f"Error: No se encontraron scores en la respuesta para Query {idx}")

    evaluaciones[f"query_{idx}"] = {
        "score_faithfulness": score_faithfulness,
        "score_coverage": score_coverage,
        "score_fluency": score_fluency
    }

    print("Score faithfulness:", score_faithfulness)
    print("Score coverage:", score_coverage)
    print("Score fluency:", score_fluency)
    print("\n")



--- Evaluación extendida para Query 0 ---
Pregunta: What is the objective of the project with grant agreement 740934?
Respuesta correcta: The project’s objective is to combat violent extremism by analyzing its root causes, developing preventive and repressive measures, and countering extremist narratives through collaboration with civil society and LEAs, all while upholding fundamental rights.
Respuesta generada: The objective of the project with grant agreement 740934, also known as TRIVALENT, is to develop a comprehensive approach to violent radicalization in the EU, combining repressive measures with preventive measures, in collaboration with actors of civil society and communities of reference, while respecting fundamental rights, promoting integration, cultural dialogue, and fighting discrimination.
Evaluación del LLM:
**Justificación**

La respuesta generada es generalmente precisa y completa, pero presenta algunos errores de redacción y falta de claridad en algunas partes. La r


--- Evaluación extendida para Query 4 ---
Pregunta: What topic does the project with the acronym INTERRFACE belong to?
Respuesta correcta: The grant agreement 824330 was framed within the topic LC-SC3-ES-5-2018-2020TSO – DSO – Consumer: Large-scale demonstrations of innovative grid services through demand response, storage and small-scale (RES) generation. The grant agreement 824330 was framed within the master call H2020-LC-SC3-2018-2019-2020. The grant agreement 824330 was framed within the subcall H2020-LC-SC3-2018-ES-SCC.
Respuesta generada: I don't know the topic of the project with the acronym INTERRFACE as it is not mentioned in the provided context.
Evaluación del LLM:
**Evaluación de la respuesta**

La respuesta generada es "I don't know the topic of the project with the acronym INTERRFACE as it is not mentioned in the provided context."

**Justificación**

La respuesta no proporciona ninguna información relevante sobre el tema del proyecto con el acrónimo INTERRFACE. Aunque 


--- Evaluación extendida para Query 9 ---
Pregunta: Provide the objective of 1 project related to digital twin.
Respuesta correcta: The objective described corresponds to the ILIAD project, which aims at establishing an interoperable, data-intensive, and cost-effective Digital Twin of the Ocean (DTO). The project integrates Earth observation data, immersive technologies, and semantic frameworks to support sustainable ocean management and contributes to the European Green Deal and the UN Ocean Decade objectives. OR The objective described corresponds to the TwinECS project, which focuses on developing an efficient and accurate model for simulating an e-ECS system under the Dymola/Modelica framework. It involves the creation of physical and surrogate models for thermo-fluid and electrical components but does not explicitly refer to the concept of digital twin.
Respuesta generada: The objective of the VET-TWIN project related to digital twin is to increase the potential and research capa

In [38]:
# Line up the extended-evaluation scores with the rows of df_eval_global.
# Queries without an entry in `evaluaciones` (or without a given score)
# contribute None to the corresponding column.
resultados_por_fila = [
    evaluaciones.get(f"query_{posicion}", {})
    for posicion in range(len(df_eval_global))
]

scores_faithfulness = [res.get("score_faithfulness") for res in resultados_por_fila]
scores_coverage = [res.get("score_coverage") for res in resultados_por_fila]
scores_fluency = [res.get("score_fluency") for res in resultados_por_fila]

# Attach one column per criterion to the global evaluation table.
df_eval_global['Puntuación_faithfulness'] = scores_faithfulness
df_eval_global['Puntuación_coverage'] = scores_coverage
df_eval_global['Puntuación_fluency'] = scores_fluency

In [39]:
# Print the whole evaluation table, including the Puntuación_faithfulness,
# Puntuación_coverage and Puntuación_fluency columns.
print(df_eval_global)

                                             Consulta  \
0   What is the objective of the project with gran...   
1   What is the total cost of the project with the...   
2   How much funding was allocated for the project...   
3   Which organisation played the role of coordina...   
4   What topic does the project with the acronym I...   
5   What legal basis was the project titled Europe...   
6   What type of proposal was the grant agreement ...   
7   To which master call was the project with the ...   
8   To which sub call was the project titled Elect...   
9   Provide the objective of 1 project related to ...   
10  Provide the objective of 3 different projects ...   

                                   Respuesta_generada  \
0   The objective of the project with grant agreem...   
1   The total cost of the project with the acronym...   
2   The project titled "Transforming Research thro...   
3   I don't know the answer as there is no informa...   
4   I don't know the topic of 

In [40]:
# Persist the evaluation table as an Excel workbook (without the index column).
output_path = 'exps 6-9/evaluacion_español_9.xlsx'
# to_excel does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_eval_global.to_excel(output_path, index=False)