### Original Pipeline

In [10]:
import llama_index
from llama_index.llms.ollama import Ollama 
from llama_index.core.agent.workflow import ReActAgent, AgentWorkflow, ToolCallResult, AgentStream
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import IngestionPipeline, IngestionCache, DocstoreStrategy
from llama_index.core.node_parser import SentenceSplitter
# from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.core.prompts import PromptTemplate
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.schema import TransformComponent
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import nest_asyncio
import pandas as pd
import chromadb
import asyncio
import os
from dotenv import load_dotenv
import glob
import PyPDF2
import json
import re
from typing import ClassVar
import torch

from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.llama_dataset.legacy.embedding import DEFAULT_QA_GENERATE_PROMPT_TMPL
import random


# --- Project configuration ---------------------------------------------------
data_path = "./data/"
project_name = "neus_catala"
embed_model="jina/jina-embeddings-v2-base-es"
# embed_model="qllama/bge-small-en-v1.5:f16"
# embed_model="BAAI/bge-base-en-v1.5"

# Every persisted artefact is named after the project AND the embedding model,
# so switching models never mixes vectors produced by different embedders.
file_prefix = re.sub('[^A-Za-z0-9]', '_', f"{project_name}_{embed_model}")
default_path = f"{data_path}{file_prefix}"

chroma_file = default_path + '_chroma.db' #68MB
ingest_cache_file = default_path + "_cache.json"

# --- Models --------------------------------------------------------------------
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.embed_model = OllamaEmbedding(model_name=embed_model)
# Settings.llm = Ollama(model="qwen2.5:7b-instruct", base_url="http://localhost:11434") # works satisfactorily
Settings.llm = Ollama(model="llama3.2:latest", request_timeout=360.0)

# --- Vector store ----------------------------------------------------------------
db = chromadb.PersistentClient(path=chroma_file)
chroma_collection = db.get_or_create_collection(name=project_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# --- Ingestion cache ---------------------------------------------------------------
# `from_persist_path` is a classmethod that RETURNS a new cache; calling it on an
# existing instance and discarding the result (as was done before) left the cache
# empty, so previously computed transformations were never reused.
if os.path.isfile(ingest_cache_file):
    ingest_cache = IngestionCache.from_persist_path(ingest_cache_file)
else:
    ingest_cache = IngestionCache()


class TextCleaner(TransformComponent):
    """Pipeline transformation that normalises line breaks in node text.

    Applied in order to every node's text:
      1. a newline, optional whitespace and another newline collapse to one newline;
      2. a hyphen immediately followed by a newline is removed;
      3. a newline that is not preceded by a period becomes a space;
      4. a space is inserted between a digit and a directly following ASCII letter.
    """

    # Annotated as ClassVar so they are shared class-level constants.
    rx_nl: ClassVar  = re.compile(r'(\n\s*\n)')
    rx_hyphen: ClassVar  = re.compile(r'(-\n)')
    rx_notdot: ClassVar  = re.compile(r'(?<!\.)\n')
    rx_num_leter: ClassVar  = re.compile(r'(\d)([a-zA-Z])')

    def __call__(self, nodes, **kwargs):
        """Clean each node's text in place and return the same ``nodes`` sequence."""
        # Order matters: each substitution works on the output of the previous one.
        substitutions = (
            (self.rx_nl, '\n'),
            (self.rx_hyphen, ''),
            (self.rx_notdot, ' '),
            (self.rx_num_leter, r'\1 \2'),
        )
        for node in nodes:
            cleaned = node.text
            for pattern, replacement in substitutions:
                cleaned = pattern.sub(replacement, cleaned)
            node.set_content(cleaned)
        return nodes


# Transformations run in the listed order: clean the raw text, split it into
# chunks of 512 (SentenceSplitter chunk_size), then embed every chunk.
ingest_steps = [
    TextCleaner(),
    SentenceSplitter(chunk_size=512),
    Settings.embed_model,
]

# Results are written to the Chroma-backed vector store; the cache is handed to
# the pipeline alongside it.
pipeline = IngestionPipeline(
    transformations=ingest_steps,
    vector_store=vector_store,
    cache=ingest_cache,
)



### Ingest Documents

In [30]:
input_dir = data_path + "documents"
already_processed_files = list(set( m['file_path'] for m in chroma_collection.get(include=['metadatas'])['metadatas']))

try:
    documents = SimpleDirectoryReader(
        input_dir=input_dir, 
        exclude=already_processed_files
        ).load_data()


    nodes = await pipeline.arun(documents=documents, show_progress=True)
    ingest_cache.persist(ingest_cache_file)
    len(nodes)
except ValueError:
    print('No new Documents')

No new Documents


## Evaluation

### Generation of question/answer pairs

In [34]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset
    )
from llama_index.core.llama_dataset.legacy.embedding import DEFAULT_QA_GENERATE_PROMPT_TMPL
import os

import torch
torch.cuda.empty_cache()

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext

# Number of chunks used to build the evaluation set, and the seed that makes the
# selection repeatable so retrieval metrics can be compared between runs.
QA_SAMPLE_SIZE = 100
QA_SAMPLE_SEED = 42

index = VectorStoreIndex.from_vector_store(vector_store)

qa_file = default_path + "_qa.json"

# Fetch the stored nodes by retrieving with an empty query and a very large top-k.
nodes = [item.node for item in index
    .as_retriever(similarity_top_k=1000000)
    .retrieve("")]

# A dedicated, seeded generator keeps the sample stable without touching the
# global `random` state used elsewhere in the notebook.
qa_sample_nodes = random.Random(QA_SAMPLE_SEED).sample(
    nodes, min(len(nodes), QA_SAMPLE_SIZE)
)

qa_dataset = generate_question_context_pairs(
    qa_sample_nodes,
    num_questions_per_chunk=1,
    qa_generate_prompt_tmpl=DEFAULT_QA_GENERATE_PROMPT_TMPL+ "\n Please respond directly to the question without any introductory text or formatting.",
)
qa_dataset.save_json(qa_file)


100%|██████████| 100/100 [00:46<00:00,  2.15it/s]


### Generation of the responses

In [None]:
from llama_index.core.evaluation import RetrieverEvaluator

# Convert to retriever
retriever = index.as_retriever(similarity_top_k=5)
# Define metrics
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

# Create RetrieverEvaluator
retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)
qa_dataset = EmbeddingQAFinetuneDataset.from_json(qa_file)

eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset, show_progress=False)


In [32]:

import pandas as pd


def display_results(name, eval_results, metric_names=None):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of objects exposing ``metric_vals_dict``
            (a mapping of metric name -> value).
        metric_names: Optional list of metrics to report. Defaults to every
            metric found in ``eval_results``, so the function no longer relies
            on a notebook-level ``metrics`` variable being defined.

    Returns:
        A DataFrame with a ``retrievers`` column followed by one column per
        metric holding its mean over all results. With no results (and no
        explicit ``metric_names``) only the ``retrievers`` column is returned.
    """
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    if metric_names is None:
        metric_names = list(full_df.columns)

    summary = {"retrievers": [name]}
    for metric_name in metric_names:
        summary[metric_name] = [full_df[metric_name].mean()]

    return pd.DataFrame(summary)

# The evaluated retriever was built with similarity_top_k=5, so label the row accordingly.
display_results("top-5 eval", eval_results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top-2 eval,0.45,0.3185,0.09,0.45,0.3185,0.351072


In [45]:
import nest_asyncio

# nest_asyncio is applied before the synchronous query/evaluate calls below.
nest_asyncio.apply()
query_engine = index.as_query_engine()

# Pick one generated question at random and look up the chunk(s) it was built from.
[(sample_id, sample_query)] = random.sample(list(qa_dataset.queries.items()), 1)
sample_expected = qa_dataset.relevant_docs[sample_id]

# Retrieval metrics for this single question.
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result) 

# Answer generated from the retrieved context, followed by that context.
response = query_engine.query(sample_query)
print(response)
print('Context:', eval_result.retrieved_texts)


Query: What was the outcome of the vote on the convocatòria (congression) for a extraordinary congress of the PSUC?
Metrics: {'hit_rate': 1.0, 'mrr': 0.2, 'precision': 0.2, 'recall': 1.0, 'ap': 0.2, 'ndcg': 0.38685280723454163}

The convocatòria (congression) for an extraordinary congress of the PSUC was approved with 33 votes in favor, despite initial rejection by some members.
Context: ['Durante todo el verano de 1976 se sucedieron las  críticas entre el PSUC por un lado, y los tarradellistas Trias Fargas, Heribert Barrera, Josep Pallach y Antón Cañellas, por el otro. Pese al éxito de  la Diada del once de septiembre celebrada en Sant Boi de Llobregat, los  problemas dentro del Consell se agravarían de tal manera que terminaron  con su parálisis en diciembre de 1976. Por otra parte, la apabullante victoria del “Sí” en el referéndum del 12 de diciembre sobre la “Reforma Política” le dio a Suárez el control sustancial de la dinámica de transformación  del régimen en una democracia de t

In [None]:
# Expose the query engine as a tool. The tool name is an identifier the LLM must
# reproduce verbatim when calling it, so it must not contain spaces: ReAct-style
# output parsing and function-calling APIs only accept identifier-like names.
query_engine_tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="neus_catala_engine",
    description="Respon a les preguntes de l'usuari en catala",
)

# Create a RAG agent
query_engine_agent = AgentWorkflow.from_tools_or_functions(
    tools_or_functions=[query_engine_tool],
    system_prompt="You are a helpful assistant that has access to a database containing informatin about books. ",
)

# Used in the comparison loop to score each informed response.
evaluator = FaithfulnessEvaluator()

# Hand-written (question, expected answer) pairs about the ingested documents.
# Commented-out pairs are kept so they can be re-enabled individually.
question_answer_pairs = [
    ("Que havien de passar Les presoneres de Ravensbrück abans d incorporar-se als treballs forçats?", "Quarentena."),
    ("La Neus Catala considera que va tenir una infància com?", "feliç"),
    # ("La mateixa Neus, que presideix l Amical, va fer el discurs de presentació al paranimf de la Universitat de Barcelona, el juny de quin any?", "2006"),
    # ("La Neus va néixer als Guiamets, un poblet de la comarca tarragonina del Priorat, a quin any?", "1915"),
    # ("A principis del segle passat, a Els Guiamets hi arribava gent de quin pais?", "França"),
    # ("En Baltasar Català era un pagès que combinava el treball al camp amb que mes?", "barber i pintor"),
    # ("Quina fundació va fer el treball biogràfic sobre la Neus Catala?", "Fundació Pere Ardiaca"),
    # ("Els diumenges a la tarda es reunia amb els camarades i les seves famílies en quin lloc?", "un petit teatret"),
    # ("En aquell temps, el que més desitjava la Neus era treballar on?", "un bon hospital"),
    # ("Qui era el Francisco Serrano?", "el jove que els repartia la sopa a la presó"),
    # ("Les cendres d algunes d aquestes dones es troben en quin lloc?", "al fons del llac Schwedt"),
]

# One record per pair; the comparison loop adds the model responses to each record.
tests = [
    {"question": question, "Correct response": expected}
    for question, expected in question_answer_pairs
]

# This is needed to run the query engine; applying it once before the loop is
# enough, there is no need to re-apply it on every iteration.
nest_asyncio.apply()

for test in tests:
    question_str = f'Answer to the question allwais Using less than 5 words, in catalan language. The question: {test["question"]}'
    print('\n**Question:',question_str, '(',test['Correct response'],')**')

    # Baseline: the bare LLM, without any retrieved context.
    # Responses are stored as plain strings so the resulting DataFrame holds
    # text rather than library response objects.
    test['default_response'] = str(Settings.llm.complete(question_str))
    print('\tDefault response:',test['default_response'])

    # RAG answer: keep the response object for the evaluator, store only its
    # text in the record.
    informed_response = query_engine.query(question_str)
    test['informed_response'] = str(informed_response)
    print('\tInformed response:',test['informed_response'])

    # Agent-based alternative (disabled):
    # handler = query_engine_agent.run(question_str)
    # async for ev in handler.stream_events():
    #     if isinstance(ev, ToolCallResult):
    #         print("")
    #         print("Called tool: ", ev.tool_name, ev.tool_kwargs, "=>", ev.tool_output)
    #     elif isinstance(ev, AgentStream):  # showing the thought process
    #         print(ev.delta, end="", flush=True)
    # informed_response = await handler
    # print('*Informed agent response:',informed_response)

    # Faithfulness score of the informed response, as returned by the evaluator.
    test['informed_response_passing'] = evaluator.evaluate_response(
        query=test['question'],
        response=informed_response,
    ).score

df = pd.DataFrame(tests)
df




**Question: Answer to the question allwais Using less than 5 words, in catalan language. The question: Que havien de passar Les presoneres de Ravensbrück abans d incorporar-se als treballs forçats? ( Quarentena. )**
	Default response: "Reconèixer el seu passat".
	Informed response: S'hi passaven 3 mesos

**Question: Answer to the question allwais Using less than 5 words, in catalan language. The question: La Neus Catala considera que va tenir una infància com? ( feliç )**
	Default response: "Com una infància normal".
	Informed response: Infància difícil i llunyana


Unnamed: 0,question,Correct response,default_response,informed_response,informed_response_passing
0,Que havien de passar Les presoneres de Ravensb...,Quarentena.,"""Reconèixer el seu passat"".",S'hi passaven 3 mesos,0.0
1,La Neus Catala considera que va tenir una infà...,feliç,"""Com una infància normal"".",Infància difícil i llunyana,0.0
