### Original Pipelieline

In [1]:
import llama_index
from llama_index.llms.ollama import Ollama 
from llama_index.core.agent.workflow import ReActAgent, AgentWorkflow, ToolCallResult, AgentStream
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import IngestionPipeline, IngestionCache, DocstoreStrategy
from llama_index.core.node_parser import SentenceSplitter
# from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.core.prompts import PromptTemplate
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.schema import TransformComponent
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import nest_asyncio
import pandas as pd
import chromadb
import asyncio
import os
from dotenv import load_dotenv
import glob
import PyPDF2
import json
import re
from typing import ClassVar
import torch

from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.llama_dataset.legacy.embedding import DEFAULT_QA_GENERATE_PROMPT_TMPL
import random


data_path = "./data/"
project_name = "neus_catala"
embed_model="jina/jina-embeddings-v2-base-es"
# embed_model="qllama/bge-small-en-v1.5:f16"
# embed_model="BAAI/bge-base-en-v1.5"
file_prefix = re.sub('[^A-Za-z0-9]', '_', f"{project_name}_{embed_model}")
default_path = f"{data_path}{file_prefix}"

chroma_file = default_path + '_chroma.db' #68MB
ingest_cache_file = default_path + "_cache.json"

from llama_index.core import Settings
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.embed_model = OllamaEmbedding(model_name=embed_model)
# Settings.llm = Ollama(model="qwen2.5:7b-instruct", base_url="http://localhost:11434") # Funciona de manera satisfactoria
Settings.llm = Ollama(model="llama3.2:latest", request_timeout=360.0)


db = chromadb.PersistentClient(path=chroma_file)
chroma_collection = db.get_or_create_collection(name=project_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
ingest_cache = IngestionCache()
if os.path.isfile(ingest_cache_file):
    ingest_cache.from_persist_path(ingest_cache_file)


class TextCleaner(TransformComponent):

    rx_nl: ClassVar  = re.compile(r'(\n\s*\n)')
    rx_hyphen: ClassVar  = re.compile(r'(-\n)')
    rx_notdot: ClassVar  = re.compile(r'(?<!\.)\n')
    rx_num_leter: ClassVar  = re.compile(r'(\d)([a-zA-Z])')

    def __call__(self, nodes, **kwargs):
        # nodes = list(map(lambda node: TextCleaner()(node), nodes))
        for node in nodes:
            text = self.rx_nl.sub('\n', node.text)
            text = self.rx_hyphen.sub('', text)
            text = self.rx_notdot.sub(' ', text)
            text = self.rx_num_leter.sub(r'\1 \2', text)   
            node.set_content(text)
        return nodes


pipeline = IngestionPipeline(
    transformations=[
        TextCleaner(),
        SentenceSplitter(chunk_size=512),
        Settings.embed_model,
    ],
    vector_store=vector_store,
    cache=ingest_cache,
)



### Digest Documents

In [2]:
input_dir = data_path + "documents"
already_processed_files = list(set( m['file_path'] for m in chroma_collection.get(include=['metadatas'])['metadatas']))

try:
    documents = SimpleDirectoryReader(
        input_dir=input_dir, 
        exclude=already_processed_files
        ).load_data()


    nodes = await pipeline.arun(documents=documents, show_progress=True)
    ingest_cache.persist(ingest_cache_file)
    len(nodes)
except ValueError:
    print('No new Documents')

No new Documents


## Evaluation

### Generation of the pairs Question / answer

In [3]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset
    )
from llama_index.core.llama_dataset.legacy.embedding import DEFAULT_QA_GENERATE_PROMPT_TMPL
import os

import torch
torch.cuda.empty_cache()

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext

index = VectorStoreIndex.from_vector_store(vector_store)

qa_file = default_path + "_qa.json"
regenerate = False

if not os.path.isfile(qa_file) or regenerate:
    

    nodes = [item.node for item in index
        .as_retriever(similarity_top_k=1000000)
        .retrieve("")]

    qa_dataset = generate_question_context_pairs(
        random.sample(nodes, min(len(nodes), 100)), 
        num_questions_per_chunk=1,
        qa_generate_prompt_tmpl=DEFAULT_QA_GENERATE_PROMPT_TMPL+ "\n Please respond directly to the question without any introductory text or formatting.",
    )
    qa_dataset.save_json(qa_file)


### Generation of the responses

In [4]:
from llama_index.core.evaluation import RetrieverEvaluator

# Convert to retriever
retriever = index.as_retriever(similarity_top_k=5)
# Define metrics
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

# Create RetrieverEvaluator
retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)
qa_dataset = EmbeddingQAFinetuneDataset.from_json(qa_file)

eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset, show_progress=True)


100%|██████████| 100/100 [00:01<00:00, 65.51it/s]


In [5]:

import pandas as pd


def display_results(name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    columns = {
        "retrievers": [name],
        **{k: [full_df[k].mean()] for k in metrics},
    }

    metric_df = pd.DataFrame(columns)

    return metric_df

display_results("top-2 eval", eval_results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,top-2 eval,0.48,0.361167,0.096,0.48,0.361167,0.391114


In [9]:
import nest_asyncio
from IPython.display import display, HTML

nest_asyncio.apply()
query_engine = index.as_query_engine()

for (sample_id, sample_query) in random.sample(list(qa_dataset.queries.items()),1):
    sample_expected = qa_dataset.relevant_docs[sample_id]

    eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
    print(eval_result) 
    response = query_engine.query(sample_query)
    display(HTML(f'{response}'))
    
    print('retrieved_ids',f'{eval_result.retrieved_ids}')
    display(HTML(f'{eval_result.retrieved_texts}'))


Query: What was the outcome of Comorera's glosa (comment) on the debate, which settled a part of the discussion?
Metrics: {'hit_rate': 1.0, 'mrr': 0.5, 'precision': 0.2, 'recall': 1.0, 'ap': 0.5, 'ndcg': 0.6309297535714575}



retrieved_ids ['affe51fe-76e3-4b1a-98a8-0d25419d1f50', '9a809744-4f34-41c5-b43e-7871fad239d6', '895e1ab0-38ab-4992-839a-b64029731291', 'db1ee260-0f94-48c5-8c83-696aae49b3e5', 'a7ea7d7e-d726-4b95-af60-1e31f7429e3d']
