In [None]:
from talk_rag_scheduler.indexing_pipeline import (
    remote_files_generator,
)

from talk_rag_scheduler.bench_retrieval import (
    bench_bm25_retrieval,
    bench_vector_retrieval,
    pretty_confusion_info,
    pretty_scores_info,
)

## Init Benchmark

In [None]:
# Materialize the generator so the corpus can be counted with len().
source_files = list(remote_files_generator())  # requires files to be already downloaded
n_documents = len(source_files)
# Cutoff passed as `top_k` to the benchmark and scoring helpers in the cells below.
top_k = 10
# Keep this bare expression last: Jupyter only echoes the final expression of a
# cell, so when it preceded `top_k = 10` the document count was never displayed.
n_documents

## Bench BM25 Retrieval

In [None]:
# Run the BM25 retrieval benchmark with the shared `top_k` cutoff. The helper
# returns a pair that is unpacked into `is_true_positive` and `confusion_matrix`.
# NOTE(review): the exact shape/contents of both values are defined in
# talk_rag_scheduler.bench_retrieval and are not visible here — confirm there.
is_true_positive, confusion_matrix = bench_bm25_retrieval(top_k=top_k)
# Show the raw first result, then the two formatted reports built from the
# confusion matrix (the scores report is also given the same `top_k`).
print(is_true_positive)
print(pretty_confusion_info(confusion_matrix))
print(pretty_scores_info(confusion_matrix, top_k=top_k))

## Bench Vector Retrieval

In [None]:
# Run the vector retrieval benchmark with the shared `top_k` cutoff and no
# explicit `mode`, i.e. whatever default bench_vector_retrieval defines.
# This rebinds `is_true_positive` and `confusion_matrix`, replacing any values
# a previously executed benchmark cell left in the notebook namespace.
is_true_positive, confusion_matrix = bench_vector_retrieval(top_k=top_k)
# Show the raw first result, then the two formatted reports built from the
# confusion matrix (the scores report is also given the same `top_k`).
print(is_true_positive)
print(pretty_confusion_info(confusion_matrix))
print(pretty_scores_info(confusion_matrix, top_k=top_k))

## Bench Vector Retrieval with Document Splitting

In [None]:
# Run the vector retrieval benchmark again, this time with
# mode="semantic_split" (the document-splitting variant, per the section
# heading). NOTE(review): how the mode changes indexing/retrieval is decided
# inside bench_vector_retrieval and is not visible here — confirm there.
# This rebinds `is_true_positive` and `confusion_matrix`.
is_true_positive, confusion_matrix = bench_vector_retrieval(
    top_k=top_k, mode="semantic_split"
)
# Show the raw first result, then the two formatted reports built from the
# confusion matrix (the scores report is also given the same `top_k`).
print(is_true_positive)
print(pretty_confusion_info(confusion_matrix))
print(pretty_scores_info(confusion_matrix, top_k=top_k))

## Improvement Options

* The benchmarks above evaluate retrieval with only a single query. To gain confidence in the
  retrieval quality, we should craft a much larger dataset of queries.
* Haystack provides a number of resources on evaluation:
    * [docs/evaluation](https://docs.haystack.deepset.ai/docs/evaluation)
    * [docs/evaluators](https://docs.haystack.deepset.ai/docs/evaluators)
    * [tutorials/guide_evaluation](https://haystack.deepset.ai/tutorials/guide_evaluation)
    * [tutorials/05_evaluation](https://haystack.deepset.ai/tutorials/05_evaluation)
    * [tutorials/35_evaluating_rag_pipelines](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines)
* Consult current literature:
    * [Large Language Models for Software Engineering: A Systematic Literature Review](https://arxiv.org/pdf/2308.10620v6)
    * [Retrieval-Augmented Generation for Large Language Models: A Survey](https://arxiv.org/pdf/2312.10997)