# Context

In `make_synthetic_questions.ipynb`, we generated synthetic questions to bootstrap evaluation of the retrieval system in our hardware store's Q&A system.

This notebook shows the first step in calculating precision and recall with different retrieval parameters. We will run more advanced experiments in future notebooks after we have these baseline scores.

## Data

Here is a brief review of the data.

In [8]:
import json
import lancedb
import pandas as pd
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor




In [11]:
import cohere
from diskcache import Cache
import lancedb
import os
from typing import List, Dict

from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor


cohere_api_key = os.environ["COHERE_API_KEY"]


class EvalQuestion(BaseModel):
    question: str
    answer: str
    chunk_id: str
    question_with_context: str


def score(hits):
    n_retrieval_requests = len(hits)
    total_retrievals = sum(len(l) for l in hits)
    true_positives = sum(sum(sublist) for sublist in hits)
    precision = true_positives / total_retrievals if total_retrievals > 0 else 0
    recall = true_positives / n_retrieval_requests if n_retrieval_requests > 0 else 0
    return {"precision": precision, "recall": recall}


def run_reranked_request(
    q: EvalQuestion,
    reviews_table: lancedb.table.LanceTable,
    max_n_return_vals: int,
    n_to_rerank: int = 40,
    model: str = "rerank-english-v3.0",
) -> List[bool]:
    cache = Cache("./cohere_cache")
    cache_key = f"{q.question_with_context}_{max_n_return_vals}_{model}".replace(
        "?", ""
    )

    cached_result = cache.get(cache_key)
    if cached_result is not None:
        return cached_result

    initial_results = (
        reviews_table.search(q.question_with_context)
        .select(["id", "review"])
        .limit(n_to_rerank)
        .to_list()
    )

    texts = [r["review"] for r in initial_results]

    # Rerank using Cohere
    co = cohere.Client(cohere_api_key)
    reranked = co.rerank(
        query=q.question_with_context,
        documents=texts,
        top_n=max_n_return_vals,
        model=model,
    )

    # Map reranked results back to original IDs
    reranked_ids = [initial_results[r.index]["id"] for r in reranked.results]
    result = [str(q.chunk_id) == str(r) for r in reranked_ids]
    cache.set(cache_key, result)
    return result


def score_reranked_search(
    eval_questions: List[EvalQuestion],
    reviews_table: lancedb.table.LanceTable,
    k_values: List[int],
    n_to_rerank: int = 40,
    model="rerank-english-v3.0",
) -> Dict[int, Dict[str, float]]:
    max_k = max(k_values)
    with ThreadPoolExecutor() as executor:
        all_hits = list(
            executor.map(
                lambda q: run_reranked_request(
                    q, reviews_table, max_k, n_to_rerank, model
                ),
                eval_questions,
            )
        )

    results = {}
    for k in k_values:
        hits = [h[:k] for h in all_hits]
        results[k] = score(hits)

    return results


KeyError: 'COHERE_API_KEY'

In [None]:
pd.set_option("display.max_colwidth", 160)

db = lancedb.connect("./lancedb")
reviews_table = db.open_table("reviews")
reviews_table.to_pandas().head()

In [2]:
with open("synthetic_eval_dataset.json", "r") as f:
    synthetic_questions = json.load(f)
synthetic_questions[:5]
eval_questions = [EvalQuestion(**question) for question in synthetic_questions]

## Set Up Evaluation

Load the evaluation questions into a structured format.

Build a simple search function

In [2]:
eval_questions[0]

NameError: name 'eval_questions' is not defined

In [4]:
def run_simple_request(q: EvalQuestion, n_return_vals=5):
    results = (
        reviews_table.search(q.question_with_context).select(["id"]).limit(n_return_vals).to_list()
    )
    return [str(q.chunk_id) == str(r["id"]) for r in results]

Now do the benchmarking. For simplicity, we just compare retrieval sizes with a simple semantic search in this cell.

In [5]:
def score_simple_search(n_to_retrieve: List[int]) -> Dict[str, float]:
    # parallelize to speed this up 5-10X
    with ThreadPoolExecutor() as executor:
        hits = list(
            executor.map(lambda q: run_simple_request(q, n_to_retrieve), eval_questions)
        )
    return score(hits)

k_to_retrieve = [5, 10]
scores = pd.DataFrame([score_simple_search(n) for n in k_to_retrieve])
scores["n_retrieved"] = k_to_retrieve
scores

Unnamed: 0,precision,recall,n_retrieved
0,0.116889,0.584444,5
1,0.095333,0.953333,10


If you have Cohere set up, you can see uf a reranker improves results (we'll talk more about rerankers in the coming weeks).

In [6]:
k_to_retrieve = [5, 10]
reranked_scores = score_reranked_search(eval_questions, reviews_table, k_to_retrieve)
reranked_scores_df = pd.DataFrame([
    {"precision": scores["precision"], "recall": scores["recall"], "n_retrieved": k}
    for k, scores in reranked_scores.items()
])
print(reranked_scores_df)

   precision    recall  n_retrieved
0   0.134000  0.670000            5
1   0.096667  0.966667           10
