In [1]:
import os
import sys

# Put the parent and grandparent directories at the front of sys.path so that
# project modules can be imported from this notebook. The loop variable is
# ROOT_FOLDER itself, so after the loop it holds the grandparent path.
for ROOT_FOLDER in (os.path.join(".", ".."), os.path.join(".", "..", "..")):
    if ROOT_FOLDER not in sys.path:
        sys.path.insert(0, ROOT_FOLDER)


from pipeline import IndexerPipeline, RAGPipeline

In [11]:
import numpy as np


def precision_at_k(ground_truth, predicted, k):
    """Return Precision@k: hits among the first k predictions, divided by k.

    Args:
        ground_truth: iterable of relevant item ids.
        predicted: ranked sequence of predicted item ids.
        k: cut-off rank; 0 yields 0.0.

    Returns:
        Number of top-k predictions that are relevant, divided by k (not by
        the number of predictions actually available).
    """
    if k == 0:
        return 0.0
    top_k = predicted[:k]
    relevant_items = set(ground_truth)
    hit_count = len([candidate for candidate in top_k if candidate in relevant_items])
    return hit_count / k


def recall_at_k(ground_truth, predicted, k):
    """Compute Recall@k: fraction of distinct relevant items found in the top k.

    Both sides are compared as sets, so a relevant item that appears several
    times in ``predicted`` (for example a reference cited twice) or in
    ``ground_truth`` is counted once and the result stays within [0, 1].

    Args:
        ground_truth: iterable of relevant item ids (hashable).
        predicted: ranked sequence of predicted item ids.
        k: cut-off rank.

    Returns:
        Recall in [0, 1]; 0.0 when ``ground_truth`` is empty.
    """
    if not ground_truth:
        return 0.0
    relevant = set(ground_truth)
    retrieved = set(predicted[:k])
    return len(relevant & retrieved) / len(relevant)


def ap_at_k(ground_truth, predicted, k):
    """Compute Average Precision@k for a single query.

    Precision is accumulated at every rank where a relevant item appears for
    the first time. Repeated occurrences of an already-credited item are
    ignored, so the score cannot exceed 1.0 when ``predicted`` has duplicates.

    Args:
        ground_truth: iterable of relevant item ids (hashable).
        predicted: ranked sequence of predicted item ids.
        k: cut-off rank.

    Returns:
        Sum of precision values at the hit ranks divided by
        ``min(number of distinct relevant items, k)``; 0.0 when
        ``ground_truth`` is empty or ``k`` is 0.
    """
    if not ground_truth or k == 0:
        return 0.0

    relevant = set(ground_truth)
    credited = set()  # relevant items that already produced a hit
    hits = 0
    sum_precisions = 0.0

    for rank, item in enumerate(predicted[:k], 1):
        if item in relevant and item not in credited:
            credited.add(item)
            hits += 1
            sum_precisions += hits / rank

    return sum_precisions / min(len(relevant), k)


def map_at_k(ground_truth_list, predicted_list, k):
    """Average ``ap_at_k`` over paired queries (Mean Average Precision@k).

    ``ground_truth_list`` and ``predicted_list`` are zipped, so the shorter
    one determines how many queries are scored.
    """
    per_query_ap = []
    for relevant_ids, ranked_ids in zip(ground_truth_list, predicted_list):
        per_query_ap.append(ap_at_k(relevant_ids, ranked_ids, k))
    return np.mean(per_query_ap)


def recall_rate_at_k(ground_truth_list, predicted_list, k):
    """Average ``recall_at_k`` over paired queries (Mean Recall@k).

    Each prediction list is truncated to its first k entries before being
    handed to ``recall_at_k``.
    """
    per_query_recall = []
    for relevant_ids, ranked_ids in zip(ground_truth_list, predicted_list):
        top_k = ranked_ids[:k]
        per_query_recall.append(recall_at_k(relevant_ids, top_k, k))
    return np.mean(per_query_recall)


def dcg_at_k(ground_truth, predicted, k):
    """Compute binary-relevance Discounted Cumulative Gain@k.

    A relevant item contributes ``1 / log2(rank + 1)`` (rank is 1-based) the
    first time it appears in the top k. Later repetitions of the same item
    contribute nothing, so a ranking with duplicated relevant ids cannot
    score above the ideal ranking.

    Args:
        ground_truth: iterable of relevant item ids (hashable).
        predicted: ranked sequence of predicted item ids.
        k: cut-off rank.

    Returns:
        The accumulated discounted gain (0.0 when nothing relevant is found).
    """
    relevant = set(ground_truth)
    credited = set()  # relevant items that already received gain
    dcg = 0.0
    for position, item in enumerate(predicted[:k]):
        if item in relevant and item not in credited:
            credited.add(item)
            # position is 0-based, hence the +2 inside the logarithm.
            dcg += 1.0 / np.log2(position + 2)
    return dcg


def ndcg_at_k(ground_truth, predicted, k):
    """Return DCG@k of ``predicted`` normalised by the ideal DCG@k.

    The ideal value is obtained by scoring ``ground_truth`` against itself
    with ``dcg_at_k``; when it is zero the result is 0.0.
    """
    ideal = dcg_at_k(ground_truth, ground_truth, k)
    return dcg_at_k(ground_truth, predicted, k) / ideal if ideal != 0 else 0.0


def mean_ndcg_at_k(ground_truth_list, predicted_list, k):
    """Average ``ndcg_at_k`` over paired queries (Mean nDCG@k)."""
    per_query_ndcg = []
    for relevant_ids, ranked_ids in zip(ground_truth_list, predicted_list):
        per_query_ndcg.append(ndcg_at_k(relevant_ids, ranked_ids, k))
    return np.mean(per_query_ndcg)


def mrr_at_k(ground_truth_list, predicted_list, k):
    """Return Mean Reciprocal Rank@k over paired queries.

    For each query the reciprocal of the 1-based rank of the first relevant
    item within the top k is recorded (0.0 if none is found); the mean of
    those values is returned.
    """
    reciprocal_ranks = []
    for relevant_ids, ranked_ids in zip(ground_truth_list, predicted_list):
        top_k = ranked_ids[:k]
        relevant = set(relevant_ids)
        first_hit_rank = next(
            (rank for rank, item in enumerate(top_k, 1) if item in relevant),
            None,
        )
        reciprocal_ranks.append(0.0 if first_hit_rank is None else 1.0 / first_hit_rank)
    return np.mean(reciprocal_ranks)


def f1_at_k(ground_truth_list, predicted_list, k):
    """Return the mean F1@k over paired queries.

    Per query, F1 is the harmonic mean of ``precision_at_k`` and
    ``recall_at_k``; it is taken as 0.0 when both are zero.
    """
    per_query_f1 = []
    for relevant_ids, ranked_ids in zip(ground_truth_list, predicted_list):
        precision = precision_at_k(relevant_ids, ranked_ids, k)
        recall = recall_at_k(relevant_ids, ranked_ids, k)
        denominator = precision + recall
        score = 2 * (precision * recall) / denominator if denominator != 0 else 0.0
        per_query_f1.append(score)
    return np.mean(per_query_f1)


def evaluate_models(ground_truth_list, llm_list, indexer_list, k_values=[1, 3, 5, 10]):
    """Score the LLM and the indexer rankings with every metric at every k.

    Returns:
        A dict keyed by ``"<metric>@<k>"`` (MAP, MAR, nDCG, MRR, F1, in that
        order for each k) whose values are ``{"LLM": ..., "Indexer": ...}``.
    """
    metric_table = (
        ("MAP", map_at_k),
        ("MAR", recall_rate_at_k),
        ("nDCG", mean_ndcg_at_k),
        ("MRR", mrr_at_k),
        ("F1", f1_at_k),
    )
    systems = (("LLM", llm_list), ("Indexer", indexer_list))

    results = {}
    for k in k_values:
        for label, metric in metric_table:
            results[f"{label}@{k}"] = {
                system: metric(ground_truth_list, predictions, k)
                for system, predictions in systems
            }
    return results

In [12]:
import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer data for nltk.word_tokenize (used by evaluate_sample for BLEU).
# NOTE(review): newer NLTK releases load the `punkt_tab` resource instead of
# `punkt`; both are requested so the cell works across versions. On an older
# NLTK the second call is expected to just report an unknown package — confirm
# against the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")

# Module-level singletons shared by the helper functions below.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # text encoder
rouge = Rouge()  # ROUGE scorer
smooth = SmoothingFunction().method1  # smoothing passed to sentence_bleu


def get_embedding(text):
    """Encode one text with the module-level ``model`` and return its vector."""
    encoded_batch = model.encode([text])
    return encoded_batch[0]


def cosine_sim(a, b):
    """Return the cosine similarity between the two vectors ``a`` and ``b``."""
    # cosine_similarity works on 2-D inputs and returns a matrix; wrap each
    # vector in a list and pick the single cell of the 1x1 result.
    similarity_matrix = cosine_similarity([a], [b])
    return similarity_matrix[0][0]


def ndcg_score(contexts, answer, k=None):
    """Embedding-based nDCG of the retrieved contexts with respect to the answer.

    contexts: list of contexts, in the order the RAG retrieved them
    answer: the answer generated by the model
    k: evaluate only the top-k contexts (optional; defaults to all of them)

    Returns the DCG/IDCG ratio rounded to 3 decimals, or 0.0 when the ideal
    DCG is not positive.
    """
    cutoff = len(contexts) if (k is None or k > len(contexts)) else k

    answer_vec = get_embedding(answer)

    # Relevance of a context = cosine similarity between it and the answer.
    relevances = []
    for context in contexts:
        context_vec = get_embedding(context)
        relevances.append(cosine_similarity([answer_vec], [context_vec])[0][0])

    def _discounted_sum(values):
        # Sum of value / log2(rank + 1) over the first `cutoff` entries.
        total = 0
        for position, rel in enumerate(values[:cutoff]):
            total += rel / np.log2(position + 2)
        return total

    # DCG of the ranking as retrieved.
    dcg = _discounted_sum(relevances)

    # Ideal ordering: all relevances sorted in descending order.
    idcg = _discounted_sum(sorted(relevances, reverse=True))

    if idcg > 0:
        return round(dcg / idcg, 3)
    return 0.0


def evaluate_sample(sample):
    """Score one RAG sample with embedding, n-gram and ranking metrics.

    Args:
        sample: dict with the keys ``question``, ``generated_answer``,
            ``contexts`` (list of str), ``ground_truth`` (str),
            ``ground_truth_indices``, ``responce_indices`` and
            ``tree_indices`` (lists of document ids).

    Returns:
        dict with ``faithfulness``, ``answer_relevancy``, ``context_recall``,
        ``context_precision``, ``bleu``, ``rouge1``, ``rougeL`` as plain
        Python floats rounded to 3 decimals, plus ``metrics`` holding the
        output of ``evaluate_models`` for this single sample.

    Raises:
        KeyError: if a required key is missing from ``sample``.
    """

    def _rounded(value):
        # Similarities come back as NumPy float32 scalars; rounding those keeps
        # the float32 type and prints values such as 0.515999972820282.
        # Converting to a Python float first gives the intended 0.516.
        return round(float(value), 3)

    question = sample["question"]
    answer = sample["generated_answer"]
    contexts = sample["contexts"]
    ground_truth = sample["ground_truth"]

    emb_question = get_embedding(question)
    emb_answer = get_embedding(answer)
    emb_ground = get_embedding(ground_truth)
    emb_contexts = [get_embedding(c) for c in contexts]
    emb_context_all = get_embedding(" ".join(contexts))

    # 1. faithfulness: answer vs all contexts joined together
    faithfulness_score = cosine_sim(emb_answer, emb_context_all)

    # 2. answer relevancy: answer vs question
    answer_relevancy_score = cosine_sim(emb_answer, emb_question)

    # 3. context recall: ground truth vs all contexts joined together
    context_recall_score = cosine_sim(emb_ground, emb_context_all)

    # 4. context precision: answer vs each context (max similarity)
    precisions = [cosine_sim(emb_answer, ctx_emb) for ctx_emb in emb_contexts]
    context_precision_score = np.max(precisions) if precisions else 0.0

    # 5. BLEU: answer vs ground truth
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(ground_truth.lower())],
        nltk.word_tokenize(answer.lower()),
        smoothing_function=smooth,
        weights=(0.5, 0.5),  # BLEU-2
    )

    # 6. ROUGE: answer (hypothesis) vs ground truth (reference)
    rouge_scores = rouge.get_scores(answer, ground_truth)[0]
    rouge1 = rouge_scores["rouge-1"]["f"]
    rougeL = rouge_scores["rouge-l"]["f"]

    return {
        "faithfulness": _rounded(faithfulness_score),
        "answer_relevancy": _rounded(answer_relevancy_score),
        "context_recall": _rounded(context_recall_score),
        "context_precision": _rounded(context_precision_score),
        "bleu": _rounded(bleu_score),
        "rouge1": _rounded(rouge1),
        "rougeL": _rounded(rougeL),
        # Ranking metrics: LLM-cited documents vs indexer-retrieved documents.
        "metrics": evaluate_models(
            [sample["ground_truth_indices"]],
            [sample["responce_indices"]],
            [sample["tree_indices"]],
        ),
    }


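# 🔹 Example data
# NOTE: evaluate_sample also reads the keys "ground_truth_indices",
# "responce_indices" and "tree_indices", which these example samples lack.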
# samples = [
#     {
#         "question": "Какая столица Франции?",
#         "generated_answer": "Париж является столицей Франции.",
#         "contexts": [
#             "Париж — столица Франции.",
#             "Берлин — столица Германии."
#         ],
#         "ground_truth": "Париж — это столица Франции."
#     },
#     {
#         "question": "Какой язык в Бразилии?",
#         "generated_answer": "В Бразилии говорят на португальском.",
#         "contexts": [
#             "Португальский язык — официальный язык Бразилии.",
#             "Испанский распространён в Латинской Америке."
#         ],
#         "ground_truth": "Официальный язык Бразилии — португальский."
#     }
# ]

# # 🔍 Evaluation
# for i, s in enumerate(samples):
#     print(f"\n🔎 Sample {i+1}")
#     scores = evaluate_sample(s)
#     for k, v in scores.items():
#         print(f"{k:>18}: {v}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kiaver\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1 How to calculate the square root of a number?

In [8]:
from src.utils import load

# Folder holding one description file per documented function.
DATA_PATH = "../../data/scrapped/class_data_function__1_1"

# Document name -> 1-based document id.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the ids reproducible across runs and machines (ids may therefore differ
# from those produced by earlier, unsorted runs). splitext drops the extension
# without assuming it is exactly four characters long.
NAME_TO_IDX = {
    os.path.splitext(filename)[0]: document_id
    for document_id, filename in enumerate(sorted(os.listdir(DATA_PATH)), start=1)
}
# Reverse lookup: document id -> document name.
IDX_TO_NAME = {document_id: name for name, document_id in NAME_TO_IDX.items()}

In [9]:
import re


def extract_from_response(text):
    """Extract the cited reference numbers from a model response.

    Looks at the text that follows the first ``"References:"`` marker (up to
    the next marker, if any) and collects every integer found inside square
    brackets, e.g. ``"References: [1, 2], [5]"`` -> ``[1, 2, 5]``.

    Args:
        text: the model response.

    Returns:
        List of ints in order of appearance; an empty list when ``text`` is
        not a string, has no ``"References:"`` marker, or cites nothing.
    """
    if not isinstance(text, str):
        return []

    sections = text.split("References:")
    if len(sections) < 2:
        # No marker present: nothing was cited.
        return []

    # Each match is the inside of one bracket group, e.g. "1, 2".
    bracket_groups = re.findall(r"\[(\d+(?:,\s*\d+)*)\]", sections[1])
    return [int(number) for group in bracket_groups for number in group.split(",")]

In [None]:
from tqdm import tqdm

# Instantiate the two pipelines imported from `pipeline`; questions_to_samples
# below uses both objects as module-level globals.
rag = RAGPipeline()
indexer = IndexerPipeline()


def questions_to_samples(
    questions, grount_truthes, model_name="qwen-2-72b", indexer_name="llm_tree_idx"
):
    """Run the indexer and the RAG pipeline per question and build eval samples.

    Args:
        questions: list of question strings.
        grount_truthes: list (parallel to ``questions``) of lists of
            ground-truth document names; every name must be in NAME_TO_IDX.
        model_name: model identifier passed to ``rag.request_full``.
        indexer_name: indexer identifier passed to both ``indexer.index`` and
            ``rag.request_full``.

    Returns:
        List of dicts with the keys ``question``, ``generated_answer``,
        ``contexts``, ``ground_truth``, ``ground_truth_indices``,
        ``responce_indices`` and ``tree_indices``, as consumed by
        ``evaluate_sample``.

    Raises:
        RuntimeError: if the RAG pipeline does not return a text response.
        KeyError: if a document name is missing from NAME_TO_IDX.
    """
    samples = []
    for question_pos in tqdm(range(len(questions))):
        req = questions[question_pos]
        ground_truth = grount_truthes[question_pos]

        # Use the requested indexer (previously hard-coded to "llm_tree_idx",
        # which silently ignored the `indexer_name` argument).
        idx = indexer.index(req, indexer_name)
        # idx[1] is the ranked list of (document name, score) pairs.
        docs = [
            {
                "idx": doc_pos + 1,
                "name": hit[0],
                "desc": load(os.path.join(DATA_PATH, f"{hit[0]}.txt")),
            }
            for doc_pos, hit in enumerate(idx[1])
        ]

        # NOTE(review): the literal 10 is presumably the number of documents to
        # retrieve — confirm against RAGPipeline.request_full.
        res, err = rag.request_full(req, model_name, 10, indexer_name)
        if not isinstance(res, str):
            # Surface the pipeline's error instead of failing inside re.sub.
            raise RuntimeError(f"RAG request failed for {req!r}: {err}")

        # Drop the model's <think>...</think> reasoning before parsing.
        cleaned_res = re.sub(r"<think>.*?</think>", "", res, flags=re.DOTALL).strip()

        # References are 1-based positions in `docs`. Skip anything outside
        # 1..len(docs): 0 would wrap around to the last document and larger
        # values would raise IndexError.
        local_indices = extract_from_response(cleaned_res)
        global_indices = [
            NAME_TO_IDX[docs[li - 1]["name"]]
            for li in local_indices
            if 1 <= li <= len(docs)
        ]

        samples.append(
            {
                "question": req,
                "generated_answer": cleaned_res,
                "contexts": [doc["desc"] for doc in docs],
                "ground_truth": "Use " + ", ".join(ground_truth),
                "ground_truth_indices": [NAME_TO_IDX[name] for name in ground_truth],
                "responce_indices": global_indices,
                "tree_indices": [NAME_TO_IDX[doc["name"]] for doc in docs],
            },
        )
    return samples


# samp = questions_to_samples(["How to get the phase angle of a complex number?"], [["cmath.phase"]])
# Build one evaluation sample and print every score for it.
samp = questions_to_samples(
    questions=["How to compute sin in Python?"],
    grount_truthes=[["turtle.setheading", "cmath.sin"]],
)
# print(samp)
for i, s in enumerate(samp):
    print(f"\n🔎 Sample {i + 1}")
    scores = evaluate_sample(s)
    for k, v in scores.items():
        print(f"{k:>18}: {v}")

100%|██████████| 1/1 [00:03<00:00,  3.20s/it]


🔎 Sample 1
      faithfulness: 0.515999972820282
  answer_relevancy: 0.8420000076293945
    context_recall: 0.42399999499320984
 context_precision: 0.5410000085830688
              bleu: 0.016
            rouge1: 0.0
            rougeL: 0.0
           metrics: {'MAP@1': {'LLM': 1.0, 'Indexer': 1.0}, 'MAR@1': {'LLM': 0.5, 'Indexer': 0.5}, 'nDCG@1': {'LLM': 1.0, 'Indexer': 1.0}, 'MRR@1': {'LLM': 1.0, 'Indexer': 1.0}, 'F1@1': {'LLM': 0.6666666666666666, 'Indexer': 0.6666666666666666}, 'MAP@3': {'LLM': 0.5, 'Indexer': 0.5}, 'MAR@3': {'LLM': 0.5, 'Indexer': 0.5}, 'nDCG@3': {'LLM': 0.6131471927654584, 'Indexer': 0.6131471927654584}, 'MRR@3': {'LLM': 1.0, 'Indexer': 1.0}, 'F1@3': {'LLM': 0.4, 'Indexer': 0.4}, 'MAP@5': {'LLM': 0.5, 'Indexer': 0.5}, 'MAR@5': {'LLM': 0.5, 'Indexer': 0.5}, 'nDCG@5': {'LLM': 0.6131471927654584, 'Indexer': 0.6131471927654584}, 'MRR@5': {'LLM': 1.0, 'Indexer': 1.0}, 'F1@5': {'LLM': 0.28571428571428575, 'Indexer': 0.28571428571428575}, 'MAP@10': {'LLM': 0.5, 'Indexe




In [17]:
# Inspect the samples built by the previous cell (full dump of every field).
samp

[{'question': 'How to compute sin in Python?',
  'generated_answer': 'To compute the sine of a number in Python, you can use the `math.sin` function for real numbers or `cmath.sin` for complex numbers.\nReferences: [1, 2]',
  'contexts': ['FUNCTION\n\ncmath.sin FROM cmath\n\nPARAMETERS\nz\n\nDESCRIPTION\nReturn the sine of z.',
   'FUNCTION\n\nmath.sin FROM math\n\nPARAMETERS\nx\n\nDESCRIPTION\nReturn the sine of x radians.',
   'FUNCTION\n\ncmath.sinh FROM cmath\n\nPARAMETERS\nz\n\nDESCRIPTION\nReturn the hyperbolic sine of z.',
   'FUNCTION\n\nmath.asin FROM math\n\nPARAMETERS\nx\n\nDESCRIPTION\nReturn the arc sine of x, in radians. The result is between -pi/2 and\npi/2.',
   'FUNCTION\n\nmath.sinh FROM math\n\nPARAMETERS\nx\n\nDESCRIPTION\nReturn the hyperbolic sine of x.',
   'FUNCTION\n\ncmath.asinh FROM cmath\n\nPARAMETERS\nz\n\nDESCRIPTION\nReturn the inverse hyperbolic sine of z. There are two branch cuts:\nOne extends from 1j along the imaginary axis to â\x88\x9ej.  The other\

In [18]:
# Spot-check the reverse mapping: look up the document name stored under id 5039.
IDX_TO_NAME[5039]

'turtle.setheading'

In [4]:
# NOTE(review): no module-level `docs` is assigned in the visible cells (it is
# only a local inside questions_to_samples), so this looks like leftover
# debugging state that would fail on a fresh kernel — confirm and remove, or
# rebuild `docs` explicitly before inspecting it.
docs

[{'idx': 0,
  'name': 'cmath.phase',
  'desc': 'FUNCTION\n\ncmath.phase FROM cmath\n\nPARAMETERS\nz\n\nDESCRIPTION\nReturn the phase of z (also known as the argument of z), as a float.\nphase(z) is equivalent to math.atan2(z.imag, z.real).  The result\nlies in the range [-Ï\x80, Ï\x80], and the branch cut for this operation lies\nalong the negative real axis.  The sign of the result is the same as the\nsign of z.imag, even when z.imag is zero:'},
 {'idx': 1,
  'name': 'cmath.polar',
  'desc': 'FUNCTION\n\ncmath.polar FROM cmath\n\nPARAMETERS\nz\n\nDESCRIPTION\nReturn the representation of z in polar coordinates.  Returns a\npair (r, phi) where r is the modulus of z and phi is the\nphase of z.  polar(z) is equivalent to (abs(z),\nphase(z)).'},
 {'idx': 2,
  'name': 'math.degrees',
  'desc': 'FUNCTION\n\nmath.degrees FROM math\n\nPARAMETERS\nx\n\nDESCRIPTION\nConvert angle x from radians to degrees.'},
 {'idx': 3,
  'name': 'cmath.rect',
  'desc': 'FUNCTION\n\ncmath.rect FROM cmath\n\nPA

In [19]:
# Peek at the first (name, score) entry of the indexer result.
# NOTE(review): no module-level `idx` is assigned in the visible cells (it is
# only a local inside questions_to_samples) — this likely depends on hidden
# kernel state; confirm before relying on it.
for i, x in enumerate(idx[1]):
    print(x)
    break

('cmath.phase', 0.9160112204987061)


In [18]:
# Show the raw indexer result.
# NOTE(review): `idx` has no visible module-level definition in this notebook;
# this cell likely depends on hidden kernel state — confirm or remove.
idx

('how to get the phase angle of a complex number',
 [('cmath.phase', 0.9160112204987061),
  ('cmath.polar', 0.9966004249494234),
  ('math.degrees', 1.0101149087869998),
  ('cmath.rect', 1.0187175809742715),
  ('cmath.atan', 1.0259317958401877),
  ('math.radians', 1.0506910129703781),
  ('numbers.Complex.conjugate', 1.0697134886148076),
  ('cmath.infj', 1.0699878685387159),
  ('math.atan', 1.0700892928387011),
  ('cmath.acos', 1.0712685355494305)])