In [None]:
import pandas as pd
import mlflow
from mlflow.metrics.genai import answer_correctness

In [None]:
import sys
sys.path.append('../../src')

from search import search
from embeddings import generate_embeddings
from baml_client import b

In [None]:
# Load the QA test set and keep only its first five rows for this run.
df = pd.read_parquet('../../data/qa_testset.parquet').iloc[:5]

In [None]:
# Point MLflow at the relative "../../mlruns" tracking location and select
# the "end2end" experiment for the runs created in this notebook.
mlflow.set_tracking_uri("../../mlruns")
mlflow.set_experiment("end2end")

In [None]:
def qa_pipeline(question, method='hybrid'):
    """Answer a question using query expansion, retrieval and answer generation.

    The question is first rewritten by ``b.QueryExpansion``; the rewritten
    query is embedded and passed to ``search`` (k=10, reranking enabled,
    entity filtering disabled). The texts of the returned documents are then
    handed to ``b.AskQuestion`` together with the original question.

    Args:
        question: The question to answer.
        method: Retrieval method forwarded to ``search``. Defaults to
            ``'hybrid'``.

    Returns:
        The ``answer`` attribute of the ``b.AskQuestion`` result.
    """
    # Retrieval runs on the expanded query; answering uses the original question.
    query = b.QueryExpansion(question).question
    query_embedding = generate_embeddings([query])[0]

    docs = search(
        pd.Series({
            "question": query,
            "question_embedding": query_embedding,
        }),
        # Forward the caller's choice; this was hard-coded to 'hybrid', which
        # silently ignored the ``method`` argument.
        method=method,
        k=10,
        filter_by_entity=False,
        do_rerank=True,
    )

    docs_str = [doc.payload['text'] for doc in docs]
    answer = b.AskQuestion(question, docs_str)

    return answer.answer

In [None]:
from sentence_transformers import CrossEncoder
import torch
# Cross-encoder NLI model, created once at module level; nli_evaluation reads
# it through the global name `model`.
model = CrossEncoder('cross-encoder/nli-deberta-v3-base')

def nli_evaluation(predictions, targets):
    """Compute a per-row entailment score between targets and predictions.

    Every prediction is paired with its target as ``(target, prediction)`` and
    the pairs are scored by the module-level ``model`` with a softmax over the
    class dimension. The value at class index 1 (entailment) is kept as the
    score of each pair.

    Args:
        predictions: Iterable of predicted answers.
        targets: Iterable of reference answers, aligned with ``predictions``.

    Returns:
        An ``mlflow.metrics.MetricValue`` whose ``scores`` list holds one
        value per pair.
    """
    entailment_index = 1  # 1 => entailment

    sentence_pairs = []
    for predicted, reference in zip(predictions, targets):
        # The target goes first in each pair, the prediction second.
        sentence_pairs.append((reference, predicted))

    class_probabilities = model.predict(
        sentence_pairs,
        activation_fn=torch.nn.Softmax(dim=1),
    )
    entailment_scores = class_probabilities[:, entailment_index]

    return mlflow.metrics.MetricValue(scores=entailment_scores.tolist())

# Register nli_evaluation as a custom MLflow metric named "entailment_score";
# greater_is_better=True marks higher scores as better.
entailment_metric = mlflow.metrics.make_metric(
    eval_fn=nli_evaluation,
    greater_is_better=True,
    name="entailment_score"
)

In [None]:
def eval_end2end(df):
    """Run the QA pipeline over a test set and evaluate the answers with MLflow.

    Predictions are produced by applying ``qa_pipeline`` to every value of the
    ``question`` column. The evaluation runs inside a new MLflow run in the
    "end2end" experiment, using the default question-answering evaluator plus
    the ``answer_correctness`` and ``entailment_metric`` extra metrics.

    Args:
        df: DataFrame with a ``question`` column (pipeline input) and an
            ``answer`` column (evaluation target). The frame is not modified.

    Returns:
        The object returned by ``mlflow.evaluate``.
    """
    mlflow.set_experiment("end2end")

    # Build a new frame instead of assigning a column into ``df``: the caller's
    # frame stays untouched and no SettingWithCopyWarning is raised when ``df``
    # is a slice of another frame.
    eval_data = (
        df
        .assign(prediction=df.question.apply(qa_pipeline))
        .rename(columns={"question": "inputs"})
    )

    with mlflow.start_run():
        return mlflow.evaluate(
            data=eval_data,
            model_type="question-answering",
            targets="answer",
            predictions="prediction",
            evaluators="default",
            extra_metrics=[
                answer_correctness(model='openai:/gpt-4.1-mini-2025-04-14'),
                entailment_metric,
            ],
        )

In [None]:
# Run the end-to-end evaluation on the test-set rows held in df.
eval_end2end(df)