In [1]:
import os, re, json
from LiteLLM.common import CONFIG

from phoenix.evals import (
    llm_classify,
    LiteLLMModel,
    llm_generate,
    RelevanceEvaluator,
    run_evals
)

import litellm
# Enable LiteLLM's verbose debug logging.
# NOTE(review): with this on, the cell outputs further down contain full
# request payloads (prompts plus a curl-style dump with a masked Authorization
# header); consider turning it off and clearing outputs before sharing.
litellm._turn_on_debug()

# Phoenix collector endpoint; presumably a locally running Phoenix server — confirm.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "http://localhost:6006"
# Expose the key held by the project CONFIG object through the environment
# (presumably where the Hugging Face provider looks for it — confirm).
os.environ["HUGGINGFACE_API_KEY"] = CONFIG.api_key

import phoenix as px
import pandas as pd

from phoenix.trace.dsl import SpanQuery
from phoenix.trace import SpanEvaluations, using_project

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Do not truncate long cell values when DataFrames are displayed or printed.
pd.set_option("display.max_colwidth", None)

def normalize_newline(s: str) -> str:
    """Replace literal backslash-n sequences with real newline characters.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return s.replace("\\n", "\n")

def split_ref_items(s: str):
    """
    Split a numbered reference string into its individual entries.

    Expected input looks like:
    [1] Intel focuses...
    [2] AMD Ryzen offers...
    [4] Laptop battery life...

    Returns a list of (id, text) tuples, both as strings. Anything that is
    not a string yields an empty list.
    """
    if isinstance(s, str):
        text = normalize_newline(s)
        # An entry starts at "[n]" and runs (across newlines, via re.S) until
        # a newline that is directly followed by another "[n]", or the end of
        # the text.
        return re.findall(
            r"\[(\d+)\]\s*(.*?)(?=(?:\n\[\d+\])|$)", text, flags=re.S
        )
    return []

def explode_refs(df: pd.DataFrame, ref_col: str = "ref") -> pd.DataFrame:
    """
    Expand a column of numbered reference strings into one row per reference.

    Each value in ``ref_col`` is parsed with ``split_ref_items``; every
    ``(id, text)`` pair found becomes its own row. The input frame is not
    modified.

    Args:
        df: DataFrame containing the reference column.
        ref_col: Name of the column holding the raw reference strings.

    Returns:
        A DataFrame with a single column, ``reference``, holding the text of
        each reference, indexed 0..n-1. Rows whose value yields no references
        (non-string values or strings without ``[n]`` markers) contribute no
        rows to the result.
    """
    parsed = df[ref_col].apply(split_ref_items)
    exploded = (
        df.assign(ref_items=parsed)
        .explode("ref_items", ignore_index=True)
        # explode() turns an empty list into NaN; such rows carry no reference
        # and would otherwise break the tuple unpacking below.
        .dropna(subset=["ref_items"])
        .reset_index(drop=True)
    )
    # Each remaining item is an (id, text) tuple; only the text is returned.
    exploded["reference"] = [text for _, text in exploded["ref_items"]]
    return exploded[["reference"]]

In [6]:
import phoenix as px
from phoenix.trace.dsl import SpanQuery

# CHAIN spans: select the trace id together with the span's input and output values.
query = (
    SpanQuery()
    .where("span_kind == 'CHAIN'")
    .select("trace_id", input="input.value", output="output.value")
)
df = px.Client().query_spans(query, project_name="hugging-face")

# TOOL spans: select the trace id and the prompt context preview (exposed as "ref").
reference = (
    SpanQuery()
    .where("span_kind == 'TOOL'")
    .select("trace_id", ref="prompt.context.preview")
)
spans_with_docs_df = px.Client().query_spans(reference, project_name="hugging-face")
print(len(spans_with_docs_df))

# Split every "ref" string into one row per numbered reference.
document_chunks_df = explode_refs(spans_with_docs_df)
print("Exploded format:\n", document_chunks_df)

1
Exploded format:
                                                                                                                                                                                                                                                                                                                                                                                                        reference
0  Tổng Quan Về Thế Giới Máy Tính: Từ Phần Cứng Đến An Ninh Mạng\nMáy tính đã trở thành một công cụ không thể thiếu trong cuộc sống hiện đại, từ công việc, học tập đến giải trí. Để hiểu rõ cách thức hoạt động của một cỗ máy phức tạp này, chúng ta cần tìm hiểu hai thành phần cốt lõi cấu tạo nên nó: phần cứng (hardware) và phần mềm (software). Nếu phần cứng là thể xác hữu hình, là những linh kiện vậ


In [21]:
# Prompt for generating three quiz questions from one context chunk.
# `{ref_text}` is the only placeholder; it is expected to be filled from the
# column of the same name in the dataframe handed to llm_generate.
# The model is asked to answer with a JSON object keyed question_1..question_3.
generate_questions_template = """\
Context information is below.

---------------------
{ref_text}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
3 questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.

Output the questions in JSON format with the keys question_1, question_2, question_3.
"""

In [22]:
def normalize_newline(s: str) -> str:
    """Turn literal backslash-n sequences into real newlines; non-strings pass through.

    NOTE(review): this redefines the identical function declared in an earlier
    cell of this notebook.
    """
    if isinstance(s, str):
        return s.replace("\\n", "\n")
    return s

def split_ref_items(s: str):
    """
    Break a numbered reference string into (id, text) pairs.

    Input of the form:
    [1] Intel focuses...
    [2] AMD Ryzen offers...
    [4] Laptop battery life...
    becomes a list [(id, text), ...] with both parts as strings.
    Non-string input gives an empty list.

    NOTE(review): an identical function is already defined in an earlier cell
    of this notebook.
    """
    if not isinstance(s, str):
        return []
    # "[n]" followed by its text, lazily matched up to a newline that is
    # directly followed by another "[n]", or up to the end of the text
    # (re.S lets the text span several lines).
    entry_re = re.compile(r"\[(\d+)\]\s*(.*?)(?=(?:\n\[\d+\])|$)", flags=re.S)
    return entry_re.findall(normalize_newline(s))

def explode_refs(df: pd.DataFrame, ref_col: str = "ref") -> pd.DataFrame:
    """
    Expand a column of numbered reference strings into one row per reference.

    Each value in ``ref_col`` is parsed with ``split_ref_items``; every
    ``(id, text)`` pair found becomes its own row, keeping the other columns
    of the originating row. The input frame is not modified.

    Args:
        df: DataFrame containing ``ref_col`` and a ``context.trace_id`` column.
        ref_col: Name of the column holding the raw reference strings.

    Returns:
        A DataFrame with the columns ``context.trace_id`` and ``ref_text``,
        indexed 0..n-1. Rows whose value yields no references (non-string
        values or strings without ``[n]`` markers) contribute no rows.

    Raises:
        KeyError: if ``ref_col`` or ``context.trace_id`` is missing from ``df``.
    """
    parsed = df[ref_col].apply(split_ref_items)
    exploded = (
        df.assign(ref_items=parsed)
        .explode("ref_items", ignore_index=True)
        # explode() turns an empty list into NaN; such rows carry no reference
        # and would otherwise break the tuple unpacking below.
        .dropna(subset=["ref_items"])
        .reset_index(drop=True)
    )
    # Each remaining item is an (id, text) tuple; only the text is kept.
    exploded["ref_text"] = [text for _, text in exploded["ref_items"]]
    return exploded[["context.trace_id", "ref_text"]]

In [23]:
def robust_output_parser(response: str, index: int):
    """Parse one LLM response into a dict with question_1..question_3.

    Args:
        response: Raw model output; ``None`` is treated like an empty string.
        index: Position of the response in the batch. The first three
            responses (index < 3) are printed raw for inspection.

    Returns:
        A dict with the keys ``question_1``, ``question_2`` and ``question_3``:
        - empty response: all three are ``None`` and an extra ``__error__``
          key is set to ``"empty"``;
        - a JSON object is found: the values of those keys (``None`` for any
          key the object lacks);
        - otherwise: the whole (fence-stripped) text is placed in
          ``question_1`` and the other two are ``None``.
    """
    s = (response or "").strip()
    # Echo the first few raw responses so the model's output format can be checked.
    if index < 3:
        print(f"\nRAW[{index}]:\n{repr(s)}\n")

    if not s:
        return {"__error__": "empty", "question_1": None, "question_2": None, "question_3": None}

    # 1) Strip a surrounding Markdown code fence such as ```json ... ```.
    if s.startswith("```"):
        s = re.sub(r"^```[a-zA-Z]*\s*", "", s)
        s = re.sub(r"\s*```$", "", s, flags=re.S)

    # 2) Decode the first JSON object in the text. raw_decode stops at the end
    #    of that object, so trailing commentary (even if it contains braces)
    #    no longer makes parsing fail; if decoding at one "{" fails, the next
    #    "{" is tried.
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            obj, _ = decoder.raw_decode(s, start)
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict):
            # Always return exactly the expected keys.
            return {
                "question_1": obj.get("question_1"),
                "question_2": obj.get("question_2"),
                "question_3": obj.get("question_3"),
            }
        start = s.find("{", start + 1)

    # 3) Fallback: the model answered in plain text; keep it in question_1.
    return {"question_1": s, "question_2": None, "question_3": None}

In [25]:
# Model used for question generation, accessed through LiteLLM with temperature 0.0.
model = LiteLLMModel(
    temperature=0.0,
    model="huggingface/together/Qwen/Qwen2.5-7B-Instruct",
)

# Fill the template from document_chunks_df and parse every response with
# robust_output_parser.
questions_df = llm_generate(
    model=model,
    dataframe=document_chunks_df,
    template=generate_questions_template,
    output_parser=robust_output_parser,
    concurrency=20,
)

🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.
llm_generate |          | 0/6 (0.0%) | ⏳ 00:00<? | ?it/s[92m20:26:59 - LiteLLM:DEBUG[0m: utils.py:349 - 

[92m20:26:59 - LiteLLM:DEBUG[0m: utils.py:349 - [92mRequest to litellm:[0m
[92m20:26:59 - LiteLLM:DEBUG[0m: utils.py:349 - [92mlitellm.completion(model='huggingface/together/Qwen/Qwen2.5-7B-Instruct', messages=[{'content': 'Context information is below.\n\n---------------------\nthe more affordable choice, but in recent years, some of its flagship products even surpassed Intel’s in both price and raw per\n---------------------\n\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\n\nYou are a Teacher/ Professor. Your task is to setup 3 questions for an upcoming quiz/examination. The questions should be diverse in nature acr


RAW[0]:
'```json\n{\n  "question_1": "According to the context, which company\'s flagship products have recently surpassed Intel\'s in terms of both price and performance?",\n  "question_2": "What does the phrase \'the more affordable choice\' likely refer to in the given context?",\n  "question_3": "Based on the information provided, what trend can be observed in the competitive landscape between the two companies mentioned?"\n}\n```'



`max_retries` is not supported. It will be ignored.
[92m20:27:01 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****Hh' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': 'Context information is below.\n\n---------------------\nits efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n---------------------\n\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\n\nYou are a Teacher/ Professor. Your task is to setup 3 questions for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided."\n\nOutput the questions in JSON format with the keys question_1, question_2, question_3.\n


RAW[1]:
'```json\n{\n  "question_1": "What are the two specific processor generations mentioned in the context that feature efficiency cores and a hybrid architecture?",\n  "question_2": "Identify the term used in the context to describe the combination of different types of processor cores in these generations.",\n  "question_3": "Based on the context, which event is indicated to have occurred with the mention of \'The launch of\'?"\n}\n```'



`max_retries` is not supported. It will be ignored.
[92m20:27:03 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****Hh' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': 'Context information is below.\n\n---------------------\ndvanced process nodes to fabricate its CPUs and GPUs, allowing it to deliver competitive products without owning its own fabs. "\n---------------------\n\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\n\nYou are a Teacher/ Professor. Your task is to setup 3 questions for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided."\n\nOutput the questions in JSON format with the keys question_1, question_2, question_3.\n


RAW[2]:
'```json\n{\n  "question_1": "What does the term \'advanced process nodes\' refer to in the context of CPU and GPU fabrication?",\n  "question_2": "Why might a company choose not to own its own fabrication plants (fabs) when producing CPUs and GPUs?",\n  "question_3": "How does using advanced process nodes contribute to delivering competitive products in the CPU and GPU market?"\n}\n```'



`max_retries` is not supported. It will be ignored.
[92m20:27:04 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****Hh' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': 'Context information is below.\n\n---------------------\n"AMD\'s Ryzen processors have been praised for their strong multi-core performance, making them attractive for content creat\n---------------------\n\nGiven the context information and not prior knowledge.\ngenerate only questions based on the below query.\n\nYou are a Teacher/ Professor. Your task is to setup 3 questions for an upcoming quiz/examination. The questions should be diverse in nature across the document. Restrict the questions to the context information provided."\n\nOutput the questions in JSON format with the keys question_1, question_2, question_3.\n', '

Process was interrupted. The return value will be incomplete...





In [13]:
# Preview the generated questions.
# NOTE(review): this cell's execution count (13) is lower than that of the
# generation cell (25), so the table shown below may come from an earlier run.
questions_df.head()

Unnamed: 0,question_1,question_2,question_3
0,What feature of AMD's Ryzen processors has contributed to their popularity among content creators?,How does the multi-core performance of AMD's Ryzen processors benefit content creation tasks?,"Based on the context, which type of users are likely to find AMD's Ryzen processors particularly appealing?"
1,What was the impact of AMD's EPYC server processors on the data center market according to the given context?,"Based on the information provided, what specific feature of AMD's EPYC processors is mentioned as a key factor in disrupting the market?",Summarize the main point of the given context in one sentence.
2,What are the two companies mentioned in the context that are expanding into GPU technologies?,Which company is focusing on Radeon graphics according to the context?,What is the focus of Intel's expansion into GPU technologies based on the given information?
3,"According to the context, what are both AMD and Intel focusing on in the future?",How does the context describe the current state of price and raw performance between AMD and Intel?,Summarize the main points mentioned about AMD and Intel's future strategies in the given context.
4,"What does the term 'fabs' refer to in the context of semiconductor manufacturing, and why might a company like AMD not own its own fabs?","How has AMD's performance in the desktop market changed in recent years, and what does the context suggest about Intel's position in this market?","Based on the given context, what can be inferred about the competitive landscape between AMD and Intel in the desktop market?"


In [14]:
# Construct a dataframe of the questions and the document chunks.
# Only the three question columns are melted: without an explicit value_vars,
# every other non-id column (e.g. a trace id column or the parser's
# "__error__" marker) would be turned into a "question" row as well.
questions_with_document_chunk_df = (
    pd.concat([questions_df, document_chunks_df], axis=1)
    .melt(
        id_vars=["ref_text"],
        value_vars=["question_1", "question_2", "question_3"],
        value_name="question",
    )
    .drop("variable", axis=1)
)
# If the generation step was interrupted, some questions are missing; drop those rows.
questions_with_document_chunk_df = questions_with_document_chunk_df[
    questions_with_document_chunk_df["question"].notnull()
]

In [15]:
# Preview the first ten (ref_text, question) pairs.
questions_with_document_chunk_df.head(10)

Unnamed: 0,ref_text,question
0,"""AMD's Ryzen processors have been praised for their strong multi-core performance, making them attractive for content creat",What feature of AMD's Ryzen processors has contributed to their popularity among content creators?
1,"nerations. ""\n ""The launch of AMD's EPYC server processors disrupted the data center market, offering higher core counts and b",What was the impact of AMD's EPYC server processors on the data center market according to the given context?
2,"nd pre-built PCs. ""\n ""Both companies are expanding into GPU technologies, with AMD focusing on Radeon graphics and Intel laun",What are the two companies mentioned in the context that are expanding into GPU technologies?
3,"ntel’s in both price and raw performance. ""\n ""Looking forward, both AMD and Intel are betting on AI, high-performance computi","According to the context, what are both AMD and Intel focusing on in the future?"
4,"s without owning its own fabs. ""\n ""While AMD gained significant momentum in the desktop market, Intel still maintains strong","What does the term 'fabs' refer to in the context of semiconductor manufacturing, and why might a company like AMD not own its own fabs?"
5,"nd pre-built PCs. ""\n ""Both companies are expanding into GPU technologies, with AMD focusing on Radeon graphics and Intel laun",What are the two companies mentioned in the context that are expanding into GPU technologies?
6,"""AMD's Ryzen processors have been praised for their strong multi-core performance, making them attractive for content creat",What feature of AMD's Ryzen processors has contributed to their popularity among content creators?
7,"nerations. ""\n ""The launch of AMD's EPYC server processors disrupted the data center market, offering higher core counts and b",What was the impact of AMD's EPYC server processors on the data center market according to the given context?
8,"el has faced challenges with delays in adopting smaller nanometer processes, while AMD leveraged TSMC’s 7nm and 5nm technology t",What challenge has el faced in adopting newer nanometer processes?
9,"""AMD's Ryzen processors have been praised for their strong multi-core performance, making them attractive for content creat",How does the multi-core performance of AMD's Ryzen processors benefit content creation tasks?


In [None]:
# import os
# import pandas as pd
# from tqdm import tqdm
# from opentelemetry.trace import Status, StatusCode
# from Phoenix.trace.tracing import tracer

# from LiteLLM.lite import LiteLLMClient
# from LiteLLM.Response import ResponseInput
# from tools.rag import build_prompt

# # --------- CONFIG ---------
# # df nguồn: lấy cột "question"
# SOURCE_DF = questions_with_document_chunk_df   # <- đã có sẵn ở phía bạn
# QUESTION_COL = "question"
# TOP_K = 3          # build_prompt(query, top_k=TOP_K)
# MODEL = os.getenv("LLM_MODEL", "gpt-4o-mini")  # đổi nếu cần
# TEMP = float(os.getenv("LLM_TEMP", "0.2"))
# MAX_NUM = None     # giới hạn số câu chạy (None = chạy hết)
# # --------------------------

# def main():
#     # Chuẩn bị danh sách câu hỏi
#     queries = (
#         SOURCE_DF[QUESTION_COL]
#         .dropna()
#         .map(lambda s: str(s).strip())
#         .loc[lambda s: s.ne("")]
#         .tolist()
#     )
#     if MAX_NUM is not None:
#         queries = queries[:MAX_NUM]

#     print(f"Running {len(queries)} questions…")

#     # Client LLM (tuỳ môi trường bạn cấu hình API key/endpoint)
#     client = LiteLLMClient()

#     rows = []  # lưu kết quả

#     for i, query in enumerate(tqdm(queries, desc="RAG batch", unit="q")):
#         with tracer.start_as_current_span("Thought") as span:
#             span.set_attribute("openinference.span.kind", "CHAIN")
#             span.set_attribute("input.value", query)

#             try:
#                 # tạo prompt từ RAG (lấy context top-k cho câu hỏi)
#                 prompt = build_prompt(query, top_k=TOP_K)

#                 # gọi model
#                 msg = ResponseInput(prompt)
#                 resp = client.complete([msg])           # one-by-one (ổn định)
#                 answer = resp.transform()
#                 usage = None
#                 try:
#                     usage = resp.usage()
#                 except Exception:
#                     pass

#                 # log lên trace
#                 span.set_attribute("output.value", (answer or "")[:400])
#                 span.set_status(Status(StatusCode.OK))

#                 rows.append({
#                     "index": i,
#                     "question": query,
#                     "prompt": prompt,
#                     "answer": answer,
#                     "usage": usage,
#                     "error": None,
#                 })

#             except Exception as e:
#                 # ghi lỗi nhưng không dừng batch
#                 span.record_exception(e)
#                 span.set_status(Status(StatusCode.ERROR, str(e)))
#                 rows.append({
#                     "index": i,
#                     "question": query,
#                     "prompt": None,
#                     "answer": None,
#                     "usage": None,
#                     "error": str(e),
#                 })

#     # Thành DataFrame kết quả
#     results_df = pd.DataFrame(rows)
#     pd.set_option("display.max_colwidth", None)
#     print("\n=== SAMPLE RESULTS ===")
#     print(results_df.head(10)[["question", "answer", "error"]])

#     # (tuỳ chọn) lưu file
#     # results_df.to_csv("rag_batch_results.csv", index=False)
#     return results_df

# if __name__ == "__main__":
#     _ = main()


OpenTelemetry Tracing Details
|  Phoenix Project: hugging-face
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: localhost:4317
|  Transport: gRPC
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.

Running 27 questions…


RAG batch:   0%|          | 0/27 [00:00<?, ?q/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
RAG batch:   0%|          | 0/27 [00:12<?, ?q/s]


KeyboardInterrupt: 

In [3]:
client = px.Client()

# RETRIEVER spans whose 'relevance' eval label is still unset.
query = (
    SpanQuery()
    .where("span_kind == 'RETRIEVER' and evals['relevance'].label is None   ")
    .select(
        "context.span_id",
        "context.trace_id",
        "input.value",
        "retrieval.documents",
    )
)

# Move the index into a regular column; if it comes back unnamed as "index",
# it is renamed to "context.span_id".
df = (
    client.query_spans(query, project_name="hugging-face")
    .reset_index()
    .rename(columns={"index": "context.span_id"})
)
print(df.columns)
print(df.head())

Index(['context.span_id', 'context.trace_id', 'input.value',
       'retrieval.documents'],
      dtype='object')
    context.span_id                  context.trace_id  \
0  0224631cead9f25d  500b3bdc9e9c658bee05bdea991a793a   
1  6fe4de290b8d7478  582261bea40ac7541550f4369e7342b1   
2  6b384c0a008fcdc1  2bb550be5926f49d23cef2b2443893d0   
3  4abdb3f2ef462568  3969182ba42aa8040638bd5352d2339c   
4  5551e37d1a9fdeef  58481aa2e6a562825c0a9c419ac0582d   

                                         input.value  \
0               I want to know information about AMD   
1                I want some information about Intel   
2  What feature of AMD's Ryzen processors has con...   
3  What was the impact of AMD's EPYC server proce...   
4  What are the two companies mentioned in the co...   

                                 retrieval.documents  
0  [{'document': {'content': '
    "AMD's Ryzen p...  
1  [{'document': {'content': 'ntel’s in both pric...  
2  [{'document': {'content': '
    "AMD's

In [4]:
# One row per entry of each span's retrieval.documents list, with a fresh 0..n-1 index.
df_exploded = df.explode("retrieval.documents", ignore_index=True)

print(df_exploded.columns)

Index(['context.span_id', 'context.trace_id', 'input.value',
       'retrieval.documents'],
      dtype='object')


In [6]:
# explode() leaves NaN where a span had an empty document list; such rows have
# nothing to flatten and would make json_normalize fail, so drop them first.
docs_source_df = df_exploded.dropna(subset=["retrieval.documents"]).reset_index(drop=True)

# Flatten each nested document dict into dotted columns (document.content, ...).
docs = pd.json_normalize(docs_source_df["retrieval.documents"].tolist())

# Put the flattened document columns back next to the span/trace ids and input.
# Both frames share the same 0..n-1 index, so the column-wise concat lines up row by row.
retrieved_documents_df = pd.concat(
    [docs_source_df.drop(columns=["retrieval.documents"]), docs], axis=1
)

print(retrieved_documents_df.columns)

Index(['context.span_id', 'context.trace_id', 'input.value',
       'document.content', 'document.id', 'document.score'],
      dtype='object')


In [7]:
# Rename to "input" / "reference"; presumably the column names the relevance
# evaluator's prompt template reads — confirm against the evaluator in use.
retrieved_documents_df = retrieved_documents_df.rename(
    columns={
        "document.content": "reference",
        "input.value": "input"
    }
)

In [8]:
# Display the retrieved documents.
# NOTE(review): this shows the whole frame; consider .head() to keep the output small.
retrieved_documents_df

Unnamed: 0,context.span_id,context.trace_id,input,reference,document.id,document.score
0,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.025658
1,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.00709
2,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nd pre-built PCs. ""\n ""Both companies are e...",a23b9fae-a936-451e-b8d5-573ab30f08dd,0.004895
3,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"ntel’s in both price and raw performance. ""\n ...",def89f0c-d7d3-4c57-964a-4689cfda6ad3,0.023789
4,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.023743
5,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"t, Intel still maintains strong relationships ...",dc8a928d-6cc7-4393-a0c5-3754e4cb8cb9,0.010969
6,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.995588
7,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.018183
8,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"its efficiency cores and hybrid architecture,...",a0149ea1-fb03-4f53-be3a-891702d05314,0.000928
9,4abdb3f2ef462568,3969182ba42aa8040638bd5352d2339c,What was the impact of AMD's EPYC server proce...,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.977319


In [10]:
from phoenix.evals import (
    RelevanceEvaluator,
    run_evals,
)

# Judge model for the relevance evals, accessed through LiteLLM with temperature 0.0.
model = LiteLLMModel(
    temperature=0.0,
    model="huggingface/together/Qwen/Qwen2.5-7B-Instruct",
)
relevance_evaluator = RelevanceEvaluator(model)

# Only one evaluator is passed, so element 0 of the returned sequence is taken.
retrieved_documents_relevance_df = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    concurrency=20,
    provide_explanation=True,
)[0]

🐌!! If running inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.
run_evals |          | 0/39 (0.0%) | ⏳ 00:00<? | ?it/s[92m10:20:40 - LiteLLM:DEBUG[0m: utils.py:349 - 

[92m10:20:40 - LiteLLM:DEBUG[0m: utils.py:349 - [92mRequest to litellm:[0m
[92m10:20:40 - LiteLLM:DEBUG[0m: utils.py:349 - [92mlitellm.completion(model='huggingface/together/Qwen/Qwen2.5-7B-Instruct', messages=[{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: I want to know information about AMD\n    ************\n    [Reference text]: \n    "AMD\'s Ryzen processors have been praised for their strong multi-core performance, making them attractive for content creat\n    ************\n    [END D


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 1: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:20:58 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 2: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:20:58 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 3: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:20:59 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 4: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:00 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 5: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:00 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 6: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:01 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 7: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:01 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 8: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:02 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 9: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:02 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Exception in worker on attempt 10: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}
Retrying...


`max_retries` is not supported. It will be ignored.
[92m10:21:03 - LiteLLM:DEBUG[0m: litellm_logging.py:929 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://router.huggingface.co/together/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Be****MM' \
-d '{'model': 'Qwen/Qwen2.5-7B-Instruct-Turbo', 'messages': [{'content': '\nYou are comparing a reference text to a question and trying to determine if the reference text\ncontains information relevant to answering the question. Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question]: What feature of AMD\'s Ryzen processors has contributed to their popularity among content creators?\n    ************\n    [Reference text]:  its efficiency cores and hybrid architecture, especially with the Alder Lake and Raptor Lake generations. "\n    "The launch of \n    ************\n    [END DATA]\nCompare the Question above to the Reference text. You must determine whether the Reference text\ncon


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m

Retries exhausted after 11 attempts: litellm.APIError: HuggingfaceException - {"error":"You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits."}


In [12]:
# Preview the first rows of the relevance evaluation results (label, score, explanation).
retrieved_documents_relevance_df.head()

Unnamed: 0,label,score,explanation
0,relevant,1.0,EXPLANATION: The question asks for information...
1,relevant,1.0,EXPLANATION: The question asks for information...
2,relevant,1.0,EXPLANATION: The question asks for information...
3,relevant,1.0,EXPLANATION: The question asks for information...
4,relevant,1.0,EXPLANATION: The question asks for information...


In [13]:
# List the columns of the retrieved-documents frame before it is combined with the evaluations.
retrieved_documents_df.columns

Index(['context.span_id', 'context.trace_id', 'input', 'reference',
       'document.id', 'document.score'],
      dtype='object')

In [34]:
# Put the evaluation columns (renamed with an "eval_" prefix) side by side with
# the retrieved documents. Concatenating along the column axis aligns the two
# frames on their index.
documents_with_relevance_df = pd.concat(
    objs=[
        retrieved_documents_df,
        retrieved_documents_relevance_df.add_prefix("eval_"),
    ],
    axis="columns",
)
documents_with_relevance_df

Unnamed: 0,context.span_id,context.trace_id,input,reference,document.id,document.score,eval_label,eval_score,eval_explanation
0,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.025658,relevant,1.0,EXPLANATION: The question asks for information...
1,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.00709,relevant,1.0,EXPLANATION: The question asks for information...
2,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nd pre-built PCs. ""\n ""Both companies are e...",a23b9fae-a936-451e-b8d5-573ab30f08dd,0.004895,relevant,1.0,EXPLANATION: The question asks for information...
3,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"ntel’s in both price and raw performance. ""\n ...",def89f0c-d7d3-4c57-964a-4689cfda6ad3,0.023789,relevant,1.0,EXPLANATION: The question asks for information...
4,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.023743,relevant,1.0,EXPLANATION: The question asks for information...
5,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"t, Intel still maintains strong relationships ...",dc8a928d-6cc7-4393-a0c5-3754e4cb8cb9,0.010969,relevant,1.0,EXPLANATION: The question asks for information...
6,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.995588,relevant,1.0,EXPLANATION: The question asks about a specifi...
7,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.018183,unrelated,0.0,EXPLANATION: The question asks about a specifi...
8,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"its efficiency cores and hybrid architecture,...",a0149ea1-fb03-4f53-be3a-891702d05314,0.000928,,,
9,4abdb3f2ef462568,3969182ba42aa8040638bd5352d2339c,What was the impact of AMD's EPYC server proce...,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.977319,,,


In [15]:
# Drop the "eval_" prefix from the evaluation columns and swap the dot in
# "document.score" for an underscore.
documents_with_relevance_df = documents_with_relevance_df.rename(
    {
        "document.score": "document_score",
        "eval_explanation": "explanation",
        "eval_score": "score",
        "eval_label": "label",
    },
    axis="columns",
)

print(documents_with_relevance_df.head())

    context.span_id                  context.trace_id  \
0  0224631cead9f25d  500b3bdc9e9c658bee05bdea991a793a   
1  0224631cead9f25d  500b3bdc9e9c658bee05bdea991a793a   
2  0224631cead9f25d  500b3bdc9e9c658bee05bdea991a793a   
3  6fe4de290b8d7478  582261bea40ac7541550f4369e7342b1   
4  6fe4de290b8d7478  582261bea40ac7541550f4369e7342b1   

                                  input  \
0  I want to know information about AMD   
1  I want to know information about AMD   
2  I want to know information about AMD   
3   I want some information about Intel   
4   I want some information about Intel   

                                           reference  \
0  \n    "AMD's Ryzen processors have been praise...   
1  nerations. "\n    "The launch of AMD's EPYC se...   
2  nd pre-built PCs. "\n    "Both companies are e...   
3  ntel’s in both price and raw performance. "\n ...   
4  hem attractive for content creators and gamers...   

                            document.id  document_score     l

In [16]:
# Display the combined documents/evaluations frame.
documents_with_relevance_df

Unnamed: 0,context.span_id,context.trace_id,input,reference,document.id,document_score,label,score,explanation
0,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.025658,relevant,1.0,EXPLANATION: The question asks for information...
1,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.00709,relevant,1.0,EXPLANATION: The question asks for information...
2,0224631cead9f25d,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nd pre-built PCs. ""\n ""Both companies are e...",a23b9fae-a936-451e-b8d5-573ab30f08dd,0.004895,relevant,1.0,EXPLANATION: The question asks for information...
3,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"ntel’s in both price and raw performance. ""\n ...",def89f0c-d7d3-4c57-964a-4689cfda6ad3,0.023789,relevant,1.0,EXPLANATION: The question asks for information...
4,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.023743,relevant,1.0,EXPLANATION: The question asks for information...
5,6fe4de290b8d7478,582261bea40ac7541550f4369e7342b1,I want some information about Intel,"t, Intel still maintains strong relationships ...",dc8a928d-6cc7-4393-a0c5-3754e4cb8cb9,0.010969,relevant,1.0,EXPLANATION: The question asks for information...
6,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.995588,relevant,1.0,EXPLANATION: The question asks about a specifi...
7,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,hem attractive for content creators and gamers...,36084d3b-9775-4e1b-87a6-c542507e0074,0.018183,unrelated,0.0,EXPLANATION: The question asks about a specifi...
8,6b384c0a008fcdc1,2bb550be5926f49d23cef2b2443893d0,What feature of AMD's Ryzen processors has con...,"its efficiency cores and hybrid architecture,...",a0149ea1-fb03-4f53-be3a-891702d05314,0.000928,,,
9,4abdb3f2ef462568,3969182ba42aa8040638bd5352d2339c,What was the impact of AMD's EPYC server proce...,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.977319,,,


In [29]:
import numpy as np

# Build the ranked frame in a single chain; `assign` returns a new frame, so
# documents_with_relevance_df itself is not modified.
final_df = (
    documents_with_relevance_df
    # coerce the retrieval score to a number; values that cannot be parsed become NaN
    .assign(__score__=lambda docs: pd.to_numeric(docs['document_score'], errors='coerce'))
    # order by span id, then by score descending; NaN scores go to the end of each span
    .sort_values(['context.span_id', '__score__'], ascending=[True, False])
    # number the documents 0, 1, 2, ... within each span, following that order
    .assign(document_position=lambda docs: docs.groupby('context.span_id').cumcount())
    # (span id, position) MultiIndex in the standard Phoenix layout (if you need it)
    .set_index(['context.span_id', 'document_position'])
    .drop(columns='__score__')
)

In [30]:
# Check which columns remain once span id and document position have moved into the index.
final_df.columns

Index(['context.trace_id', 'input', 'reference', 'document.id',
       'document_score', 'label', 'score', 'explanation'],
      dtype='object')

In [31]:
# Display the frame indexed by (context.span_id, document_position).
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document.id,document_score,label,score,explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0224631cead9f25d,0,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.025658,relevant,1.0,EXPLANATION: The question asks for information...
0224631cead9f25d,1,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.00709,relevant,1.0,EXPLANATION: The question asks for information...
0224631cead9f25d,2,500b3bdc9e9c658bee05bdea991a793a,I want to know information about AMD,"nd pre-built PCs. ""\n ""Both companies are e...",a23b9fae-a936-451e-b8d5-573ab30f08dd,0.004895,relevant,1.0,EXPLANATION: The question asks for information...
1a81efb791e850df,0,54b2e9d0aa6bd4e32494f66888f35770,"Based on the information provided, what specif...","nerations. ""\n ""The launch of AMD's EPYC se...",825177e1-6c77-4de7-bb02-c332e8dbaf54,0.966074,,,
1a81efb791e850df,1,54b2e9d0aa6bd4e32494f66888f35770,"Based on the information provided, what specif...","its efficiency cores and hybrid architecture,...",a0149ea1-fb03-4f53-be3a-891702d05314,0.001705,,,
1a81efb791e850df,2,54b2e9d0aa6bd4e32494f66888f35770,"Based on the information provided, what specif...",TSMC’s 7nm and 5nm technology to stay ahead i...,0511cc03-6907-47b7-a5b6-2e59816aa6b9,0.001625,,,
1ae2af8f3837139e,0,ef83a9f7bb737cac4040ca084b1f5418,"According to the context, what are both AMD an...","ntel’s in both price and raw performance. ""\n ...",def89f0c-d7d3-4c57-964a-4689cfda6ad3,0.978433,,,
1ae2af8f3837139e,1,ef83a9f7bb737cac4040ca084b1f5418,"According to the context, what are both AMD an...","nd pre-built PCs. ""\n ""Both companies are e...",a23b9fae-a936-451e-b8d5-573ab30f08dd,0.949401,,,
1ae2af8f3837139e,2,ef83a9f7bb737cac4040ca084b1f5418,"According to the context, what are both AMD an...","s without owning its own fabs. ""\n ""While A...",207f6d67-3dbf-4aea-88dc-c9df94f43b43,0.11251,,,
1e9fbe7fd91c299d,0,cb3b1808165551a70eaa1a0edc6fd1a5,How does the multi-core performance of AMD's R...,"\n ""AMD's Ryzen processors have been praise...",6e13a8d5-0deb-4982-8f6a-601164f66a73,0.996719,,,


In [23]:
import numpy as np
from sklearn.metrics import ndcg_score


def _compute_ndcg(df: pd.DataFrame, k: int):
    """Return NDCG@k for one group of documents, or NaN if it cannot be computed.

    Args:
        df: Rows of a single group. The `score` column is passed to
            `ndcg_score` as the true relevance and `document_score` as the
            predicted ranking score.
        k: Rank cutoff forwarded to `ndcg_score`.

    Returns:
        The value returned by `ndcg_score`, or `np.nan` when `ndcg_score`
        raises `ValueError`.
    """
    row_count = len(df)
    # Both arrays hold at least two entries; slots beyond the group's rows stay 0.
    padded_size = max(2, row_count)
    true_relevance = np.zeros(padded_size)
    predicted_scores = np.zeros(padded_size)
    true_relevance[:row_count] = df.score
    predicted_scores[:row_count] = df.document_score
    try:
        result = ndcg_score([true_relevance], [predicted_scores], k=k)
    except ValueError:
        # presumably raised when the scores contain missing values — TODO confirm
        result = np.nan
    return result


# NDCG@2 per span, stored in a single "score" column indexed by context.span_id.
ndcg_at_2 = (
    final_df.groupby("context.span_id")
    .apply(_compute_ndcg, k=2)
    .to_frame(name="score")
)

In [24]:
# Show the per-span NDCG@2 scores.
ndcg_at_2

Unnamed: 0_level_0,score
context.span_id,Unnamed: 1_level_1
0224631cead9f25d,1.0
1a81efb791e850df,
1ae2af8f3837139e,
1e9fbe7fd91c299d,
4abdb3f2ef462568,
5115f7ecca3a55ea,
5551e37d1a9fdeef,
6b384c0a008fcdc1,
6fe4de290b8d7478,1.0
7d64a5556cec5907,


In [25]:
# precision@2 per span: sum of the relevance scores of the first two rows of
# each span, divided by 2. skipna=False makes the result NaN as soon as one of
# those two scores is missing.
precision_at_2 = (
    final_df.groupby("context.span_id")
    .apply(lambda span_docs: span_docs["score"].iloc[:2].sum(skipna=False) / 2)
    .to_frame(name="score")
)

In [26]:
# Show the per-span precision@2 scores.
precision_at_2

Unnamed: 0_level_0,score
context.span_id,Unnamed: 1_level_1
0224631cead9f25d,1.0
1a81efb791e850df,
1ae2af8f3837139e,
1e9fbe7fd91c299d,
4abdb3f2ef462568,
5115f7ecca3a55ea,
5551e37d1a9fdeef,
6b384c0a008fcdc1,0.5
6fe4de290b8d7478,1.0
7d64a5556cec5907,


In [27]:
# hit@2 per span: did at least one of the two top-ranked documents receive a
# positive relevance score?
#
# - The ranked `final_df` is grouped (rows ordered by document_score within
#   each span), so "first two rows" really means "top two retrieved
#   documents". This is the same frame precision@2 and NDCG@2 are computed from.
# - With skipna=False the sum is NaN when a top-2 evaluation is missing, and
#   `NaN > 0` is False, which would silently report that span as a miss. Such
#   spans are masked to <NA> (nullable boolean) instead.
top_2_relevance_sum = final_df.groupby("context.span_id").apply(
    lambda span_docs: span_docs["score"].iloc[:2].sum(skipna=False)
)
hit = pd.DataFrame(
    {
        "hit": (top_2_relevance_sum > 0)
        .astype("boolean")
        .mask(top_2_relevance_sum.isna())
    }
)

In [32]:
from phoenix.trace import DocumentEvaluations, SpanEvaluations

# Send the span-level metrics (ndcg@2, precision@2) and the per-document
# relevance evaluations to Phoenix through a single client call.
phoenix_client = px.Client()
phoenix_client.log_evaluations(
    SpanEvaluations(eval_name="ndcg@2", dataframe=ndcg_at_2),
    SpanEvaluations(eval_name="precision@2", dataframe=precision_at_2),
    DocumentEvaluations(eval_name="relevance", dataframe=final_df),
)