In [1]:
# !pip install ../.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from langchain_openai import AzureChatOpenAI

In [3]:
# Chat-model client that is handed to the evaluation helpers as `llm`.
# NOTE(review): the deployment name refers to a Gemini model although the
# client class is AzureChatOpenAI — presumably served through an
# OpenAI-compatible gateway; confirm.
azure_llm = AzureChatOpenAI(
    azure_deployment="gemini-1.5-flash-001",
    api_version="2023-03-15-preview",
    temperature=0,  # presumably chosen for repeatable judgments — confirm
    seed=None,
    max_tokens=1024,
    timeout=600,  # presumably seconds — confirm
)

In [4]:
from s3fs.core import S3FileSystem

# Point both AWS profile environment variables at the "rtc" profile before the
# filesystem object is created, then build a non-anonymous S3 handle.
os.environ.update({"AWS_DEFAULT_PROFILE": "rtc", "AWS_PROFILE": "rtc"})
s3 = S3FileSystem(anon=False)

In [5]:
import os

# S3 locations used by the cells below.
# `ground_truth` is a single parquet file; `answer_url` and `metrics_url` are
# prefixes that get combined with the file names listed in `fns`.
ground_truth = "s3://deltix-staging-dial-rag-eval/ground_truth/epam10k/ground_truth_v2.parquet"
answer_url = "s3://deltix-staging-dial-rag-eval/answers/epam10k"
metrics_url = "s3://deltix-staging-dial-rag-eval/evaluation/nli/tmp_to_check/"

# Answer files to process; only the first entry is used further down.
fns = ["answers_azure_rag.parquet"]

In [6]:
from aidial_rag_eval import evaluate

  from tqdm.autonotebook import tqdm


In [26]:
from aidial_rag_eval.metric_binds import CONTEXT_TO_ANSWER_INFERENCE,\
    ANSWER_TO_GROUND_TRUTH_INFERENCE, GROUND_TRUTH_TO_ANSWER_INFERENCE, ANSWER_REFUSAL, GROUND_TRUTH_REFUSAL

In [27]:
# Run the evaluation for the first answers file with all five metric binds.
# The S3 URLs are assembled with an explicit "/" rather than os.path.join:
# os.path.join inserts the platform separator ("\" on Windows), which would
# corrupt an S3 URL. On POSIX the resulting strings are unchanged.
# NOTE(review): the third argument is presumably the location the metrics are
# written to, and "02_11_2025_" looks like a hand-typed run date — confirm and
# consider moving it to the configuration cell.
ret = evaluate(
    ground_truth,
    f"{answer_url.rstrip('/')}/{fns[0]}",
    f"{metrics_url.rstrip('/')}/02_11_2025_{fns[0]}",
    # Retry wrapper around the chat model: stop after the 4th attempt.
    llm=azure_llm.with_retry(stop_after_attempt=4),
    metric_binds=[
        CONTEXT_TO_ANSWER_INFERENCE,
        ANSWER_TO_GROUND_TRUTH_INFERENCE,
        GROUND_TRUTH_TO_ANSWER_INFERENCE,
        ANSWER_REFUSAL,
        GROUND_TRUTH_REFUSAL,
    ],
    fs=s3,
    show_progress_bar=True,
)

Converting hypothesis...


100%|███████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.83it/s]


Getting inference...


100%|███████████████████████████████████████████████████████████████████████████████████| 48/48 [00:11<00:00,  4.20it/s]


Converting hypothesis...


100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.64it/s]


Getting inference...


100%|███████████████████████████████████████████████████████████████████████████████████| 57/57 [00:14<00:00,  4.02it/s]


Converting hypothesis...


100%|███████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.10it/s]


Getting inference...


100%|███████████████████████████████████████████████████████████████████████████████████| 48/48 [00:10<00:00,  4.68it/s]


Getting refusal...


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.32s/it]


Getting refusal...


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.02it/s]


In [8]:
# Read the ground-truth table from S3 and preview its first five rows.
df_gt = pd.read_parquet(path=ground_truth, filesystem=s3)
df_gt.head(n=5)

Unnamed: 0,question,answer,facts,documents
0,Full legal name of the Company,"EPAM SYSTEMS, INC.","[EPAM SYSTEMS, INC.\n\n(Exact name of registra...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
1,Address,"42 University Drive, Suite 202, Newtown, Penns...","[41 University Drive, Suite 202, Newtown, Penn...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
2,Provide your corporate HQ location and geograp...,Corporate HQ location is in 41 University Driv...,"[EPAM SYSTEMS, INC.\n\n(Exact name of registra...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
3,How many staff members/resources do you have a...,India is one of EPAM largest delivery location...,"[Outside of Ukraine and Belarus, our largest d...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
4,"Main Office Location (City, State and Country)",Our principal executive offices are located at...,"[EPAM Systems, Inc. was incorporated in the St...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...


In [9]:
# Load the answers for the first file in `fns` and preview them.
# The S3 URL is joined with an explicit "/" because os.path.join uses the
# platform separator ("\" on Windows), which is not valid inside an S3 URL.
df_a = pd.read_parquet(f"{answer_url.rstrip('/')}/{fns[0]}", filesystem=s3)
df_a.head()

Unnamed: 0,question,answer,context,documents
0,Full legal name of the Company,The full legal name of the company is EPAM Sys...,[Chief\nAccounting Officer \n\n(principal acco...,[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
1,Address,The requested information is not available in ...,[],[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
2,Provide your corporate HQ location and geograp...,"EPAM Systems, Inc.'s corporate headquarters ar...",[may not be certain.\nOur determination of tax...,[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
3,How many staff members/resources do you have a...,The requested information is not available in ...,"[had approximately 59,300, 58,800, and 41,150 ...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...
4,"Main Office Location (City, State and Country)","The main office of EPAM Systems, Inc. is locat...",[or systems we develop for them. Our suppliers...,[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...


In [10]:
# df_metrics = pd.read_parquet(os.path.join(metrics_url, "01_22_2025_" + fns[0]), filesystem=s3)
# df_metrics.head()

In [11]:
from aidial_rag_eval import create_rag_eval_metrics_report

In [12]:
# Build a metrics report directly from the two loaded frames, restricted to
# the ground-truth -> answer inference bind.
df_metrics = create_rag_eval_metrics_report(
    df_gt,
    df_a,
    metric_binds=[GROUND_TRUTH_TO_ANSWER_INFERENCE],
    llm=azure_llm.with_retry(stop_after_attempt=4),
    show_progress_bar=True,
)

Converting hypothesis...


100%|███████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.33it/s]


Getting inference...


100%|███████████████████████████████████████████████████████████████████████████████████| 48/48 [00:11<00:00,  4.33it/s]


In [13]:
# Preview the first rows of the metrics report; the bare last expression lets
# the notebook render it with its rich display.
df_metrics.head()

Unnamed: 0,question,ground_truth_answer,facts,documents,answer,context,facts_ranks,context_relevance,context_highlight,recall,precision,mrr,f1,gt_ans_inference,gt_ans_json,gt_ans_highlight,mean_inference,median_inference
0,Full legal name of the Company,"EPAM SYSTEMS, INC.","[EPAM SYSTEMS, INC.\n\n(Exact name of registra...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...,The full legal name of the company is EPAM Sys...,[Chief\nAccounting Officer \n\n(principal acco...,[-1],"[0, 0]","[{""match"": [{""text"": ""Chief\nAccounting Office...",0.0,0.0,0.0,0.0,1.0,"[{""inference"": 1.0, ""hypothesis"": ""The full le...","{""corpus"": [{""text"": ""The full legal name of t...",1.0,1.0
1,Address,"42 University Drive, Suite 202, Newtown, Penns...","[41 University Drive, Suite 202, Newtown, Penn...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...,The requested information is not available in ...,[],[-1],[],[],0.0,0.0,0.0,0.0,0.0,"[{""inference"": 0.0, ""hypothesis"": ""The request...","{""corpus"": [{""text"": ""The requested informatio...",0.0,0.0
2,Provide your corporate HQ location and geograp...,Corporate HQ location is in 41 University Driv...,"[EPAM SYSTEMS, INC.\n\n(Exact name of registra...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...,"EPAM Systems, Inc.'s corporate headquarters ar...",[may not be certain.\nOur determination of tax...,"[-1, -1]","[0, 0, 0, 0, 0, 0, 0]","[{""match"": [{""text"": ""may not be certain.\nOur...",0.0,0.0,0.0,0.0,0.825,"[{""inference"": 1.0, ""hypothesis"": ""EPAM System...","{""corpus"": [{""text"": ""EPAM Systems, Inc.'s cor...",0.825,0.825
3,How many staff members/resources do you have a...,India is one of EPAM largest delivery location...,"[Outside of Ukraine and Belarus, our largest d...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...,The requested information is not available in ...,"[had approximately 59,300, 58,800, and 41,150 ...",[2],"[0, 0, 1, 0]","[{""match"": [{""text"": ""had approximately 59,300...",1.0,0.25,0.333333,0.4,0.25,"[{""inference"": 0.5, ""hypothesis"": ""The request...","{""corpus"": [{""text"": ""The requested informatio...",0.25,0.25
4,"Main Office Location (City, State and Country)",Our principal executive offices are located at...,"[EPAM Systems, Inc. was incorporated in the St...",[https://d18rn0p25nwr6d.cloudfront.net/CIK-000...,"The main office of EPAM Systems, Inc. is locat...",[or systems we develop for them. Our suppliers...,[0],"[1, 0, 0, 0, 0, 0, 0]","[{""match"": [{""text"": ""or systems we develop fo...",1.0,0.142857,1.0,0.25,1.0,"[{""inference"": 1.0, ""hypothesis"": ""The main of...","{""corpus"": [{""text"": ""The main office of EPAM ...",1.0,1.0


In [23]:
from aidial_rag_eval.metrics import calculate_inference, calculate_batch_inference

In [24]:
# Single-pair smoke test. In the recorded output the first string is reported
# as the premise and the second as the hypothesis; here the hypothesis
# contradicts the premise.
ret = calculate_inference(
    "I am a smart person.",
    "I am not a smart person.",
    llm=azure_llm.with_retry(stop_after_attempt=4),
    show_progress_bar=False,
)
ret

InferenceReturn(inference=0.0, json='[{"inference": 0.0, "hypothesis": "I am not a smart person.", "premise": ["I am a smart person."], "explanation": "[{\\"fact\\": \\"I am not a smart person.\\", \\"explanation\\": \\"The premise states that the person is smart.\\", \\"tag\\": \\"CONT\\"}]"}]', highlight='{"corpus": [{"text": "I am not a smart person.", "score": -1.0, "title": 0.0}, {"text": "", "score": 0.0}]}')

In [25]:
# Batch variant of the smoke test: two pairs given as parallel lists (first
# list -> premises, second list -> hypotheses, per the recorded output).
# Pair 1 is a contradiction, pair 2 repeats the premise verbatim.
ret = calculate_batch_inference(
    ["I am a smart person.", "I am a smart person."],
    ["I am not a smart person.", "I am a smart person."],
    llm=azure_llm.with_retry(stop_after_attempt=4),
    show_progress_bar=False,
)

ret

[InferenceReturn(inference=0.0, json='[{"inference": 0.0, "hypothesis": "I am not a smart person.", "premise": ["I am a smart person."], "explanation": "[{\\"fact\\": \\"I am not a smart person.\\", \\"explanation\\": \\"The premise states that the person is smart.\\", \\"tag\\": \\"CONT\\"}]"}]', highlight='{"corpus": [{"text": "I am not a smart person.", "score": -1.0, "title": 0.0}, {"text": "", "score": 0.0}]}'),
 InferenceReturn(inference=1.0, json='[{"inference": 1.0, "hypothesis": "I am a smart person.", "premise": ["I am a smart person."], "explanation": "[{\\"fact\\": \\"I am a smart person.\\", \\"explanation\\": \\"The premise states that the speaker is a smart person.\\", \\"tag\\": \\"ENT\\"}]"}]', highlight='{"corpus": [{"text": "I am a smart person.", "score": 0.0, "title": 1.0}, {"text": "", "score": 0.0}]}')]