In [3]:
!pip install datasets langchain chromadb openai deepeval langchain-community

Collecting langchain-community
  Obtaining dependency information for langchain-community from https://files.pythonhosted.org/packages/c8/bc/f8c7dae8321d37ed39ac9d7896617c4203248240a4835b136e3724b3bb62/langchain_community-0.3.27-py3-none-any.whl.metadata
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Obtaining dependency information for dataclasses-json<0.7,>=0.5.7 from https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl.metadata
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Obtaining dependency information for httpx-sse<1.0.0,>=0.4.0 from https://files.pythonhosted.org/packages/25/0a/6269e3473b09aed2dab8aa1a600c70f31f00ae1349bee30658f7e358a159/httpx_sse-0.4.1-py3-none-any.whl.metadata
  Downloading httpx_ss


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from datasets import load_dataset
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

In [7]:
ds = load_dataset("virattt/financial-qa-10K")
dataset = ds["train"]

Generating train split: 100%|██████████| 7000/7000 [00:00<00:00, 40975.78 examples/s]


In [21]:
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context', 'ticker', 'filing'],
        num_rows: 7000
    })
})

In [12]:
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = []
for row in dataset:
    context = row['context']
    docs.extend(splitter.create_documents([context]))

In [16]:
import os
OPENAI_API_KEY="sk-C6d4lxoZnn2bJBx2rHkOxHjCF6PF_cDb9k8BdWseD6T3BlbkFJm9dOsFS-w59fFqk4qNbkWmzhM4jETehAXdNn6jho0A"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [17]:
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(docs, embeddings)

In [18]:
retriever = db.as_retriever(search_kwargs={"k": 1})
llm = ChatOpenAI(model='gpt-4o')

  llm = ChatOpenAI(model='gpt-4o')


In [42]:
results = []

# Use proper dataset iteration - select first 20 rows
sample_dataset = dataset.select(range(20))

for row in sample_dataset:
    query = row['question']
    ground_truth = row['answer']

    retrieved_docs = retriever.get_relevant_documents(query)
    retrieved_context = "\n\n".join([doc.page_content for doc in retrieved_docs])

    prompt = f"""Answer the following question based only on the context below.

Context:
{retrieved_context}

Question:
{query}
"""

    answer = llm.predict(prompt)

    results.append({
        "query": query,
        "context":context,
        "retrieved_context": retrieved_context,
        "generated_answer": answer,
        "ground_truth": ground_truth
    })

print(f"Processed {len(results)} questions successfully!")

Processed 20 questions successfully!


In [43]:
print(results[:5])  # Display first 5 results for verification

[{'query': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?', 'context': 'As of December 31, 2023, SGD 3.69 billion (approximately $2.79 billion at exchange rates in effect on December 31, 2023) remains available to be drawn under the Singapore Delayed Draw Term Facility once the construction cost estimate and construction schedule for the MBS Expansion Project are delivered to lenders.', 'retrieved_context': 'NVIDIA as the leader in computer graphics.', 'generated_answer': 'NVIDIA initially focused on computer graphics.', 'ground_truth': 'NVIDIA initially focused on PC graphics.'}, {'query': 'What are some of the recent applications of GPU-powered deep learning as mentioned by NVIDIA?', 'context': 'As of December 31, 2023, SGD 3.69 billion (approximately $2.79 billion at exchange rates in effect on December 31, 2023) remains available to be drawn under the Singapore Delayed Draw Term Facility once the construction cost estimate and c

In [48]:

from deepeval.metrics import (
    AnswerRelevancyMetric, HallucinationMetric, BiasMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,ContextualRecallMetric,ContextualPrecisionMetric
)


In [None]:
from deepeval.test_case import LLMTestCase

# Prepare test cases for RAGMetric
test_cases = [
    LLMTestCase(
        input=entry['query'],
        actual_output=entry['generated_answer'],
        context=[entry['context']],
        retrieval_context=[entry['retrieved_context']],
        expected_output=entry['ground_truth']
    )
    for entry in results
]

print(f"Prepared {len(test_cases)} test cases for evaluation.")
print(test_cases[:5])  # Display first 5 test cases for verification

# Initialize RAGMetric
metrics = [
    AnswerRelevancyMetric(),
    HallucinationMetric(),
    BiasMetric(),
    ContextualRelevancyMetric(),
    ContextualRecallMetric(),
    ContextualPrecisionMetric(),
    FaithfulnessMetric(),
]

# # Run evaluations

metric_results = {}
for metric in metrics:
    scores = []
    for test_case in test_cases:
        metric.measure(test_case)
        scores.append(metric.score)
    average_score = sum(scores) / len(scores)
    metric_results[metric.__class__.__name__] = average_score

In [50]:
print(metric_results)

{'AnswerRelevancyMetric': 0.8761904761904763, 'HallucinationMetric': 1.0, 'BiasMetric': 0.0, 'ContextualRelevancyMetric': 0.725, 'ContextualRecallMetric': 0.7416666666666667, 'ContextualPrecisionMetric': 0.75, 'FaithfulnessMetric': 1.0}


In [59]:
import requests


BASE_URL = "https://qa-backend.cognitiveview.com"
AUTH_TOKEN ="Bearer eyJhbGciOiJSUzI1NiIsImNhdCI6ImNsX0I3ZDRQRDIyMkFBQSIsImtpZCI6Imluc18yckY5Qll3RDh6WHBnMGI1T0t1bnlUeFA4d0UiLCJ0eXAiOiJKV1QifQ.eyJhenAiOiJodHRwczovL3FhLWZyb250ZW5kLmNvZ25pdGl2ZXZpZXcuY29tIiwiZXhwIjoxNzUxNDk5ODI1LCJpYXQiOjE3NTE0Mzk4MjUsImlzcyI6Imh0dHBzOi8vc3VwZXJiLW9jdG9wdXMtOTIuY2xlcmsuYWNjb3VudHMuZGV2IiwianRpIjoiYTk4OWJiMDdmOTRlNmIzYzlmOTUiLCJuYmYiOjE3NTE0Mzk2MjUsInN1YiI6InVzZXJfMnhkS1AyZGRDVFlXQ1YyQzJWTjZTTmZYRlRUIiwidXNlcl9pbmZvIjp7ImZpcnN0X25hbWUiOiJBc2h1dG9zaCAiLCJpZCI6InVzZXJfMnhkS1AyZGRDVFlXQ1YyQzJWTjZTTmZYRlRUIiwiaW5mbyI6eyJjbGllbnRfaWQiOiJDNDczNDIxIiwidGVuYW50X2lkIjoiVDE4MTc1MSIsInVzZXJfaWQiOiJVLTcwODcyMSJ9LCJsYXN0X25hbWUiOiJQYWRoaSIsIm1ldGFfZGF0YSI6eyJjbGllbnRfaWQiOiJDNDczNDIxIiwidGVuYW50X2lkIjoiVDE4MTc1MSIsInVzZXJfaWQiOiJVLTcwODcyMSJ9fX0.HAu6Qm-BBId5xLHv_IfbWO2pG_6wmGHry8IWd6ru0yo1sLKJSMoLc1ODZUDSJgyydh1zIEc44lBRrxUhG_-e8ApdWmBPYSnHXkxXnHV3VDARjHekYyiCMqIFdOk07esFcbDM4MQvNSZ-nlMm3639Lrlow7ZwRJHjVxeb6wZ5JUNgRwU-XWGnJDoeolk-afqB1ryn0P9OaqigyjMZK-IJN30Vz_IbW3vIDicarzL9P6w4yWa4XhIYyXW1zEcJfHdT-_OHyyJQC2HSGlO7yMIvayoxNfhb-518VBH8TLvg_3rnOLQD7oIjRW1hB3p-fd41i2BHWSJ_2ZFIFlPINMW6iA"  # Replace with your actual token
url = f"{BASE_URL}/cv/v1/metrics"

headers = {
    "Authorization": AUTH_TOKEN,
    "Content-Type": "application/json",
    "X-User-Id": "C473421_T181751",  
}

payload = {
  "metric_metadata": {
    "application_name": "chat-application",
    "version": "1.0.0",
    "resource_name": "chat-completion",
    "resource_id": "R-756",
    "provider": "deepeval",
    "use_case": "transportation"
  },
  "metric_data": {
    "resource_id": "res_123456",
    "resource_name": "chat-completion",
    "deepeval": metric_results,
  } 
}

response = requests.post(url, headers=headers, json=payload)

# Output the response
print(f"Status Code: {response.status_code}")
print("Response JSON:", response.json())

Status Code: 201
Response JSON: {'message': 'Metrics ingested, and evaluation completed.', 'report_id': 'gW3eQpV63sRTrQey9uJPNp'}


In [None]:
import requests

def fetch_report_result(report_id, auth_token, user_id):
    """
    Fetches the result of a report from the CognitiveView API.

    Args:
        report_id (str): The ID of the report to fetch.
        auth_token (str): The authorization token for the API.
        user_id (str): The user ID for the API.

    Returns:
        dict: The JSON response from the API if successful, else None.
    """
    base_url = "https://qa-backend.cognitiveview.com"
    endpoint = f"/cv/v1/metrics/{report_id}"
    url = base_url + endpoint

    headers = {
        "Authorization": auth_token,
        "Content-Type": "application/json",
        "X-User-Id": user_id,
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch report. Status code: {response.status_code}")
        return None

# Example usage:
# AUTH_TOKEN = "Bearer eyJhbGciOiJSUzI1NiIsImNhdCI6ImNsX0I3ZDRQRDIyMkFBQSIsImtpZCI6Imluc18yckY5Qll3RDh6WHBnMGI1T0t1bnlUeFA4d0UiLCJ0eXAiOiJKV1QifQ.eyJhenAiOiJodHRwczovL3FhLWZyb250ZW5kLmNvZ25pdGl2ZXZpZXcuY29tIiwiZXhwIjoxNzUxNDk5ODI1LCJpYXQiOjE3NTE0Mzk4MjUsImlzcyI6Imh0dHBzOi8vc3VwZXJiLW9jdG9wdXMtOTIuY2xlcmsuYWNjb3VudHMuZGV2IiwianRpIjoiYTk4OWJiMDdmOTRlNmIzYzlmOTUiLCJuYmYiOjE3NTE0Mzk2MjUsInN1YiI6InVzZXJfMnhkS1AyZGRDVFlXQ1YyQzJWTjZTTmZYRlRUIiwidXNlcl9pbmZvIjp7ImZpcnN0X25hbWUiOiJBc2h1dG9zaCAiLCJpZCI6InVzZXJfMnhkS1AyZGRDVFlXQ1YyQzJWTjZTTmZYRlRUIiwiaW5mbyI6eyJjbGllbnRfaWQiOiJDNDczNDIxIiwidGVuYW50X2lkIjoiVDE4MTc1MSIsInVzZXJfaWQiOiJVLTcwODcyMSJ9LCJsYXN0X25hbWUiOiJQYWRoaSIsIm1ldGFfZGF0YSI6eyJjbGllbnRfaWQiOiJDNDczNDIxIiwidGVuYW50X2lkIjoiVDE4MTc1MSIsInVzZXJfaWQiOiJVLTcwODcyMSJ9fX0.HAu6Qm-BBId5xLHv_IfbWO2pG_6wmGHry8IWd6ru0yo1sLKJSMoLc1ODZUDSJgyydh1zIEc44lBRrxUhG_-e8ApdWmBPYSnHXkxXnHV3VDARjHekYyiCMqIFdOk07esFcbDM4MQvNSZ-nlMm3639Lrlow7ZwRJHjVxeb6wZ5JUNgRwU-XWGnJDoeolk-afqB1ryn0P9OaqigyjMZK-IJN30Vz_IbW3vIDicarzL9P6w4yWa4XhIYyXW1zEcJfHdT-_OHyyJQC2HSGlO7yMIvayoxNfhb-518VBH8TLvg_3rnOLQD7oIjRW1hB3p-fd41i2BHWSJ_2ZFIFlPINMW6iA"  # Replace with your actual token
report_id = "gW3eQpV63sRTrQey9uJPNp"  # Replace with the actual report ID you want to fetch
user_id = "C473421_T181751"  # Replace with your actual user ID
report = fetch_report_result(report_id, AUTH_TOKEN, user_id)
print(report)

In [72]:
print(report)

{'report_id': 'gW3eQpV63sRTrQey9uJPNp', 'application_id': 'DOC-2vwKEOHqH1fWLbcR', 'provider': 'deepeval', 'use_case': 'transportation', 'application_name': 'chat-application', 'resource_type': 'genai', 'pillars': [{'pillar': 'performance', 'score': 0.49, 'colour': '🔴', 'metrics_count': 3}, {'pillar': 'fairness_and_bias', 'score': 0.55, 'colour': '🟠', 'metrics_count': 5}, {'pillar': 'safety_and_truthfulness', 'score': 0.67, 'colour': '🟠', 'metrics_count': 3}, {'pillar': 'task_adherence', 'score': 0.0, 'colour': '🔴', 'metrics_count': 0}, {'pillar': 'reliability', 'score': 0.0, 'colour': '🔴', 'metrics_count': 0}, {'pillar': 'robustness', 'score': 0.0, 'colour': '🔴', 'metrics_count': 0}, {'pillar': 'privacy', 'score': 0.0, 'colour': '🔴', 'metrics_count': 0}], 'metrics': [{'metric_name': 'AnswerRelevancyMetric', 'canonical_details': [{'name': 'relevance_and_accuracy', 'description': 'Measures how well responses align with the user’s query and expected information.'}, {'name': 'factuality_an

In [76]:
import pandas as pd

# Normalize the main sections of the JSON
pillars_df = pd.json_normalize(report, record_path=['pillars'])
metrics_df = pd.json_normalize(report, record_path=['metrics'])
business_impact_df = pd.json_normalize(metrics_df.to_dict(orient='records'), record_path=['business_impact'], meta=['metric_name'])
action_details_df = pd.json_normalize(metrics_df.to_dict(orient='records'), record_path=['action_details'], meta=['metric_name'])

# Display the tables
print("Pillars Table:")
display(pillars_df)

print("\nMetrics Table:")
display(metrics_df)

print("\nBusiness Impact Table:")
display(business_impact_df)

print("\nAction Details Table:")
display(action_details_df)

Pillars Table:


Unnamed: 0,pillar,score,colour,metrics_count
0,performance,0.49,🔴,3
1,fairness_and_bias,0.55,🟠,5
2,safety_and_truthfulness,0.67,🟠,3
3,task_adherence,0.0,🔴,0
4,reliability,0.0,🔴,0
5,robustness,0.0,🔴,0
6,privacy,0.0,🔴,0



Metrics Table:


Unnamed: 0,metric_name,canonical_details,common_metric_name,common_metric_description,raw_value,original_value,risk_score,risk_band,technical_risk_name,technical_risk_id,technical_risk_description,pillar,threshold_min,threshold_max,better_high,control_details,common_risk_details,business_impact,action_details
0,AnswerRelevancyMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",answer_relevance,Measures how relevant the model's response is ...,0.87619,87.619048,0.295238,med,low_answer_relevance,TR-ME-001,Responses are not relevant to the user’s query...,[performance],0.7,0.95,True,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-011', 'name': 'Untruthful Conten...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-001', 'ml_engineer_action..."
1,ContextualPrecisionMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",context_precision,Assesses how much relevant context is captured...,0.75,75.0,0.6,med,low_context_precision,TR-ME-006,"Irrelevant context is included, distracting fr...","[performance, fairness_and_bias]",0.65,0.9,True,"[{'id': 'CTRL-009', 'name': 'Sensitive Data an...","[{'id': 'RISK-031', 'name': 'Privacy Leakage i...","[{'id': 'IMP-018', 'name': 'Misconfiguration E...","[{'action_id': 'ACTN-006', 'ml_engineer_action..."
2,ContextualRecallMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",context_recall,Measures how much of the relevant context is r...,0.741667,74.166667,0.633333,med,low_context_recall,TR-ME-007,"Relevant context is omitted, leading to incomp...","[performance, fairness_and_bias]",0.65,0.9,True,"[{'id': 'CTRL-009', 'name': 'Sensitive Data an...","[{'id': 'RISK-036', 'name': 'Privacy Leakage T...","[{'id': 'IMP-018', 'name': 'Misconfiguration E...","[{'action_id': 'ACTN-007', 'ml_engineer_action..."
3,FaithfulnessMetric,"[{'name': 'factuality_and_faithfulness', 'desc...",faithfulness,Checks whether outputs remain factually accurate.,1.0,100.0,0.0,low,unfaithful_generation_risk,TR-ME-016,Generated content is not supported by source d...,"[fairness_and_bias, safety_and_truthfulness]",0.8,0.98,True,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-015', 'name': 'Faithfulness Erro...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-016', 'ml_engineer_action..."
4,BiasMetric,"[{'name': 'safety', 'description': 'Checks for...",bias,Detects unfair or prejudiced content in respon...,0.0,0.0,0.0,low,model_bias_risk,TR-ME-004,Outputs reflect unfair or prejudiced patterns ...,"[fairness_and_bias, safety_and_truthfulness]",0.0,0.1,False,"[{'id': 'CTRL-040', 'name': 'Bias Instance Log...","[{'id': 'RISK-070', 'name': 'Bias in Language ...","[{'id': 'IMP-003', 'name': 'Export Control Ris...","[{'action_id': 'ACTN-004', 'ml_engineer_action..."
5,HallucinationMetric,"[{'name': 'factuality_and_faithfulness', 'desc...",hallucination,Identifies content that deviates from known fa...,1.0,100.0,1.0,high,hallucination_risk,TR-ME-018,Model produces fabricated or ungrounded inform...,"[fairness_and_bias, safety_and_truthfulness]",0.0,0.1,False,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-037', 'name': 'LLM Hallucination...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-017', 'ml_engineer_action..."



Business Impact Table:


Unnamed: 0,id,name,description,category,code,severity,example,risk_types,explaination,created_at,updated_at,metric_name
0,IMP-001,User Churn,Impact related to user churn due to AI system ...,User Churn,CX,Critical,Example scenario involving user churn in live ...,"[PromptInjection, PoorUX]",Irrelevant answers drive frustration and user ...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,AnswerRelevancyMetric
1,IMP-012,Control Gap,Impact related to control gap due to AI system...,Control Gap,GOV,Critical,Example scenario involving control gap in live...,"[PromptInjection, PoorUX]","Skewed scoring hides issues, leaving governanc...",2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,AnswerRelevancyMetric
2,IMP-018,Misconfiguration Exploit,Impact related to misconfiguration exploit due...,Misconfiguration Exploit,SEC,Critical,Example scenario involving misconfiguration ex...,"[UndocumentedChange, Bias]",Mis-wired or insecure tool calls open vectors ...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,AnswerRelevancyMetric
3,IMP-022,Incorrect Response,Impact related to incorrect response due to AI...,Incorrect Response,CX,Medium,Example scenario involving incorrect response ...,"[Bias, UndocumentedChange]",Wrong answers create downstream errors and re-...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,AnswerRelevancyMetric
4,IMP-027,User Churn,Impact related to user churn due to AI system ...,User Churn,CX,Critical,Example scenario involving user churn in live ...,"[PoorUX, Bias]",Irrelevant responses frustrate users and drive...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,AnswerRelevancyMetric
...,...,...,...,...,...,...,...,...,...,...,...,...
113,IMP-062,FTC Complaint,Impact related to ftc complaint due to AI syst...,FTC Complaint,REG,Low,Example scenario involving ftc complaint in li...,"[AccessControl, PromptInjection]",Misleading or false claims can trigger consume...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,HallucinationMetric
114,IMP-067,Influencer Criticism,Impact related to influencer criticism due to ...,Influencer Criticism,REP,Medium,Example scenario involving influencer criticis...,"[Drift, AccessControl]",Factually incorrect or misleading outputs draw...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,HallucinationMetric
115,IMP-068,Influencer Criticism,Impact related to influencer criticism due to ...,Influencer Criticism,REP,Low,Example scenario involving influencer criticis...,"[AccessControl, Bias]",Factually incorrect or misleading outputs draw...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,HallucinationMetric
116,IMP-071,FTC Complaint,Impact related to ftc complaint due to AI syst...,FTC Complaint,REG,High,Example scenario involving ftc complaint in li...,"[Drift, Bias]",Misleading or false claims can trigger consume...,2025-07-07T11:04:59.126913Z,2025-07-07T11:04:59.126913Z,HallucinationMetric



Action Details Table:


Unnamed: 0,action_id,ml_engineer_action,business_manager_action,compliance_manager_action,description,source,control_ids,common_metric,metric_name
0,ACTN-001,"[Refine prompts, improve data quality, fine-tu...",[Ensure product requirements are aligned with ...,[Verify that the model outputs comply with con...,This control ensures that the model consistent...,[https://docs.ragas.io/en/v0.1.21/concepts/met...,[CTRL-010],answer_relevance,AnswerRelevancyMetric
1,ACTN-006,"[Optimize retrieval ranking, implement reranki...",[Ensure retriever system performance aligns wi...,[Maintain documentation of retrieval configura...,Improving contextual precision in retrieval sy...,[https://docs.ragas.io/en/v0.1.21/concepts/met...,[CTRL-009],context_precision,ContextualPrecisionMetric
2,ACTN-007,"[Increase top-k values, implement hybrid searc...",[Monitor retrieval and dialogue performance to...,[Document search and dialogue logic to meet tr...,Enhancing retrieval and conversational complet...,[https://docs.ragas.io/en/v0.1.21/concepts/met...,[CTRL-009],context_recall,ContextualRecallMetric
3,ACTN-016,[Ground generations in retrieved source docume...,[Ensure product quality by requiring traceable...,[Mandate grounding and entailment validation p...,Promoting factual generation by grounding outp...,[],[CTRL-010],faithfulness,FaithfulnessMetric
4,ACTN-004,"[Audit dataset balance, run automated bias sca...",[Promote inclusive design practices and ensure...,"[Establish procedures for bias evaluation, mit...",Ensuring models are assessed and improved for ...,[https://www.deepeval.com/docs/metrics-bias],[CTRL-040],bias,BiasMetric
5,ACTN-017,[Inject retrieval grounding and run claim-leve...,[Ensure generated content aligns with verifiab...,[Implement fact-checking protocols to validate...,Improving factual reliability by grounding gen...,[],[CTRL-010],hallucination,HallucinationMetric


In [75]:
metrics_df = pd.json_normalize(report, record_path=['metrics'])
display(metrics_df)


Unnamed: 0,metric_name,canonical_details,common_metric_name,common_metric_description,raw_value,original_value,risk_score,risk_band,technical_risk_name,technical_risk_id,technical_risk_description,pillar,threshold_min,threshold_max,better_high,control_details,common_risk_details,business_impact,action_details
0,AnswerRelevancyMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",answer_relevance,Measures how relevant the model's response is ...,0.87619,87.619048,0.295238,med,low_answer_relevance,TR-ME-001,Responses are not relevant to the user’s query...,[performance],0.7,0.95,True,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-011', 'name': 'Untruthful Conten...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-001', 'ml_engineer_action..."
1,ContextualPrecisionMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",context_precision,Assesses how much relevant context is captured...,0.75,75.0,0.6,med,low_context_precision,TR-ME-006,"Irrelevant context is included, distracting fr...","[performance, fairness_and_bias]",0.65,0.9,True,"[{'id': 'CTRL-009', 'name': 'Sensitive Data an...","[{'id': 'RISK-031', 'name': 'Privacy Leakage i...","[{'id': 'IMP-018', 'name': 'Misconfiguration E...","[{'action_id': 'ACTN-006', 'ml_engineer_action..."
2,ContextualRecallMetric,"[{'name': 'relevance_and_accuracy', 'descripti...",context_recall,Measures how much of the relevant context is r...,0.741667,74.166667,0.633333,med,low_context_recall,TR-ME-007,"Relevant context is omitted, leading to incomp...","[performance, fairness_and_bias]",0.65,0.9,True,"[{'id': 'CTRL-009', 'name': 'Sensitive Data an...","[{'id': 'RISK-036', 'name': 'Privacy Leakage T...","[{'id': 'IMP-018', 'name': 'Misconfiguration E...","[{'action_id': 'ACTN-007', 'ml_engineer_action..."
3,FaithfulnessMetric,"[{'name': 'factuality_and_faithfulness', 'desc...",faithfulness,Checks whether outputs remain factually accurate.,1.0,100.0,0.0,low,unfaithful_generation_risk,TR-ME-016,Generated content is not supported by source d...,"[fairness_and_bias, safety_and_truthfulness]",0.8,0.98,True,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-015', 'name': 'Faithfulness Erro...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-016', 'ml_engineer_action..."
4,BiasMetric,"[{'name': 'safety', 'description': 'Checks for...",bias,Detects unfair or prejudiced content in respon...,0.0,0.0,0.0,low,model_bias_risk,TR-ME-004,Outputs reflect unfair or prejudiced patterns ...,"[fairness_and_bias, safety_and_truthfulness]",0.0,0.1,False,"[{'id': 'CTRL-040', 'name': 'Bias Instance Log...","[{'id': 'RISK-070', 'name': 'Bias in Language ...","[{'id': 'IMP-003', 'name': 'Export Control Ris...","[{'action_id': 'ACTN-004', 'ml_engineer_action..."
5,HallucinationMetric,"[{'name': 'factuality_and_faithfulness', 'desc...",hallucination,Identifies content that deviates from known fa...,1.0,100.0,1.0,high,hallucination_risk,TR-ME-018,Model produces fabricated or ungrounded inform...,"[fairness_and_bias, safety_and_truthfulness]",0.0,0.1,False,"[{'id': 'CTRL-010', 'name': 'LLM Fact-Verifica...","[{'id': 'RISK-037', 'name': 'LLM Hallucination...","[{'id': 'IMP-001', 'name': 'User Churn', 'desc...","[{'action_id': 'ACTN-017', 'ml_engineer_action..."


In [74]:
# Check the first few rows to understand the structure
print("Dataset type:", type(dataset))
print("First row type:", type(dataset[0]))
print("First row:", dataset[0])
print("\nColumn names:", dataset.column_names)
print("Dataset features:", dataset.features)

Dataset type: <class 'datasets.arrow_dataset.Dataset'>
First row type: <class 'dict'>
First row: {'question': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?', 'answer': 'NVIDIA initially focused on PC graphics.', 'context': 'Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields.', 'ticker': 'NVDA', 'filing': '2023_10K'}

Column names: ['question', 'answer', 'context', 'ticker', 'filing']
Dataset features: {'question': Value('string'), 'answer': Value('string'), 'context': Value('string'), 'ticker': Value('string'), 'filing': Value('string')}
