In [1]:
!pip install -q -U ragas datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import json
from datasets import Dataset
from datasets import load_dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    faithfulness,
)

# Initialization

In [17]:
import os
from getpass import getpass

# Never store the key in the notebook. A key that is already present in the
# environment is kept as-is; otherwise it is requested interactively (the
# input is not echoed and does not end up in the saved notebook).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [18]:
# Chat model handed to the evaluation calls further down (passed as `llm=`).
from langchain_openai.chat_models import ChatOpenAI

# Dated model snapshot, kept in one place so it is easy to swap.
JUDGE_MODEL_NAME = "gpt-4-turbo-2024-04-09"
gpt4 = ChatOpenAI(model_name=JUDGE_MODEL_NAME)

In [19]:
# Configure tracing of the evaluation runs with LangChain.
from getpass import getpass

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Never store the key in the notebook. A key that is already present in the
# environment is kept as-is; otherwise it is requested interactively.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass("LangChain API key: ")

In [20]:
# Tracer callback; it is passed to the evaluation calls via `callbacks=[tracer]`.
from langchain.callbacks.tracers import LangChainTracer

# Project name given to the tracer.
TRACING_PROJECT = "test"
tracer = LangChainTracer(project_name=TRACING_PROJECT)

# Data loading

In [2]:
# Directory holding the input JSON files, relative to the working directory
# the notebook is run from.
DATA_PATH = './data'

In [5]:
# MedQA evaluation subset (150 positive and 50 negative samples), read into a
# DataFrame.
medqa_samples_file = f'{DATA_PATH}/medqa_150_50_samples_ragas.json'
medqa_samples = pd.read_json(medqa_samples_file)

In [6]:
# Load zephyr predictions.
# The encoding is given explicitly: without it `open` falls back to the
# locale's default encoding, which can fail or mis-decode non-ASCII text on
# platforms where that default is not UTF-8.
with open(f'{DATA_PATH}/zephyr_preds_medqa_4opt.json', encoding='utf-8') as f:
    zephyr_preds_medqa = json.load(f)

In [7]:
# Load retrieved contexts.
# The encoding is given explicitly: without it `open` falls back to the
# locale's default encoding, which can fail or mis-decode non-ASCII text on
# platforms where that default is not UTF-8.
with open(f'{DATA_PATH}/medqa_opt4_rerank_usemedwiki.json', encoding='utf-8') as f:
    retrieved_ctxs = json.load(f)

In [8]:
# Extract the four answer options from the question text
def _parse_options(question_text):
    """Return the answer options embedded in ``question_text`` as a dict.

    The text is expected to list the options on separate lines introduced by
    "A. ", "B. ", "C. " and "D. ". Each option is the text between its own
    marker and the next one; option D runs to the end of the string.
    A missing marker raises ``IndexError``.
    """
    parsed = {}
    for letter, next_letter in zip("ABC", "BCD"):
        after_marker = question_text.split(f"\n{letter}. ")[1]
        parsed[letter] = after_marker.split(f"\n{next_letter}. ")[0]
    parsed["D"] = question_text.split("\nD. ")[1]
    return parsed


options = [_parse_options(str(question)) for question in medqa_samples["question"]]

# Store the parsed options next to each question as a new column
medqa_samples["options"] = options

# Evaluation

In [25]:
# Number of samples to evaluate. The evaluation cells below take the first
# N_SAMPLES rows of medqa_samples, so raise this value to evaluate more rows.
N_SAMPLES = 2

## Generated contexts

Evaluation with the generated contexts, covering both the correctly and the incorrectly answered questions.

In [26]:
# Rows under evaluation. Every column below is derived from this single slice,
# so all columns keep the same length even when N_SAMPLES is larger than the
# number of available samples (indexing rows by range(N_SAMPLES) would raise
# an IndexError in that case while the sliced columns silently truncate).
eval_rows = medqa_samples.iloc[:N_SAMPLES]

# Raw model outputs, looked up by the index label of each evaluated row.
raw_answers = [zephyr_preds_medqa['outputs'][idx]['answer'] for idx in eval_rows.index]

dt = {
    # Replace the option markers ("A. " ... "D. ") with neutral bullets.
    "question": [
        q.replace("\nA. ", "\n- ")
         .replace("\nB. ", "\n- ")
         .replace("\nC. ", "\n- ")
         .replace("\nD. ", "\n- ")
        for q in eval_rows["question"]
    ],
    # Text of every context stored with the sample.
    "contexts": [[c["text"] for c in ctx] for ctx in eval_rows["ctxs"]],
    # Text of the option selected by the sample's "target" key.
    "ground_truth": [
        opts[target]
        for opts, target in zip(eval_rows["options"], eval_rows["target"])
    ],
    # Keep only the text after the first ")" (presumably the answers look like
    # "A) text" - verify against the predictions file). An answer without ")"
    # is kept whole instead of aborting the cell with an IndexError.
    "answer": [
        (raw.partition(')')[2] if ')' in raw else raw).strip()
        for raw in raw_answers
    ],
    "correct_answer": eval_rows['correct_answer'].to_list(),
}

medqa_to_eval_w_ops = Dataset.from_dict(dt)

In [27]:
# Metrics computed for the generated-context dataset; the order is kept as
# context_precision, faithfulness, context_recall.
ragas_metrics = [context_precision, faithfulness, context_recall]

# Run the evaluation with gpt4 as the LLM and trace it through `tracer`.
result = evaluate(
    medqa_to_eval_w_ops,
    metrics=ragas_metrics,
    llm=gpt4,
    callbacks=[tracer],
)

# Tabular view of the scores.
df = result.to_pandas()

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

In [28]:
# Preview the first rows of the evaluation result (inputs plus metric columns).
df.head()

Unnamed: 0,question,contexts,ground_truth,answer,correct_answer,context_precision,faithfulness,context_recall
0,A 47-year-old woman with metastatic breast can...,[Filgrastim is a colony-stimulating factor tha...,Filgrastim,Filgrastim.,True,1.0,0.2,1.0
1,A 9-month-old boy is brought to his pediatrici...,[The laboratory results reveal that the patien...,Proper diet and iron supplementation,Proper diet and iron supplementation.,True,1.0,0.25,1.0


In [29]:
# Save the results as CSV in the current working directory.
# `df` is reassigned by every evaluation cell, so run this cell right after the
# generated-context evaluation. (`df.to_json` is the alternative if a JSON
# export is preferred.)
generated_results_csv = "generated_metric_result_medqa_w_ops.csv"
df.to_csv(generated_results_csv)

## Retrieved contexts

Evaluation with the retrieved contexts, covering both the correctly and the incorrectly answered questions.

In [30]:
# Number of retrieved contexts kept per question (the first ones in the list).
TOP_K_CONTEXTS = 5

# Rows under evaluation. Every column below is derived from this single slice,
# so all columns keep the same length even when N_SAMPLES is larger than the
# number of available samples (indexing rows by range(N_SAMPLES) would raise
# an IndexError in that case while the sliced columns silently truncate).
eval_rows = medqa_samples.iloc[:N_SAMPLES]

# Retriever records, looked up by the index label of each evaluated row.
retrieved_records = [retrieved_ctxs["questions"][idx] for idx in eval_rows.index]

dt = {
    # Replace the option markers ("A. " ... "D. ") with neutral bullets.
    "question": [
        q.replace("\nA. ", "\n- ")
         .replace("\nB. ", "\n- ")
         .replace("\nC. ", "\n- ")
         .replace("\nD. ", "\n- ")
        for q in eval_rows["question"]
    ],
    "contexts": [list(record["context"][:TOP_K_CONTEXTS]) for record in retrieved_records],
    # Text of the option selected by the sample's "target" key.
    "ground_truth": [
        opts[target]
        for opts, target in zip(eval_rows["options"], eval_rows["target"])
    ],
    # Keep only the text after the first ")" (presumably the responses look
    # like "A) text" - verify against the retrieval file). A response without
    # ")" is kept whole instead of aborting the cell with an IndexError.
    "answer": [
        (record['response'].partition(')')[2] if ')' in record['response'] else record['response']).strip()
        for record in retrieved_records
    ],
    "correct_answer": eval_rows['correct_answer'].to_list(),
}

medqa_to_eval_retriever_w_ops_positive = Dataset.from_dict(dt)

In [31]:
# Metrics computed for the retrieved-context dataset; the order is kept as
# context_precision, faithfulness, context_recall.
ragas_metrics = [context_precision, faithfulness, context_recall]

# Run the evaluation with gpt4 as the LLM and trace it through `tracer`.
result = evaluate(
    medqa_to_eval_retriever_w_ops_positive,
    metrics=ragas_metrics,
    llm=gpt4,
    callbacks=[tracer],
)

# Tabular view of the scores.
df = result.to_pandas()

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

In [32]:
# Preview the first rows of the evaluation result (inputs plus metric columns).
df.head()

Unnamed: 0,question,contexts,ground_truth,answer,correct_answer,context_precision,faithfulness,context_recall
0,A 47-year-old woman with metastatic breast can...,[Leukopaenia a comparatively low white blood c...,Filgrastim,Filgrastim.,True,0.25,0.2,1.0
1,A 9-month-old boy is brought to his pediatrici...,[The child's autopsy indicated he likely died ...,Proper diet and iron supplementation,Proper diet and iron supplementation.,True,0.0,0.0,0.0


In [33]:
# Save the results as CSV in the current working directory.
# `df` is reassigned by every evaluation cell, so run this cell right after the
# retrieved-context evaluation. (`df.to_json` is the alternative if a JSON
# export is preferred.)
retrieved_results_csv = "retrieved_results_medqa_w_ops.csv"
df.to_csv(retrieved_results_csv)