# Open inference files

In [1]:
import os
import ast
import numpy as np
import pandas as pd
import warnings

from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2

from medqa.classical_metrics import get_bleu_score
from medqa.classical_metrics import get_rouge_scores

# Silence every warning raised from inside the nltk package so the metric
# loops below do not flood the cell output.
# NOTE(review): presumably these come from the BLEU computation — confirm.
warnings.filterwarnings("ignore", module='nltk')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Douglas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Douglas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Collect every entry of the "metrics" folder as a relative path.
# os.listdir() returns names in arbitrary order, so sort them: the processing
# order (and the column order of the final summary table) is then the same on
# every platform and on every run.
metrics_files = [os.path.join("metrics", name) for name in sorted(os.listdir("metrics"))]
metrics_files

['metrics\\glove_model_metrics.zip',
 'metrics\\qa_retrieval_metrics.zip',
 'metrics\\zero_shot_metrics.zip']

In [7]:
def compute_nlp_metrics(df, max_judge_score=6.0):
    """Summarise LLM-judge, BLEU and ROUGE scores for a table of QA results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per question/answer pair. Three columns are read:

        * ``"metrics"`` - raw judge output: text holding a list of numbers,
          e.g. ``"[2, 1, 2, 1]"``. Missing opening and/or closing brackets
          are repaired before parsing.
        * ``"answer"`` - text that ``ast.literal_eval`` turns into the
          reference answers passed to the BLEU/ROUGE helpers.
        * ``"candidate_ans"`` - the generated answer being evaluated.
    max_judge_score : float, optional
        Value the summed judge sub-scores are divided by to normalise them.
        Defaults to ``6.0``, the constant this function has always used.

    Returns
    -------
    dict
        ``"number_of_qa_pairs"`` plus an ``"<name>_avg"`` / ``"<name>_std"``
        pair (numpy mean / standard deviation) for each of ``LLM_judge``,
        ``bleu``, ``rouge1``, ``rouge2`` and ``rougeL``. All statistics are
        numeric so the report can be tabulated and compared directly.

    Raises
    ------
    SyntaxError, ValueError
        Propagated from ``ast.literal_eval`` when a ``"metrics"`` or
        ``"answer"`` cell cannot be parsed even after bracket repair.
    """
    scores = {"LLM_judge": [], "bleu": [], "rouge1": [], "rouge2": [], "rougeL": []}

    # --- LLM-judge score ---------------------------------------------------
    for raw_judgement in df["metrics"]:
        eval_string = str(raw_judgement).strip()

        # since the LLM is small, it sometimes forgets one or both brackets;
        # add only the one(s) actually missing so "2, 1]" does not turn
        # into the unparsable "[2, 1]]"
        if not eval_string.startswith("["):
            eval_string = "[" + eval_string
        if not eval_string.endswith("]"):
            eval_string = eval_string + "]"
        partial_scores = ast.literal_eval(eval_string)

        # since the LLM is small, sometimes the judge's total exceeds the
        # maximum, so the normalised score is capped at 1
        scores["LLM_judge"].append(min(1.0, np.sum(partial_scores) / max_judge_score))

    # --- BLEU and ROUGE ----------------------------------------------------
    answer_pairs = zip(df["answer"], df["candidate_ans"])
    for ref_repr, cand_answer in tqdm(answer_pairs, desc="Computing scores", total=len(df)):
        ref_answers = ast.literal_eval(ref_repr)
        scores["bleu"].append(get_bleu_score(ref_answers, cand_answer))

        # the first three entries of the helper's result are stored as
        # ROUGE-1, ROUGE-2 and ROUGE-L, in that order
        rouge_scores = get_rouge_scores(ref_answers, cand_answer)
        scores["rouge1"].append(rouge_scores[0])
        scores["rouge2"].append(rouge_scores[1])
        scores["rougeL"].append(rouge_scores[2])

    # --- aggregate ---------------------------------------------------------
    # `scores` keeps insertion order, so the report keys come out as
    # LLM_judge, bleu, rouge1, rouge2, rougeL - each as an avg/std pair.
    report = {"number_of_qa_pairs": len(df)}
    for name, values in scores.items():
        report[f"{name}_avg"] = np.mean(values)
        report[f"{name}_std"] = np.std(values)
    return report


In [8]:
# Build one report per metrics file and tag it with the path it came from,
# so the reports can later be told apart when tabulated.
all_metrics = list()
for file in metrics_files:
    df = pd.read_csv(file)
    metrics = compute_nlp_metrics(df)
    metrics.update(file=file)
    all_metrics += [metrics]

Computing scores:   0%|          | 0/1634 [00:00<?, ?it/s]

Computing scores:   0%|          | 0/1634 [00:00<?, ?it/s]

Computing scores:   0%|          | 0/1634 [00:00<?, ?it/s]

In [9]:
# Side-by-side comparison: one row per statistic, one column per metrics file.
pd.DataFrame(all_metrics).set_index("file").transpose()

file,metrics\glove_model_metrics.zip,metrics\qa_retrieval_metrics.zip,metrics\zero_shot_metrics.zip
number_of_qa_pairs,1634.0,1634.0,1634.0
LLM_judge_avg,0.0,1.0,0.852407
LLM_judge_std,0.0,0.0,0.231246
bleu_avg,0.157639,0.034624,0.013353
bleu_std,0.265394,0.056064,0.017178
rouge1_avg,0.382012,0.30111,0.266303
rouge1_std,0.23621,0.110183,0.089088
rouge2_avg,0.211948,0.087507,0.057408
rouge2_std,0.278014,0.072245,0.033116
rougeL_avg,0.289674,0.178179,0.137508


In [6]:
# Peek at the raw rows behind the scores. `df` is not defined in this cell:
# it is whatever frame the file-loading loop last bound to that name.
# NOTE(review): this cell's execution count is out of sequence with the cells
# before it — confirm it still shows the intended file after Restart & Run All.
df

Unnamed: 0,question,answer,candidate_ans,metrics,reasoning
0,Do you have information about Acupuncture,"[""Summary : Acupuncture has been practiced in ...",\n\nAcupuncture is a traditional Chinese medic...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1,Do you have information about Advance Directives,['Summary : What kind of medical care would yo...,\n\nAn **advance directive** is a legal docume...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
2,Do you have information about Breast Reconstru...,"[""Summary : If you need a mastectomy, you have...",\n\nBreast reconstruction is a complex and ind...,"[2, 1, 1, 0]","<think>\nOkay, let's tackle this evaluation. T..."
3,Do you have information about Breastfeeding,"[""Summary : Breastfeeding offers many benefits...",\n\nBreastfeeding is a natural and beneficial ...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
4,Do you have information about Cancer Chemotherapy,"['Summary : Normally, your cells grow and die ...",\n\nChemotherapy is a systemic treatment that ...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
...,...,...,...,...,...
1629,what research (or clinical trials) is being do...,['The NINDS supports and conducts a wide range...,\n\nSotos Syndrome (SS) is a rare genetic diso...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1630,what research (or clinical trials) is being do...,"[""The National Institute of Neurological Disor...",\n\nResearch and clinical trials for stroke fo...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1631,what research (or clinical trials) is being do...,['The National Institute of Neurological Disor...,\n\nThoracic Outlet Syndrome (TOS) is a comple...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1632,what research (or clinical trials) is being do...,"[""New types of treatment are being tested in c...",\n\nResearch and clinical trials for **urethra...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
