# Analyze data and LLM answers

In [1]:
import pandas as pd
import gat_llm.llm_invoker as inv
from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2

import medqa.judge
from medqa.datasplit import get_split
from medqa.qa_retrieval_agent import QARetrievalAgent
from medqa.glove_retriever import GloVeSimilarity

## Create LLMs to evaluate

In [2]:
# System prompt sent along with every medical question posed to the model.
# NOTE(review): the wording is kept verbatim; changing it would presumably make
# new answers no longer comparable with previously cached metrics — confirm first.
medqa_prompt = """You are medical question-answering system expert at providing medical information.
Your goal is to effectively answer user queries related to medical diseases.
Refuse to answer questions not related to your expertise."""

# Model under evaluation, requested from the provider by its name.
llm_name = "Qwen 3 1.7b - Ollama"
llm = inv.LLM_Provider.get_llm(None, llm_name)

# Judge whose judge_answer() scores a candidate answer against the references.
judge_llm = medqa.judge.AnswerJudge()

## Load data

In [3]:
# Read the Q&A dataset and label every row with the split that get_split
# returns for its index value.
file = "data/intern_screening_dataset.csv"
df = pd.read_csv(file)
row_split = list(map(get_split, df.index))
df["split"] = row_split

In [4]:
# Display the loaded frame, now including the "split" column added above.
df

Unnamed: 0,question,answer,split
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,train
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...,train
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...,train
3,Who is at risk for Glaucoma? ?,Anyone can develop glaucoma. Some people are a...,train
4,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl...",train
...,...,...,...
16401,What is (are) Diabetic Neuropathies: The Nerve...,Autonomic neuropathy affects the nerves that c...,train
16402,What is (are) Diabetic Neuropathies: The Nerve...,"Proximal neuropathy, sometimes called lumbosac...",train
16403,What is (are) Diabetic Neuropathies: The Nerve...,Focal neuropathy appears suddenly and affects ...,train
16404,How to prevent Diabetic Neuropathies: The Nerv...,The best way to prevent neuropathy is to keep ...,test


In [5]:
# Export the questions of the training split (index included) to CSV.
is_train = df["split"] == "train"
df.loc[is_train, "question"].to_csv("data/train_questions.csv")

In [6]:
# Build the retrieval agent from the training rows only, so validation and
# test questions are never part of what it is constructed from.
qara = QARetrievalAgent(df.loc[df["split"] == "train"])

Reading GloVe:   0%|          | 0/400000 [00:00<?, ?it/s]

Precomputing QA embeddings:   0%|          | 0/13042 [00:00<?, ?it/s]

In [7]:
# import json
# with open("temp.json", "w") as f:
#    f.write(json.dumps(qara.questions))

In [8]:
# Validation rows only, renumbered from zero.
val_mask = df["split"] == "val"
df_val = df.loc[val_mask].reset_index(drop=True)
df_val

Unnamed: 0,question,answer,split
0,Who is at risk for High Blood Pressure? ?,Not a Normal Part of Aging Nearly 1 in 3 Ameri...,val
1,What is (are) High Blood Pressure ?,Normal blood pressure for adults is defined as...,val
2,What is (are) High Blood Pressure ?,Abnormal blood pressure is higher than 120/80 ...,val
3,What is (are) High Blood Pressure ?,Older adults should limit their sodium (salt) ...,val
4,What causes Urinary Tract Infections ?,"Most urinary tract infections, or UTIs, are ca...",val
...,...,...,...
1656,What causes Mntriers Disease ?,Scientists are unsure about what causes Mntrie...,val
1657,What are the symptoms of Mntriers Disease ?,The most common symptom of Mntriers disease is...,val
1658,What are the symptoms of Microscopic Colitis: ...,The most common symptom of microscopic colitis...,val
1659,How to prevent Microscopic Colitis: Collagenou...,Researchers do not know how to prevent microsc...,val


In [9]:
# Collapse repeated questions: one row per distinct question, with all of its
# reference answers gathered into a list.
answers_per_question = df_val.groupby("question")["answer"].apply(list)
df_val_agg = answers_per_question.reset_index()
df_val_agg

Unnamed: 0,question,answer
0,Do you have information about Acupuncture,[Summary : Acupuncture has been practiced in C...
1,Do you have information about Advance Directives,[Summary : What kind of medical care would you...
2,Do you have information about Breast Reconstru...,"[Summary : If you need a mastectomy, you have ..."
3,Do you have information about Breastfeeding,[Summary : Breastfeeding offers many benefits ...
4,Do you have information about Cancer Chemotherapy,"[Summary : Normally, your cells grow and die i..."
...,...,...
1629,what research (or clinical trials) is being do...,[The NINDS supports and conducts a wide range ...
1630,what research (or clinical trials) is being do...,[The National Institute of Neurological Disord...
1631,what research (or clinical trials) is being do...,[The National Institute of Neurological Disord...
1632,what research (or clinical trials) is being do...,[New types of treatment are being tested in cl...


In [10]:
# Reference answers (a list) of the aggregated example at position 10.
df_val_agg["answer"].iloc[10]

["Summary : In coronary artery disease (CAD), the arteries that supply blood and oxygen to your heart muscle grow hardened and narrowed. You may try treatments such as lifestyle changes, medicines, and angioplasty, a procedure to open the arteries. If these treatments don't help, you may need coronary artery bypass surgery.    The surgery creates a new path for blood to flow to the heart. The surgeon takes a healthy piece of vein from the leg or artery from the chest or wrist. Then the surgeon attaches it to the coronary artery, just above and below the narrowed area or blockage. This allows blood to bypass (get around) the blockage. Sometimes people need more than one bypass.    The results of the surgery usually are excellent. Many people remain symptom-free for many years. You may need surgery again if blockages form in the grafted arteries or veins or in arteries that weren't blocked before. Lifestyle changes and medicines may help prevent arteries from becoming clogged again.    N

## Assign scores

In [11]:
# df_val_agg = df_val_agg[0:5].copy()

In [12]:
from pathlib import Path

# Cached evaluation results; this is the same file the loading cell reads back.
ZERO_SHOT_METRICS_PATH = Path("metrics/zero_shot_metrics.csv")


def run_zero_shot_eval(questions_df):
    """Answer every question with ``llm`` and score each answer with ``judge_llm``.

    Args:
        questions_df: Frame with a "question" column and an "answer" column
            holding the reference answers for that question.

    Returns:
        A new frame equal to ``questions_df`` plus three columns:
        "candidate_ans", "metrics" and "reasoning". The input is not modified.
    """
    all_ans, all_metrics, all_reasoning = [], [], []
    for _, row in tqdm(questions_df.iterrows(), total=len(questions_df)):
        stream = llm(
            row["question"],
            chat_history=[],
            system_prompt=medqa_prompt,
        )
        # Only the last streamed item is kept as the answer. Start from "" so an
        # empty stream cannot raise NameError or reuse the previous row's answer.
        candidate_ans = ""
        for candidate_ans in stream:
            pass
        # Keep only the text after the last "</think>" tag; when the tag is
        # absent, split() returns the whole answer unchanged.
        candidate_ans = candidate_ans.split("</think>")[-1]
        reasoning, metrics = judge_llm.judge_answer(row["question"], row["answer"], candidate_ans)
        all_ans.append(candidate_ans)
        all_metrics.append(metrics)
        all_reasoning.append(reasoning)
    return questions_df.assign(
        candidate_ans=all_ans,
        metrics=all_metrics,
        reasoning=all_reasoning,
    )


# The evaluation makes one model call and one judge call per question, so it is
# only run when no cached result exists. Delete the CSV to force a fresh run.
if not ZERO_SHOT_METRICS_PATH.exists():
    ZERO_SHOT_METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)
    run_zero_shot_eval(df_val_agg).to_csv(ZERO_SHOT_METRICS_PATH, index=False)

In [13]:
# Pick the aggregated example at position 10 for the debug cells that follow.
# NOTE(review): this runs before df_val_agg is reloaded from CSV further down, so
# row["answer"] is presumably still the in-memory list of answers — confirm if
# the cell order ever changes.
row = df_val_agg.iloc[10]
row["question"]

'Do you have information about Coronary Artery Bypass Surgery'

In [14]:
# df_val_agg.to_csv("zero_shot_metrics.csv", index=False)

In [15]:
import ast

# Reload the cached zero-shot results. The "metrics" column comes back from the
# CSV as text, so each cell is parsed back into a Python literal.
df_val_agg = pd.read_csv("metrics/zero_shot_metrics.csv")
df_val_agg["metrics"] = df_val_agg["metrics"].map(ast.literal_eval)
df_val_agg

Unnamed: 0,question,answer,candidate_ans,metrics,reasoning
0,Do you have information about Acupuncture,"[""Summary : Acupuncture has been practiced in ...",\n\nAcupuncture is a traditional Chinese medic...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1,Do you have information about Advance Directives,['Summary : What kind of medical care would yo...,\n\nAn **advance directive** is a legal docume...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
2,Do you have information about Breast Reconstru...,"[""Summary : If you need a mastectomy, you have...",\n\nBreast reconstruction is a complex and ind...,"[2, 1, 1, 0]","<think>\nOkay, let's tackle this evaluation. T..."
3,Do you have information about Breastfeeding,"[""Summary : Breastfeeding offers many benefits...",\n\nBreastfeeding is a natural and beneficial ...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
4,Do you have information about Cancer Chemotherapy,"['Summary : Normally, your cells grow and die ...",\n\nChemotherapy is a systemic treatment that ...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
...,...,...,...,...,...
1629,what research (or clinical trials) is being do...,['The NINDS supports and conducts a wide range...,\n\nSotos Syndrome (SS) is a rare genetic diso...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1630,what research (or clinical trials) is being do...,"[""The National Institute of Neurological Disor...",\n\nResearch and clinical trials for stroke fo...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1631,what research (or clinical trials) is being do...,['The National Institute of Neurological Disor...,\n\nThoracic Outlet Syndrome (TOS) is a comple...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."
1632,what research (or clinical trials) is being do...,"[""New types of treatment are being tested in c...",\n\nResearch and clinical trials for **urethra...,"[2, 1, 2, 1]","<think>\nOkay, let's tackle this evaluation. T..."


### Debug

In [17]:
# Stream a single answer for the debug row, printing one dot per streamed update
# so progress is visible while the model generates.
ans = llm(
    row["question"],
    chat_history=[],
    system_prompt=medqa_prompt,
)
# Start from an empty answer so an empty stream cannot fail with a NameError or
# silently pick up a stale value left in the kernel by an earlier run.
# The last streamed item is kept as the final answer (presumably each update
# carries the full text so far — confirm against the gat_llm documentation).
candidate_ans = ""
for candidate_ans in ans:
    print('.', end='')
print('\n')
print(candidate_ans)

Invoking Qwen 3 1.7b (Tiny-size LLM) - locally from Ollama. Word count: 50
.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [18]:
# Keep only the text after the last "</think>" tag (the whole answer when the
# tag is absent), then have the judge score it against the reference answers.
candidate_ans = candidate_ans.rpartition("</think>")[2]
reasoning, metrics = judge_llm.judge_answer(row["question"], row["answer"], candidate_ans)

Invoking Qwen 3 1.7b (Tiny-size LLM) - locally from Ollama. Word count: 732


In [19]:
# Show the debug question next to its reference answers, i.e. the inputs the
# judge received besides the candidate answer.
row["question"], row["answer"]

('Do you have information about Coronary Artery Bypass Surgery',
 ["Summary : In coronary artery disease (CAD), the arteries that supply blood and oxygen to your heart muscle grow hardened and narrowed. You may try treatments such as lifestyle changes, medicines, and angioplasty, a procedure to open the arteries. If these treatments don't help, you may need coronary artery bypass surgery.    The surgery creates a new path for blood to flow to the heart. The surgeon takes a healthy piece of vein from the leg or artery from the chest or wrist. Then the surgeon attaches it to the coronary artery, just above and below the narrowed area or blockage. This allows blood to bypass (get around) the blockage. Sometimes people need more than one bypass.    The results of the surgery usually are excellent. Many people remain symptom-free for many years. You may need surgery again if blockages form in the grafted arteries or veins or in arteries that weren't blocked before. Lifestyle changes and med

In [20]:
# Print the reasoning text the judge returned together with the metrics.
print(reasoning)

<think>
Okay, let's tackle this evaluation. The user provided an answer about Coronary Artery Bypass Surgery (CABG) and wants me to judge it based on the given criteria.

First, I need to check the correctness. The reference answer mentions that CABG is used for CAD when other treatments fail. The evaluation answer starts by stating that CABG is for CAD, which matches. It also explains the purpose, method, types of grafts, procedure steps, when it's needed, risks, recovery, lifestyle changes, and alternatives. The reference answer mentions the NIH and that the results are usually excellent, but the evaluation answer doesn't mention that. However, the evaluation answer is comprehensive in covering all the main points.

Next, was the data consistent? The evaluation answer uses data from the reference answer. For example, it mentions the types of grafts (PAG, ITA, GSV), which are part of the reference. The reference also talks about risks like infection, bleeding, heart attack, and graft 

In [21]:
# Show the scores the judge returned for the debug answer.
metrics

[2, 2, 2, 2]

In [22]:
# Rebuild the retrieval agent from the training rows.
# NOTE(review): this repeats the construction already done in cell In [6];
# presumably kept to pick up autoreloaded module edits — confirm whether it is
# still needed, since it repeats the same loading work.
qara = QARetrievalAgent(df.loc[df["split"] == "train"])

Reading GloVe:   0%|          | 0/400000 [00:00<?, ?it/s]

Precomputing QA embeddings:   0%|          | 0/13042 [00:00<?, ?it/s]

In [23]:
# Ask the retrieval agent a sample question and print what it returns.
sample_question = "What is (are) diabetes"
sel_q = qara(sample_question)
print(sel_q)

Invoking Qwen 3 1.7b (Tiny-size LLM) - locally from Ollama. Word count: 1992
<think>
Okay, the user is asking about diabetes. Let me check the QA pairs to see what information is available.

Looking at the first QA pair, it explains that diabetes is a complex group of diseases with various causes. It mentions type 1, type 2, and gestational diabetes. The answer covers how diabetes affects the body, the role of insulin, and the complications. 

The second QA pair is similar, providing more details on the types of diabetes, the pancreas's role, and the factors that contribute to diabetes. It also mentions the interaction between genetic and environmental factors.

The other QA pairs are duplicates of the first two, so I can ignore them. The key points are the different types of diabetes, how insulin works, the causes (genetic and environmental factors), and the complications.

The user might want a general overview of diabetes, its types, causes, and effects. I should structure the answe

### GloVe

In [24]:
# Build the GloVe similarity retriever from the training rows only.
gs = GloVeSimilarity(df.loc[df["split"] == "train"])

Reading GloVe:   0%|          | 0/400000 [00:00<?, ?it/s]

Precomputing QA embeddings:   0%|          | 0/13042 [00:00<?, ?it/s]

In [25]:
# Look up the training rows the retriever returns for a free-form query.
# NOTE(review): `ans` held the LLM response stream in the debug cell above;
# consider a distinct name here so the two meanings cannot be confused.
weaver_query = "Tell me about weaver syndrome"
ans = gs.find_documents(weaver_query)
ans

Unnamed: 0,question,answer,split
7333,What is (are) Weaver syndrome ?,Weaver syndrome is a rare condition that is ch...,train
10151,What is (are) Weaver syndrome ?,Weaver syndrome is a condition that involves t...,train
3805,What is (are) Barber Say syndrome ?,Barber Say syndrome is a very rare condition c...,train
3132,What is (are) Williams syndrome ?,Williams syndrome is a developmental disorder ...,train
12186,What is (are) Williams syndrome ?,Williams syndrome is a developmental disorder ...,train


In [26]:
# Inspect the fifth retrieved row (position 4) as a plain dict so the full,
# untruncated text is visible.
ans.iloc[4].to_dict()

{'question': 'What is (are) Williams syndrome ?',
 'answer': "Williams syndrome is a developmental disorder that affects many parts of the body. This condition is characterized by mild to moderate intellectual disability or learning problems, unique personality characteristics, distinctive facial features, and heart and blood vessel (cardiovascular) problems.  People with Williams syndrome typically have difficulty with visual-spatial tasks such as drawing and assembling puzzles, but they tend to do well on tasks that involve spoken language, music, and learning by repetition (rote memorization). Affected individuals have outgoing, engaging personalities and tend to take an extreme interest in other people. Attention deficit disorder (ADD), problems with anxiety, and phobias are common among people with this disorder.  Young children with Williams syndrome have distinctive facial features including a broad forehead, a short nose with a broad tip, full cheeks, and a wide mouth with full

In [27]:
# Inspect the sentence embedding computed for one question. This calls an
# underscore-prefixed (private by convention) helper of the retriever; the
# recorded output is a 50-element array.
gs._get_sentence_emb("What is (are) Glaucoma ?")

array([ 0.31366   ,  0.3148742 , -0.042571  , -0.13993   , -0.035968  ,
        0.32456   ,  0.05939078, -0.178115  ,  0.14628   ,  0.265968  ,
        0.057822  ,  0.2560208 , -0.1843158 , -0.2127931 ,  0.312108  ,
        0.479494  ,  0.157942  ,  0.113314  , -0.04547126, -0.169932  ,
       -0.575756  ,  0.336336  ,  0.2468612 ,  0.1148392 ,  0.3475182 ,
       -1.360592  , -0.68252   ,  0.218984  ,  0.4459204 , -0.3003208 ,
        2.027504  ,  0.174394  , -0.17413   , -0.438442  , -0.155947  ,
       -0.2054892 ,  0.109008  ,  0.1422782 ,  0.368612  , -0.252406  ,
       -0.13916448,  0.260114  ,  0.0724464 ,  0.5174178 , -0.0187284 ,
        0.022978  ,  0.058023  , -0.149932  ,  0.003832  ,  0.47312   ])

In [28]:
# Call the retriever directly on a question; the recorded output is a single
# answer string.
gs("What is (are) Glaucoma ?")

"Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. The most common form of the disease is open-angle glaucoma. With early treatment, you can often protect your eyes against serious vision loss. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.)  See this graphic for a quick overview of glaucoma, including how many people it affects, whos at risk, what to do if you have it, and how to learn more.  See a glossary of glaucoma terms."