# Analyze data and LLM answers

In [None]:
import pandas as pd
import gat_llm.llm_invoker as inv
from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2

import medqa.judge
from medqa.datasplit import get_split
from medqa.qa_retrieval_agent import QARetrievalAgent
from medqa.glove_retriever import GloVeSimilarity

## Create LLMs to evaluate

In [None]:
# Model under evaluation, looked up by its display name via gat_llm.
# NOTE(review): the first argument to get_llm is None - presumably an optional
# config/session object; confirm against gat_llm.
llm_name = "Qwen 3 1.7b - Ollama"
llm = inv.LLM_Provider.get_llm(None, llm_name)

# System prompt sent along with every question asked to `llm` in this notebook.
# NOTE(review): the wording ("You are medical ... system expert") could be
# tightened, but editing it changes what the model sees and would make new
# runs incomparable with previously saved metrics.
medqa_prompt = """You are medical question-answering system expert at providing medical information.
Your goal is to effectively answer user queries related to medical diseases.
Refuse to answer questions not related to your expertise."""

# Judge used below to grade a candidate answer against the reference answers.
judge_llm = medqa.judge.AnswerJudge()

## Load data

In [None]:
# Read the screening dataset and tag every row with the split that
# `get_split` returns for that row's index label.
file = "data/intern_screening_dataset.csv"
df = pd.read_csv(file)
row_split = list(map(get_split, df.index))
df["split"] = row_split

In [None]:
# Display the dataset, now including the `split` column.
df

In [None]:
# Save the questions of the training split to disk.
train_questions = df.loc[df["split"] == "train", "question"]
train_questions.to_csv("data/train_questions.csv")

In [None]:
# Build the question-retrieval agent from the training rows only.
qara = QARetrievalAgent(df[df.split == "train"])

In [None]:
# import json
# with open("temp.json", "w") as f:
#    f.write(json.dumps(qara.questions))

In [None]:
# Validation rows, renumbered from 0 (the original index is discarded).
is_val = df["split"] == "val"
df_val = df.loc[is_val].reset_index(drop=True)
df_val

In [None]:
# Collapse duplicate questions: one row per distinct question, with all of its
# reference answers gathered into a list.
answers_by_question = df_val.groupby("question")["answer"].apply(list)
df_val_agg = answers_by_question.reset_index()
df_val_agg

In [None]:
# Spot-check: the list of reference answers for the question at position 10.
df_val_agg.iloc[10].answer

## Assign scores

In [None]:
# df_val_agg = df_val_agg[0:5].copy()

In [None]:
# Zero-shot evaluation loop, kept inside a bare string literal so that it does
# not run when the cell executes (the cell merely echoes the text). When
# enabled it asks `llm` each aggregated validation question, keeps the last
# value yielded by the stream, drops everything up to the final "</think>"
# tag, has `judge_llm` grade the result against the reference answers, and
# writes questions, answers, metrics and reasoning to "zero_shot_metrics.csv".
# NOTE(review): a later cell reads "metrics/zero_shot_metrics.csv", so the
# output file presumably has to be moved into metrics/ by hand - confirm.
"""
all_metrics = []
all_reasoning = []
all_ans = []
for idx, row in tqdm(df_val_agg.iterrows(), total=len(df_val_agg)):
    ans = llm(
        row["question"],
        chat_history=[],
        system_prompt=medqa_prompt,
    )
    for x in ans:
        pass
    candidate_ans = x
    if "</think>" in candidate_ans:
        candidate_ans = candidate_ans.split("</think>")[-1]
    reasoning, metrics = judge_llm.judge_answer(row["question"], row["answer"], candidate_ans)
    all_ans.append(candidate_ans)
    all_metrics.append(metrics)
    all_reasoning.append(reasoning)

df_val_agg["candidate_ans"] = all_ans
df_val_agg["metrics"] = all_metrics
df_val_agg["reasoning"] = all_reasoning

df_val_agg.to_csv("zero_shot_metrics.csv", index=False)
"""

In [None]:
# Pick the example at position 10 as the one used by the debug cells below,
# and show its question.
row = df_val_agg.iloc[10]
row["question"]

In [None]:
# df_val_agg.to_csv("zero_shot_metrics.csv", index=False)

In [None]:
import ast

# Reload previously saved evaluation results. CSV stores the metrics as their
# text representation, so they are parsed back into Python literals here.
# NOTE(review): no other column is parsed, so any column that held Python
# objects when the file was written stays a plain string after this load.
df_val_agg = pd.read_csv("metrics/zero_shot_metrics.csv")
df_val_agg["metrics"] = df_val_agg["metrics"].map(ast.literal_eval)
df_val_agg

### Debug

In [None]:
# Debug a single example: stream the model's answer for `row` and keep the
# final value yielded by the stream as the candidate answer.
ans = llm(
    row["question"],
    chat_history=[],
    system_prompt=medqa_prompt,
)
# Start from an empty answer so that a stream yielding nothing cannot raise a
# NameError or silently pick up a stale `x` left over from an earlier cell run.
candidate_ans = ""
for x in ans:
    # Only the last yielded value is kept (presumably each one is the text
    # accumulated so far - confirm against gat_llm).
    candidate_ans = x
    print('.', end='')  # one progress dot per yielded value
print('\n')
print(candidate_ans)

In [None]:
# Drop any reasoning trace: keep only the text after the last "</think>" tag
# (the answer is left untouched when the tag is absent), then have the judge
# grade it against the reference answers of `row`.
candidate_ans = candidate_ans.rpartition("</think>")[2]
reasoning, metrics = judge_llm.judge_answer(
    row["question"],
    row["answer"],
    candidate_ans,
)

In [None]:
# Show the question and its reference answers for the example being debugged.
row["question"], row["answer"]

In [None]:
# Print the reasoning the judge returned for this example.
print(reasoning)

In [None]:
# Display the metrics the judge returned for this example.
metrics

In [None]:
# Rebuild the retrieval agent from the training rows (same construction as the
# earlier cell; handy after the autoreloaded module has changed).
qara = QARetrievalAgent(df[df.split == "train"])

In [None]:
# Call the retrieval agent with a sample question and print what it returns.
sel_q = qara("What is (are) diabetes")
print(sel_q)

### GloVe

In [None]:
# Build the GloVe similarity retriever from the training rows only.
gs = GloVeSimilarity(df[df.split == "train"])

In [None]:
# Look up documents for a free-form query and display the result.
# Note: this rebinds `ans`, which the debug cell above used for the LLM stream.
ans = gs.find_documents("Tell me about weaver syndrome")
ans

In [None]:
# Inspect the entry at position 4 of the retrieval result as a plain dict.
ans.iloc[4].to_dict()

In [None]:
# Call the private helper directly on a sample question (presumably returns
# its sentence embedding - confirm in medqa.glove_retriever).
gs._get_sentence_emb("What is (are) Glaucoma ?")

In [None]:
# Invoke the retriever through its call interface with the same sample question.
gs("What is (are) Glaucoma ?")