In [2]:
%cd ..

/home/burak/repos/smartrag


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from langchain_openai.embeddings import OpenAIEmbeddings
from transformers import AutoTokenizer, AutoModelForMaskedLM

from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)
from langchain.output_parsers import PydanticOutputParser
from langchain_core.runnables import RunnableLambda, RunnableParallel
from pprint import pp
from utils import secrets
from models import QueryAmbiguation

In [4]:
is_bool = False
domain = "MEDICAL"
dataset_name = "medquad"
top_n = 10
df = pd.read_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged.csv", index_col=0)
df = df[df.valid == 1].reset_index()
n_queries = len(df)

### Chain Definition

In [5]:
llm = ChatOpenAI(
    model="gpt-4-turbo-preview",
    api_key=secrets.get("OPENAI_API_KEY"),
    max_tokens=4096,
    temperature=0.0,
    model_kwargs={"response_format": {"type": "json_object"}},
)
embedder = OpenAIEmbeddings(api_key=secrets.get("OPENAI_API_KEY"), model="text-embedding-3-large")

## MLM

In [6]:
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
# model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased").to("cuda")

# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base").to("cuda")

# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
# model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base").to("cuda")

# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModelForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").to("cuda")

tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModelForMaskedLM.from_pretrained("medicalai/ClinicalBERT").to("cuda")

In [7]:
for query_n in range(n_queries):
    if df.loc[query_n, "valid"] != 1:
        continue

    example = df.loc[query_n, "ambiguous_question"]
    ambiguities = json.loads(df.loc[query_n, "possible_ambiguities"])
    ambiguities = QueryAmbiguation(**ambiguities)

    # focus on only the first ambiguity in a query
    amb = ambiguities.full_form_abbrv_map[0]
    to_be_masked_example = example.replace(amb.abbreviation, amb.abbreviation + " ({abbreviation})")

    possibilities = [amb.full_form.casefold().strip()]
    if not pd.isna(df.loc[query_n, f"top_{top_n}_full_form"]):
        possibilities += list(
            map(lambda x: x.casefold().strip(), df.loc[query_n, f"top_{top_n}_full_form"].split("<-->")[0].split("<->"))
        )
    # add llm suggestion to df
    # possibilities += [df.loc[query_n, "llm_full_form_suggestion"]]

    possibility_probs = []
    for pos in tqdm(possibilities):
        tokenized = tokenizer(pos, return_tensors="pt")
        length = len(tokenized.input_ids[0]) - 2
        prev = [tokenizer.mask_token_id for _ in range(length)]
        logit_sum = 0.0
        for i in range(length):
            current_token_id = tokenized.input_ids[0][i + 1]

            masked_example = to_be_masked_example.format(abbreviation=tokenizer.decode(prev))
            tokenized_example = tokenizer(masked_example, return_tensors="pt").to("cuda")

            mask_start = tokenized_example.input_ids[0].tolist().index(tokenizer.mask_token_id)
            with torch.no_grad():
                filled_logits = torch.log_softmax(model(**tokenized_example).logits[0, mask_start, :], dim=0)
            current_logit = filled_logits[current_token_id]

            prev[i] = current_token_id
            logit_sum += current_logit
            # print("Expected Token:", tokenizer.decode([tokenized.input_ids[0][i+1]]),
            #      "\n\tProbability:",torch.exp(current_logit).item(),
            #      "\n\tBest token:", tokenizer.decode(filled_logits.argmax(dim=1)),
            #      "\n\t\tProbability:", torch.exp(filled_logits.max(dim=1)[0]).item())
        possibility_probs.append(torch.exp(logit_sum).item())

    df.loc[query_n, "ground_truth_full_form_prob"] = possibility_probs.pop(0)
    possibilities.pop(0)
    if possibilities:
        arr = np.asarray(possibility_probs)
        most_likely_index = arr.argmax()
        df.loc[query_n, "most_likely_full_form"] = possibilities[most_likely_index]
        df.loc[query_n, "most_likely_full_form_prob"] = arr.max()
        df.loc[query_n, f"top_{top_n}_full_form_probs"] = "/".join(map(str, possibility_probs))

100%|██████████| 2/2 [00:08<00:00,  4.32s/it]
100%|██████████| 11/11 [00:01<00:00,  5.64it/s]
100%|██████████| 11/11 [00:03<00:00,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
100%|██████████| 11/11 [00:02<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 11/11 [00:01<00:00,  6.11it/s]
100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
100%|██████████| 11/11 [00:02<00:00,  4.88it/s]
100%|██████████| 11/11 [00:04<00:00,  2.35it/s]
100%|██████████| 2/2 [00:00<00:00,  2.54it/s]
100%|██████████| 11/11 [00:03<00:00,  3.47it/s]
100%|██████████| 11/11 [00:03<00:00,  2.83it/s]
100%|██████████| 10/10 [00:05<00:00,  1.80it/s]
100%|██████████| 11/11 [00:03<00:00,  3.10it/s]
100%|██████████| 11/11 [00:03<00:00,  3.19it/s]
100%|██████████| 11/11 [00:03<00:00,  3.22it/s]
100%|██████████| 11/11 [00:03<00:00,  2.94it/s]
100%|██████████| 11/11 [00:03<00:00,  2.80it/s]
100%|██████████| 11/11 [00:02<00:00,  4.55it/s]
100%|██████████| 11/11 [00:02<00:00,  4.36it/s]
10

In [8]:
df.to_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged_MLM_clinicalbert.csv")

In [None]:
df.head()

## Text Embedder

In [10]:
for query_n in tqdm(range(n_queries)):
    if df.loc[query_n, "valid"] != 1:
        continue

    example = df.loc[query_n, "ambiguous_question"]
    ambiguities = json.loads(df.loc[query_n, "possible_ambiguities"])
    ambiguities = QueryAmbiguation(**ambiguities)

    # focus on only the first ambiguity
    amb = ambiguities.full_form_abbrv_map[0]
    masked_example = example.replace(amb.abbreviation, amb.abbreviation + " ({abbreviation})")

    possibilities = [amb.full_form]
    if not pd.isna(df.loc[query_n, f"top_{top_n}_full_form"]):
        possibilities += df.loc[query_n, f"top_{top_n}_full_form"].split("<-->")[0].split("<->")

    candidate_queries = [example] + [masked_example.format(abbreviation=pos) for pos in possibilities]

    embeddings = np.asarray(embedder.embed_documents(candidate_queries))

    similarities = (embeddings[:1, :] @ embeddings[1:, :].T).flatten()

    df.loc[query_n, "ground_truth_full_form_prob"] = similarities[0]
    possibilities.pop(0)

    similarities = similarities[1:]
    if possibilities:
        best_candidate = similarities.argmax()

        df.loc[query_n, "most_likely_full_form"] = possibilities[best_candidate]
        df.loc[query_n, "most_likely_full_form_prob"] = similarities[best_candidate]

  0%|          | 0/123 [00:05<?, ?it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
df.to_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged_EMBED_text-embedding-3-large.csv")

## LLM Selector

In [None]:
from models import Selector

selector_output_parser = PydanticOutputParser(pydantic_object=Selector)

In [None]:
selector_system_prompt = """User will give you a query, in this query there will be an abbreviation. Your task is to resolve that abbreviation.
User will also provide possible full-forms that you can select from. Please do best you can while selecting from the given list of options.
If you cant find and appropriate selection from given options, please use selection_id as -1.

Query domain will be {domain}.

Format Instructions:
{format_instructions}
"""

In [None]:
selector_user_message = """Query: {query}
Abbreviation: {abbrv}
Options:
{options}
Selection:"""

In [None]:
selector_messages = [
    SystemMessagePromptTemplate(
        prompt=PromptTemplate(
            template=selector_system_prompt,
            input_variables=[],
            partial_variables={
                "format_instructions": selector_output_parser.get_format_instructions(),
                "domain": domain,
            },
        )
    ),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(template=selector_user_message, input_variables=["query", "abbrv", "options"])
    ),
]
selector_prompt = ChatPromptTemplate.from_messages(messages=selector_messages)

In [None]:
selector_chain = selector_prompt | llm | selector_output_parser

In [None]:
for query_n in tqdm(range(n_queries)):
    if df.loc[query_n, "valid"] != 1:
        continue

    query = df.loc[query_n, "ambiguous_question"]
    ambiguities = json.loads(df.loc[query_n, "possible_ambiguities"])
    ambiguities = QueryAmbiguation(**ambiguities)

    # focus on only the first ambiguity
    amb = ambiguities.full_form_abbrv_map[0]

    possibilities = []
    if not pd.isna(df.loc[query_n, f"top_{top_n}_full_form"]):
        possibilities += df.loc[query_n, f"top_{top_n}_full_form"].split("<-->")[0].split("<->")

    if not possibilities:
        possibilities += [df.loc[query_n, "llm_full_form_suggestion"]]
        df.loc[query_n, "most_likely_full_form"] = df.loc[query_n, "llm_full_form_suggestion"]
    else:
        resp = selector_chain.invoke(
            {
                "abbrv": amb.abbreviation,
                "query": query,
                "options": [f"{i} - {opt}\n" for i, opt in enumerate(possibilities)],
            }
        )
        df.loc[query_n, "most_likely_full_form"] = (
            possibilities[resp.selection_id] if resp.selection_id != -1 else df.loc[query_n, "llm_full_form_suggestion"]
        )

In [None]:
df.to_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged_LLM_selector.csv")

## LLM Boolean or STR Answer

In [None]:
from models import AnswerBool, AnswerStr, QueryAmbiguation

output_parser_bool = PydanticOutputParser(pydantic_object=AnswerBool)
output_parser_str = PydanticOutputParser(pydantic_object=AnswerStr)

In [None]:
output_parser_answer = output_parser_str if not is_bool else output_parser_bool

In [None]:
sys_message_answer = """Answer the given questions as concise and short as possible. Do not output something else.
Additionally, an intent and requirements for the answer can be provided by the user. Take them into consideration while answering.
Domain of the questions is {domain}.

{format_instructions}
"""

user_message_answer = """Query:{query}
Intent:{intent}
Requirements:{reqs}
Output:"""

In [None]:
messages_answer = [
    SystemMessagePromptTemplate(
        prompt=PromptTemplate(
            template=sys_message_answer,
            input_variables=[],
            partial_variables={"format_instructions": output_parser_answer.get_format_instructions(), "domain": domain},
        )
    ),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=user_message_answer, input_variables=["query", "intent", "reqs"], partial_variables={}
        )
    ),
]
prompt_answer = ChatPromptTemplate.from_messages(messages=messages_answer)

In [None]:
chain_answer = prompt_answer | llm | output_parser_answer

## LLM Answer Scoring

In [None]:
from models import AnswerJudge

output_parser_score = PydanticOutputParser(pydantic_object=AnswerJudge)

In [None]:
sys_message_score = """Given a query and two candidate answers and one real answer, your task is to compare two candidate answers.

If first candidate is the only one that is similar to the actual answer you will output number '1'.
If second candidate is the only one that is similar to the actual answer you will output number '2'.
If both of the candidates show enough similarity to the real answer you will output number '3'.
If none of the candidates show enough similarity to the real answer you will output number '4'.

Format instructions:
{format_instructions}
"""

user_message_score = """Real Answer: {answer}
First Candidate: {amb}
Second Candidate: {unamb}
Output:"""

In [None]:
messages_score = [
    SystemMessagePromptTemplate(
        prompt=PromptTemplate(
            template=sys_message_score,
            input_variables=[],
            partial_variables={"format_instructions": output_parser_score.get_format_instructions()},
        )
    ),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=user_message_score, input_variables=["answer", "amb", "unamb"], partial_variables={}
        )
    ),
]
prompt_score = ChatPromptTemplate.from_messages(messages=messages_score)

In [None]:
chain_score = prompt_score | llm | output_parser_score

## Accuracy Calc

In [None]:
correct = [set(), set()]

In [None]:
lm1 = RunnableLambda(lambda x: chain_answer.invoke(x["amb"]))
lm2 = RunnableLambda(lambda x: chain_answer.invoke(x["unamb"]))
chain_ = RunnableParallel(amb=lm1, unamb=lm2)

for df_index in tqdm(range(n_queries)):
    if df.loc[df_index, "valid"] != 1:
        continue

    ambiguities = QueryAmbiguation(**json.loads(df.loc[df_index, "possible_ambiguities"]))
    amb = ambiguities.full_form_abbrv_map[0]
    amb_question = df.loc[df_index, "ambiguous_question"]
    unamb_question = df.loc[df_index, "unambiguous_question"]
    if df.loc[df_index, "most_likely_full_form"]:
        amb_question = amb_question.replace(
            amb.abbreviation, amb.abbreviation + f" ({df.loc[df_index, 'most_likely_full_form']})"
        )
    # amb_question = amb_question.replace(amb.abbreviation, amb.abbreviation + f" ({df.loc[df_index, 'llm_full_form_suggestion']})")

    df.loc[df_index, "disambiguated_question"] = amb_question

    if "answer" in df and is_bool:
        answer = [df.loc[df_index, "answer"] in ["TRUE", "True", True, 1, "1"]]
    elif "answer" in df:
        answer = [df.loc[df_index, "answer"]]
    elif "answers" in df:
        answer = eval(df.loc[df_index, "answers"])["text"]
    else:
        answer = []
        for i in eval(df.loc[df_index, "annotations"]):
            if i["type"] == "singleAnswer":
                answer.extend(list(map(lambda x: x.casefold(), i["answer"])))
            elif i["type"] == "multipleQAs":
                for pair in i["qaPairs"]:
                    answer.extend(list(map(lambda x: x.casefold(), pair["answer"])))
            else:
                raise RuntimeError(i)

    response = chain_.invoke(
        {
            "amb": {
                "query": amb_question,
                "intent": df.loc[df_index, "intent"],
                "reqs": "\n".join(eval(df.loc[df_index, "requirements"])),
            },
            "unamb": {
                "query": unamb_question,
                "intent": df.loc[df_index, "intent"],
                "reqs": "\n".join(eval(df.loc[df_index, "requirements"])),
            },
        }
    )
    if is_bool:
        amb_correct = response["amb"].answer == answer[0]
        unamb_correct = response["unamb"].answer == answer[0]
    else:
        response_ = chain_score.invoke(
            {"answer": answer[0], "amb": response["amb"].answer, "unamb": response["unamb"].answer}
        )

        if response_.selection == 1:
            amb_correct = True
            unamb_correct = False
        elif response_.selection == 2:
            amb_correct = False
            unamb_correct = True
        elif response_.selection == 3:
            amb_correct = True
            unamb_correct = True
        else:
            amb_correct = False
            unamb_correct = False
    # amb_correct = any([response["amb"].answer.casefold() in a.casefold() or a.casefold() in response["amb"].answer.casefold() for a in answer])
    # unamb_correct = any([response["unamb"].answer.casefold() in a.casefold() or a.casefold() in response["unamb"].answer.casefold() for a in answer])

    df.loc[df_index, "disambiguated_question_answered_by_gpt4"] = response["amb"].answer
    df.loc[df_index, "unambiguous_question_answered_by_gpt4"] = response["unamb"].answer

    if amb_correct:
        correct[0].add(df_index)
        df.loc[i, "disambiguated_question_answered_correct_by_gpt4"] = True
    if unamb_correct:
        correct[1].add(df_index)
        df.loc[i, "unambiguous_question_answered_correct_by_gpt4"] = True
    df.loc[df_index, "answers"] = str(answer)

In [None]:
len(correct[0]) / len(df), len(correct[1]) / len(df)

(0.6476190476190476, 0.8285714285714286)

In [None]:
df.to_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged_with_intent_compared.csv")