In [1]:
%cd ..

/home/burak/repos/smartrag


In [2]:
from disambiguation_methods.qa import chain_answer
from disambiguation_methods.score import chain_score
from collections import defaultdict
from disambiguation_methods.qa import AnswerStr
from disambiguation_methods.score import AnswerJudge
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnableConfig
from models import QueryAmbiguation
import pandas as pd
import json
from disambiguation_methods.domain_extractor import categories

from tqdm import tqdm

In [3]:
top_n = 10
dataset_name = "generated"
domain = None

In [7]:
df = pd.read_csv(f"./data/{dataset_name}_ambiguous_top{top_n}+DOMAIN+LLM+Intent+MLM+TE+LLM.csv")

In [8]:
lm1 = RunnableLambda(lambda x: chain_answer.invoke(x["amb"]))
lm2 = RunnableLambda(lambda x: chain_answer.invoke(x["unamb"]))
lm3 = RunnableLambda(lambda x: chain_answer.invoke(x["disamb"]))
chain_ = RunnableParallel(amb=lm1, unamb=lm2, disamb=lm3)

In [15]:
df.columns

Index(['possible_ambiguities', 'question', 'top_10_full_form',
       'ambiguous_question', 'unambiguous_question', 'domain_idx',
       'top_10_full_form_sources', 'llm_full_form_suggestions', 'intent',
       'MLM_ground_truth_full_form_prob', 'MLM_llm_full_form_suggestion_prob',
       'MLM_most_likely_full_forms', 'MLM_most_likely_full_form_probs',
       'MLM_most_likely_selection_types', 'MLM_top_10_full_form_probs',
       'TE_ground_truth_full_form_prob', 'TE_llm_full_form_suggestion_prob',
       'TE_most_likely_full_forms', 'TE_most_likely_full_form_probs',
       'TE_most_likely_selection_types', 'TE_top_10_full_form_probs',
       'LLM_most_likely_full_forms', 'LLM_most_likely_selection_types',
       'disambiguated_question',
       'MLM_disambiguated_question_answered_by_gpt35_correct',
       'MLM_unambiguous_question_answered_by_gpt35_correct',
       'MLM_ambiguous_question_answered_by_gpt35_correct',
       'MLM_disambiguated_question_answered_by_gpt35',
       'MLM_u

In [16]:
evaluation = defaultdict(lambda: {"amb": [], "disamb": [], "unamb": []})

for method in ["LLM2"]:
    for df_index in tqdm(range(len(df))):
        ambiguities = QueryAmbiguation(**json.loads(df.loc[df_index, "possible_ambiguities"]))
        most_likely_full_forms: list[str] = json.loads(df.loc[df_index, f"llm_full_form_suggestions"])

        disambiguated_question_answers = []
        unambiguous_question_answers = []
        ambiguous_question_answers = []

        evaluation[method]["amb"].append([])
        evaluation[method]["unamb"].append([])
        evaluation[method]["disamb"].append([])

        for full_form, amb in zip(most_likely_full_forms, ambiguities.full_form_abbrv_map):
            amb_question: str = df.loc[df_index, "ambiguous_question"]
            unamb_question: str = df.loc[df_index, "unambiguous_question"]

            disambiguated_question = amb_question.replace(amb.abbreviation, amb.abbreviation + f" ({full_form})")
            df.loc[df_index, "disambiguated_question"] = disambiguated_question

            answer: str = df.loc[df_index, "answer"] if "answer" in df else ""

            response: dict[str, AnswerStr] = chain_.invoke(
                {
                    "amb": {
                        "query": amb_question,
                        "intent": df.loc[df_index, "intent"],
                        "domain": categories[df.loc[df_index, "domain_idx"]]
                        if df.loc[df_index, "domain_idx"] < len(categories)
                        else None,
                    },
                    "unamb": {
                        "query": unamb_question,
                        "intent": df.loc[df_index, "intent"],
                        "domain": categories[df.loc[df_index, "domain_idx"]]
                        if df.loc[df_index, "domain_idx"] < len(categories)
                        else None,
                    },
                    "disamb": {
                        "query": disambiguated_question,
                        "intent": df.loc[df_index, "intent"],
                        "domain": categories[df.loc[df_index, "domain_idx"]]
                        if df.loc[df_index, "domain_idx"] < len(categories)
                        else None,
                    },
                },
                RunnableConfig(configurable={"llm": "gpt35"}),
            )

            response_: AnswerJudge = chain_score.invoke(
                {
                    "answer": answer if answer else response["unamb"].answer,
                    "amb": response["amb"].answer,
                    "unamb": response["unamb"].answer,
                    "disamb": response["disamb"].answer,
                    "query": unamb_question,
                }
            )
            # amb, unamb, disamb

            disambiguated_question_answers.append(response["disamb"].answer)
            unambiguous_question_answers.append(response["unamb"].answer)
            ambiguous_question_answers.append(response["amb"].answer)

            evaluation[method]["amb"][-1].append(response_.bit1 == 1)
            evaluation[method]["unamb"][-1].append(response_.bit2 == 1)
            evaluation[method]["disamb"][-1].append(response_.bit3 == 1)

        df.loc[df_index, f"{method}_disambiguated_question_answered_by_gpt35_correct"] = json.dumps(
            evaluation[method]["disamb"]
        )
        df.loc[df_index, f"{method}_unambiguous_question_answered_by_gpt35_correct"] = json.dumps(
            evaluation[method]["unamb"]
        )
        df.loc[df_index, f"{method}_ambiguous_question_answered_by_gpt35_correct"] = json.dumps(
            evaluation[method]["amb"]
        )

        df.loc[df_index, f"{method}_disambiguated_question_answered_by_gpt35"] = json.dumps(
            disambiguated_question_answers
        )
        df.loc[df_index, f"{method}_unambiguous_question_answered_by_gpt35"] = json.dumps(unambiguous_question_answers)
        df.loc[df_index, f"{method}_ambiguous_question_answered_by_gpt35"] = json.dumps(ambiguous_question_answers)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [06:04<00:00,  3.65s/it]


In [17]:
import numpy as np

eval_df = pd.DataFrame(columns=["method", "ambiguous_accuracy", "unambiguous_accuracy", "disambiguated_accuracy"])

for method in ["MLM", "TE", "LLM", "LLM2"]:
    eval_df = pd.concat(
        [
            eval_df,
            pd.DataFrame(
                [
                    {
                        "method": method,
                        "ambiguous_accuracy": df[f"{method}_ambiguous_question_answered_by_gpt35_correct"]
                        .apply(lambda x: np.mean(json.loads(x)))
                        .mean(),
                        "unambiguous_accuracy": df[f"{method}_unambiguous_question_answered_by_gpt35_correct"]
                        .apply(lambda x: np.mean(json.loads(x)))
                        .mean(),
                        "disambiguated_accuracy": df[f"{method}_disambiguated_question_answered_by_gpt35_correct"]
                        .apply(lambda x: np.mean(json.loads(x)))
                        .mean(),
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
eval_df

  eval_df = pd.concat(


Unnamed: 0,method,ambiguous_accuracy,unambiguous_accuracy,disambiguated_accuracy
0,MLM,0.953775,1.0,0.91101
1,TE,0.952277,1.0,0.973554
2,LLM,0.945302,1.0,0.930966
3,LLM2,0.959358,1.0,0.886698


In [12]:
df.to_csv(f"./data/{dataset_name}_ambiguous_top{top_n}+DOMAIN+LLM+Intent+MLM+TE+LLM+eval.csv", index=False)