In [None]:
from api_caller import evaluate_answer_generator

In [2]:
import json
import os
from typing import Optional

import datasets
from langchain_core.language_models import BaseChatModel
from tqdm import tqdm

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    output_file: str,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Generate an answer for every question in ``eval_dataset`` and save the results.

    Each example is read through its ``"question"``, ``"answer"`` and ``"content"``
    keys. Results always start from an empty list, so any previous content of
    ``output_file`` is overwritten; the file is rewritten after every example so
    that partial results survive an interrupted run.

    Args:
        eval_dataset: Iterable of examples providing the keys listed above.
        output_file: Path of the JSON file the results are written to (UTF-8).
        verbose: When truthy, print the question, generated answer and reference
            answer for every example.
        test_settings: Optional label stored with each result under
            ``"test_settings"``.
    """
    outputs = []
    # Questions answered so far in this run. A set keeps the duplicate check O(1)
    # instead of rebuilding a list of every previous question on each iteration.
    # Assumes questions are hashable (strings) - TODO confirm for other datasets.
    seen_questions = set()

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in seen_questions:
            continue

        # Only the generated answer is used here; retrieved documents are not recorded.
        answer = evaluate_answer_generator(question)

        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "context": example["content"],
            "generated_answer": answer,
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        seen_questions.add(question)

        # ensure_ascii=False emits raw non-ASCII characters, so the encoding must be
        # fixed explicitly rather than left to the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(outputs, f, indent=4, ensure_ascii=False)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prompt sent to the judge model. The {instruction}, {response} and {context}
# placeholders are filled by evaluate_answers() through format_messages().
# The doubled braces in step 3 are presumably escapes that render as literal
# braces after template formatting. The requested output puts the integer score
# before the "[RESULT]" marker, which is the part evaluate_answers() parses.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference context that gets a relevant information, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"{{an integer number between 1 and 5}} [RESULT] Feedback: {{write a feedback for criteria}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Context (Score 5):
{context}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference context?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat prompt for the judge model: a fixed system message followed by a human
# message templated from EVALUATION_PROMPT.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)
   

In [None]:
from langchain.chat_models import ChatOpenAI

# Read the key from the environment so that no credential is stored in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# temperature=0 keeps grading as repeatable as the API allows. max_tokens=1 caps the
# reply at a single token, presumably so that only the leading score of the requested
# "<score> [RESULT] Feedback: ..." format is generated (the feedback is never stored).
eval_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
    max_tokens=1,
)
# Used to build the per-evaluator result key "eval_score_<evaluator_name>".
evaluator_name = "GPT4o-mini"


def _parse_eval_score(raw_output: str) -> int:
    """Return the integer that precedes the ``[RESULT]`` marker in ``raw_output``.

    Raises:
        ValueError: If the text before the marker is not an integer.
    """
    return int(raw_output.split("[RESULT]")[0])


def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
    num_retries: int = 3,
) -> None:
    """Score every generated answer in ``answer_path`` with the judge model.

    The answer file is updated in place after each scored entry, which doubles as
    a checkpoint: entries that already carry ``eval_score_<evaluator_name>`` are
    skipped on later runs. If ``answer_path`` does not exist nothing is evaluated
    and no file is written.

    Args:
        answer_path: JSON file holding a list of dicts with the keys
            ``"question"``, ``"generated_answer"`` and ``"context"``.
        eval_chat_model: Object whose ``invoke(messages)`` returns a result with a
            ``content`` attribute holding the judge's reply.
        evaluator_name: Suffix of the key the score is stored under.
        evaluation_prompt_template: Template formatted with ``instruction``,
            ``response`` and ``context``.
        num_retries: Maximum number of model calls per entry when the reply cannot
            be parsed as an integer score.

    Raises:
        ValueError: If ``num_retries`` is smaller than 1, or if the judge's reply is
            still unparsable after ``num_retries`` attempts.
    """
    if num_retries < 1:
        raise ValueError("num_retries must be at least 1")

    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r", encoding="utf-8") as f:
            answers = json.load(f)

    score_key = f"eval_score_{evaluator_name}"
    for experiment in tqdm(answers):
        if score_key in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            context=experiment["context"],
        )

        # Query the model again on every attempt: re-parsing the same reply could
        # never turn an invalid output into a valid one.
        for attempt_no in range(num_retries):
            eval_result = eval_chat_model.invoke(eval_prompt)
            try:
                score = _parse_eval_score(eval_result.content)
                break
            except ValueError:
                if attempt_no == num_retries - 1:
                    raise
                print("Error: Invalid output")

        experiment[score_key] = score
        print(score)

        # Rewrite the whole file after each entry so progress survives interruptions.
        with open(answer_path, "w", encoding="utf-8") as f:
            json.dump(answers, f)

In [11]:
# NOTE(review): `answers` is only assigned inside evaluate_answers(), so it is not
# defined at notebook scope and this cell raises NameError; consider removing it.
answers

NameError: name 'answers' is not defined

In [13]:

import os
from datasets import load_dataset

# exist_ok=True makes the cell safe to re-run and removes the check-then-create gap.
os.makedirs("./output", exist_ok=True)

GENERATOR_NAME = "EEVE-Korean-Instruct-10.8B-v1.0-GGUF:Q4_K_M"

# Load the evaluation set from a local JSON file; the loaded file is stored in the
# "train" split of the returned DatasetDict.
eval_dataset = load_dataset("json", data_files="eval_dataset_final.json")["train"]

settings_name = f"chunk_embeddings_rerank:True_reader-model:{GENERATOR_NAME}"
output_file_name = f"./output/rag_{settings_name}.json"

print(f"Running evaluation for {settings_name}:")

print("Loading knowledge base embeddings...")

print("Running RAG...")

# The generation step is disabled: the evaluation below scores whatever is already
# stored in output_file_name. Re-enable this call to regenerate the answers first.
# run_rag_tests(
#     eval_dataset=eval_dataset,
#     output_file=output_file_name,
#     verbose=False,
#     test_settings=settings_name,
# )

print("Running evaluation...")
evaluate_answers(
    output_file_name,
    eval_chat_model,
    evaluator_name,
    evaluation_prompt_template,
)

Running evaluation for chunk_embeddings_rerank:True_reader-model:EEVE-Korean-Instruct-10.8B-v1.0-GGUF:Q4_K_M:
Loading knowledge base embeddings...
Running RAG...
Running evaluation...


  1%|          | 1/159 [00:00<02:10,  1.21it/s]

4


  1%|▏         | 2/159 [00:01<01:29,  1.76it/s]

5


  2%|▏         | 3/159 [00:01<01:14,  2.10it/s]

5


  3%|▎         | 4/159 [00:02<01:15,  2.06it/s]

1


  3%|▎         | 5/159 [00:02<01:07,  2.27it/s]

5


  4%|▍         | 6/159 [00:02<01:11,  2.14it/s]

1


  4%|▍         | 7/159 [00:03<01:10,  2.17it/s]

2


  5%|▌         | 8/159 [00:04<01:33,  1.62it/s]

4


  6%|▌         | 9/159 [00:04<01:22,  1.83it/s]

4


  6%|▋         | 10/159 [00:05<01:14,  2.00it/s]

4


  7%|▋         | 11/159 [00:05<01:13,  2.03it/s]

4


  8%|▊         | 12/159 [00:06<01:09,  2.10it/s]

3


  8%|▊         | 13/159 [00:06<01:04,  2.26it/s]

4


  9%|▉         | 14/159 [00:06<01:03,  2.28it/s]

4


  9%|▉         | 15/159 [00:07<01:03,  2.27it/s]

4


 10%|█         | 16/159 [00:07<01:01,  2.32it/s]

5


 11%|█         | 17/159 [00:08<01:06,  2.12it/s]

1


 11%|█▏        | 18/159 [00:08<01:06,  2.11it/s]

1


 12%|█▏        | 19/159 [00:09<01:01,  2.26it/s]

5


 13%|█▎        | 20/159 [00:09<00:59,  2.32it/s]

5


 13%|█▎        | 21/159 [00:17<06:00,  2.61s/it]

3


 14%|█▍        | 22/159 [00:17<04:27,  1.95s/it]

3


 14%|█▍        | 23/159 [00:18<03:25,  1.51s/it]

3


 15%|█▌        | 24/159 [00:18<02:39,  1.18s/it]

3


 16%|█▌        | 25/159 [00:19<02:11,  1.02it/s]

4


 16%|█▋        | 26/159 [00:19<01:50,  1.21it/s]

3


 17%|█▋        | 27/159 [00:20<01:35,  1.38it/s]

4


 18%|█▊        | 28/159 [00:20<01:26,  1.51it/s]

5


 18%|█▊        | 29/159 [00:20<01:14,  1.74it/s]

4


 19%|█▉        | 30/159 [00:21<01:09,  1.87it/s]

5


 19%|█▉        | 31/159 [00:21<01:06,  1.92it/s]

5


 20%|██        | 32/159 [00:22<01:10,  1.79it/s]

5


 21%|██        | 33/159 [00:22<01:04,  1.95it/s]

3


 21%|██▏       | 34/159 [00:23<01:00,  2.07it/s]

3


 22%|██▏       | 35/159 [00:23<01:01,  2.02it/s]

4


 23%|██▎       | 36/159 [00:24<00:58,  2.12it/s]

4


 23%|██▎       | 37/159 [00:24<00:54,  2.24it/s]

5


 24%|██▍       | 38/159 [00:25<00:53,  2.25it/s]

4


 25%|██▍       | 39/159 [00:26<01:24,  1.43it/s]

5


 25%|██▌       | 40/159 [00:26<01:14,  1.60it/s]

5


 26%|██▌       | 41/159 [00:27<01:04,  1.82it/s]

5


 26%|██▋       | 42/159 [00:27<00:58,  2.01it/s]

5


 27%|██▋       | 43/159 [00:28<00:59,  1.95it/s]

2


 28%|██▊       | 44/159 [00:28<00:58,  1.98it/s]

5


 28%|██▊       | 45/159 [00:29<00:54,  2.07it/s]

5


 29%|██▉       | 46/159 [00:29<00:53,  2.13it/s]

3


 30%|██▉       | 47/159 [00:29<00:49,  2.28it/s]

4


 30%|███       | 48/159 [00:30<00:48,  2.28it/s]

5


 31%|███       | 49/159 [00:30<00:46,  2.38it/s]

4


 31%|███▏      | 50/159 [00:31<00:44,  2.45it/s]

3


 32%|███▏      | 51/159 [00:31<00:44,  2.44it/s]

3


 33%|███▎      | 52/159 [00:31<00:42,  2.51it/s]

1


 33%|███▎      | 53/159 [00:32<00:55,  1.89it/s]

2


 34%|███▍      | 54/159 [00:33<00:52,  2.01it/s]

4


 35%|███▍      | 55/159 [00:33<00:48,  2.13it/s]

4


 35%|███▌      | 56/159 [00:33<00:45,  2.25it/s]

4


 36%|███▌      | 57/159 [00:34<00:45,  2.24it/s]

2


 36%|███▋      | 58/159 [00:34<00:45,  2.23it/s]

1


 37%|███▋      | 59/159 [00:35<00:43,  2.27it/s]

5


 38%|███▊      | 60/159 [00:35<00:43,  2.30it/s]

1


 38%|███▊      | 61/159 [00:36<00:47,  2.07it/s]

1


 39%|███▉      | 62/159 [00:36<00:45,  2.16it/s]

3


 40%|███▉      | 63/159 [00:37<00:43,  2.23it/s]

4


 40%|████      | 64/159 [00:37<00:44,  2.12it/s]

1


 41%|████      | 65/159 [00:37<00:42,  2.23it/s]

1


 42%|████▏     | 66/159 [00:38<00:42,  2.18it/s]

2


 42%|████▏     | 67/159 [00:38<00:40,  2.25it/s]

5


 43%|████▎     | 68/159 [00:39<00:39,  2.32it/s]

1


 43%|████▎     | 69/159 [00:39<00:41,  2.17it/s]

4


 44%|████▍     | 70/159 [00:40<00:39,  2.26it/s]

2


 45%|████▍     | 71/159 [00:40<00:38,  2.29it/s]

5


 45%|████▌     | 72/159 [00:40<00:36,  2.35it/s]

5


 46%|████▌     | 73/159 [00:41<00:36,  2.34it/s]

1


 47%|████▋     | 74/159 [00:41<00:35,  2.40it/s]

2


 47%|████▋     | 75/159 [00:42<00:34,  2.40it/s]

5


 48%|████▊     | 76/159 [00:42<00:38,  2.15it/s]

2


 48%|████▊     | 77/159 [00:43<00:37,  2.21it/s]

4


 49%|████▉     | 78/159 [00:43<00:38,  2.12it/s]

5


 50%|████▉     | 79/159 [00:44<00:35,  2.24it/s]

4


 50%|█████     | 80/159 [00:44<00:37,  2.12it/s]

5


 51%|█████     | 81/159 [00:45<00:35,  2.21it/s]

5


 52%|█████▏    | 82/159 [00:45<00:33,  2.29it/s]

3


 52%|█████▏    | 83/159 [00:45<00:31,  2.38it/s]

1


 53%|█████▎    | 84/159 [00:46<00:43,  1.71it/s]

4


 53%|█████▎    | 85/159 [00:47<00:39,  1.87it/s]

5


 54%|█████▍    | 86/159 [00:47<00:36,  2.00it/s]

4


 55%|█████▍    | 87/159 [00:48<00:34,  2.11it/s]

1


 55%|█████▌    | 88/159 [00:48<00:34,  2.07it/s]

4


 56%|█████▌    | 89/159 [00:49<00:33,  2.11it/s]

4


 57%|█████▋    | 90/159 [00:49<00:34,  1.99it/s]

2


 57%|█████▋    | 91/159 [00:49<00:32,  2.12it/s]

3


 58%|█████▊    | 92/159 [00:50<00:31,  2.13it/s]

1


 58%|█████▊    | 93/159 [00:50<00:32,  2.06it/s]

5


 59%|█████▉    | 94/159 [00:51<00:29,  2.21it/s]

5


 60%|█████▉    | 95/159 [00:51<00:27,  2.30it/s]

1


 60%|██████    | 96/159 [00:52<00:26,  2.42it/s]

5


 61%|██████    | 97/159 [00:52<00:25,  2.46it/s]

5


 62%|██████▏   | 98/159 [00:52<00:24,  2.49it/s]

4


 62%|██████▏   | 99/159 [00:53<00:26,  2.30it/s]

5


 63%|██████▎   | 100/159 [00:53<00:25,  2.34it/s]

5


 64%|██████▎   | 101/159 [00:54<00:23,  2.43it/s]

3


 64%|██████▍   | 102/159 [00:54<00:25,  2.22it/s]

3


 65%|██████▍   | 103/159 [00:55<00:24,  2.30it/s]

5


 65%|██████▌   | 104/159 [00:55<00:23,  2.37it/s]

4


 66%|██████▌   | 105/159 [00:55<00:22,  2.37it/s]

5


 67%|██████▋   | 106/159 [00:56<00:21,  2.42it/s]

1


 67%|██████▋   | 107/159 [00:56<00:21,  2.43it/s]

4


 68%|██████▊   | 108/159 [00:57<00:21,  2.41it/s]

5


 69%|██████▊   | 109/159 [00:57<00:20,  2.42it/s]

1


 69%|██████▉   | 110/159 [00:57<00:20,  2.43it/s]

1


 70%|██████▉   | 111/159 [00:58<00:20,  2.33it/s]

4


 70%|███████   | 112/159 [00:58<00:19,  2.38it/s]

4


 71%|███████   | 113/159 [00:59<00:18,  2.48it/s]

3


 72%|███████▏  | 114/159 [00:59<00:19,  2.35it/s]

5


 72%|███████▏  | 115/159 [01:00<00:18,  2.40it/s]

3


 73%|███████▎  | 116/159 [01:00<00:17,  2.43it/s]

1


 74%|███████▎  | 117/159 [01:01<00:18,  2.24it/s]

4


 74%|███████▍  | 118/159 [01:01<00:19,  2.10it/s]

5


 75%|███████▍  | 119/159 [01:01<00:18,  2.19it/s]

5


 75%|███████▌  | 120/159 [01:02<00:19,  1.96it/s]

1


 76%|███████▌  | 121/159 [01:03<00:17,  2.11it/s]

4


 77%|███████▋  | 122/159 [01:03<00:17,  2.10it/s]

1


 77%|███████▋  | 123/159 [01:03<00:16,  2.16it/s]

1


 78%|███████▊  | 124/159 [01:04<00:15,  2.24it/s]

4


 79%|███████▊  | 125/159 [01:04<00:14,  2.32it/s]

3


 79%|███████▉  | 126/159 [01:08<00:47,  1.43s/it]

1


 80%|███████▉  | 127/159 [01:08<00:35,  1.12s/it]

4


 81%|████████  | 128/159 [01:09<00:28,  1.08it/s]

4


 81%|████████  | 129/159 [01:09<00:22,  1.31it/s]

1


 82%|████████▏ | 130/159 [01:10<00:21,  1.36it/s]

3


 82%|████████▏ | 131/159 [01:10<00:18,  1.52it/s]

5


 83%|████████▎ | 132/159 [01:11<00:15,  1.71it/s]

4


 84%|████████▎ | 133/159 [01:11<00:13,  1.91it/s]

1


 84%|████████▍ | 134/159 [01:12<00:12,  2.06it/s]

4


 85%|████████▍ | 135/159 [01:12<00:11,  2.16it/s]

4


 86%|████████▌ | 136/159 [01:12<00:10,  2.27it/s]

5


 86%|████████▌ | 137/159 [01:13<00:09,  2.22it/s]

4


 87%|████████▋ | 138/159 [01:13<00:09,  2.28it/s]

5


 87%|████████▋ | 139/159 [01:14<00:09,  2.18it/s]

5


 88%|████████▊ | 140/159 [01:14<00:09,  1.97it/s]

4


 89%|████████▊ | 141/159 [01:15<00:09,  1.97it/s]

5


 89%|████████▉ | 142/159 [01:15<00:08,  2.09it/s]

1


 90%|████████▉ | 143/159 [01:16<00:07,  2.19it/s]

3


 91%|█████████ | 144/159 [01:16<00:06,  2.26it/s]

1


 91%|█████████ | 145/159 [01:17<00:06,  2.31it/s]

4


 92%|█████████▏| 146/159 [01:17<00:05,  2.36it/s]

4


 92%|█████████▏| 147/159 [01:18<00:05,  2.07it/s]

1


 93%|█████████▎| 148/159 [01:18<00:04,  2.20it/s]

4


 94%|█████████▎| 149/159 [01:18<00:04,  2.33it/s]

4


 94%|█████████▍| 150/159 [01:19<00:03,  2.41it/s]

3


 95%|█████████▍| 151/159 [01:19<00:03,  2.42it/s]

5


 96%|█████████▌| 152/159 [01:20<00:03,  2.09it/s]

5


 96%|█████████▌| 153/159 [01:20<00:02,  2.08it/s]

5


 97%|█████████▋| 154/159 [01:21<00:02,  2.16it/s]

5


 97%|█████████▋| 155/159 [01:22<00:02,  1.59it/s]

5


 98%|█████████▊| 156/159 [01:22<00:01,  1.69it/s]

5


 99%|█████████▊| 157/159 [01:23<00:01,  1.91it/s]

1


 99%|█████████▉| 158/159 [01:23<00:00,  2.06it/s]

5


100%|██████████| 159/159 [01:23<00:00,  1.89it/s]

4





In [14]:
# Reuse output_file_name instead of repeating the long literal path, so this cell
# always reads the same file that evaluate_answers() updated.
eval_res = load_dataset("json", data_files=output_file_name)

Generating train split: 159 examples [00:00, 5855.55 examples/s]


In [21]:
# Mean judge score over every evaluated example. The column name is derived from
# evaluator_name and the divisor from the data, so neither can drift from the file.
score_column = f"eval_score_{evaluator_name}"
eval_scores = [row[score_column] for row in eval_res["train"]]
if not eval_scores:
    raise ValueError("No evaluated examples found; cannot compute a mean score.")

mean_eval_score = sum(eval_scores) / len(eval_scores)
print(mean_eval_score)

3.50314465408805


In [19]:
# Display the DatasetDict summary (splits, column names and row count).
eval_res

DatasetDict({
    train: Dataset({
        features: ['question', 'true_answer', 'context', 'generated_answer', 'test_settings', 'eval_score_GPT4o-mini'],
        num_rows: 159
    })
})