# reference: 
- [SemanticSimilarityEvaluator](https://developers.llamaindex.ai/python/examples/evaluation/semantic_similarity_eval)
- [CorrectnessEvaluator](https://developers.llamaindex.ai/python/examples/evaluation/correctness_eval)
- [FaithfulnessEvaluator](https://developers.llamaindex.ai/python/examples/evaluation/faithfulness_eval)

## get test data

In [1]:
import os
import json

def json_load(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` (a dict for the example
        test data loaded in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: the test data contains Chinese text, and
    # without an encoding argument open() falls back to the locale default,
    # which is not UTF-8 on every platform.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Relative path of the sample record, built with the host OS separator.
test_data_file_path = os.path.join(
    *('data', 'source', 'example_test_data.json')
)
test_data = json_load(test_data_file_path)
test_data  # bare final expression so the notebook echoes the loaded record

{'context': '1.常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？\n\xa0\nA.上星、日月\nB.合谷、太衝\nC.內關、外關\nD.上關、下關',
 'reference_answer': {'qid': '1',
  'stem': '常見針灸配穴法中,所指的「四關穴」,為下列何穴位之組合?',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'json_gemma_response': {'qid': 1,
  'stem': '常見針灸配穴法中，所指的「四關穴」，為下列何穴位之組合？',
  'A': '上星、日月',
  'B': '合谷、太衝',
  'C': '內關、外關',
  'D': '上關、下關'},
 'llama_en_response': {'A': '主蒙，曜月',
  'B': '合座，夡里',
  'C': '到递，割递',
  'D': '主递，一递',
  'qid': 1,
  'stem': '台九気动组化。\n\nA.主蒙，曜月\nB.合座，夡里\nC.到递，割递\nD.主递，一递'}}

## setup

In [2]:
# Load environment variables from the .env file located by find_dotenv()
# (presumably the OpenAI credentials -- confirm against the local .env).
from dotenv import find_dotenv, load_dotenv

_ = load_dotenv(find_dotenv())

from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI

# Single LLM instance handed to the LLM-based evaluators in this notebook.
llm = OpenAI(model="gpt-5-mini", temperature=0, is_streaming=False)

# SemanticSimilarityEvaluator

In [3]:
# IPython introspection: a trailing "?" prints the signature and docstring.
SemanticSimilarityEvaluator?

Init signature:
SemanticSimilarityEvaluator(
    embed_model: Optional[llama_index.core.base.embeddings.base.BaseEmbedding] = None,
    similarity_fn: Optional[Callable[..., float]] = None,
    similarity_mode: Optional[llama_index.core.base.embeddings.base.SimilarityMode] = None,
    similarity_threshold: float = 0.8,
) -> None
Docstring:
Embedding similarity evaluator.

Evaluate the quality of a question answering system by
comparing the similarity between embeddings of the generated answer
and the reference answer.

Inspired by this paper:
- Semantic Answer Similarity for Evaluating Question Answering Models
    https://arxiv.org/pdf/2108.06130.pdf

Args:
    similarity_threshold (float): Embedding similarity threshold for "passing".
        Defaults to 0.8.
File:           ~/miniconda3/envs/rag30/lib/python3.12/site-packages/llama_index/core/evaluation/se…

In [8]:
semantic_evaluator = SemanticSimilarityEvaluator(similarity_threshold=0.8)
gd_result = await semantic_evaluator.aevaluate(
    response=str(test_data['json_gemma_response']),
    reference=str(test_data['reference_answer']),
)
print("Score: ", gd_result.score)
print("Passing: ", gd_result.passing)  # use similarity threshold 

Score:  0.9967628031093531
Passing:  True


In [9]:
bd_result = await semantic_evaluator.aevaluate(
    response=str(test_data['llama_en_response']),
    reference=str(test_data['reference_answer']),
)
print("Score: ", bd_result.score)
print("Passing: ", bd_result.passing)  # use similarity threshold 

Score:  0.8674033244211179
Passing:  True


# CorrectnessEvaluator

In [11]:
# IPython introspection: a trailing "?" prints the signature and docstring.
CorrectnessEvaluator?

Init signature:
CorrectnessEvaluator(
    llm: Optional[llama_index.core.llms.llm.LLM] = None,
    eval_template: Union[llama_index.core.prompts.base.BasePromptTemplate, str, NoneType] = None,
    score_threshold: float = 4.0,
    parser_function: Callable[[str], Tuple[Optional[float], Optional[str]]] = <function default_parser at 0x75669cef9620>,
) -> None
Docstring:
Correctness evaluator.

Evaluates the correctness of a question answering system.
This evaluator depends on `reference` answer to be provided, in addition to the
query string and response string.

It outputs a score between 1 and 5, where 1 is the worst and 5 is the best,
along with a reasoning for the score.
Passing is defined as a score greater than or equal to the given threshold.

Args:
    eval_template (Optional[Union[BasePromptTemplate, str]]):
        Template for the evaluation prompt.
    score_threshold (…

In [16]:
# LLM-graded correctness on a 1-5 scale; scores >= 4.0 count as passing.
correct_evaluator = CorrectnessEvaluator(
    llm=llm,
    score_threshold=4.0,
)

In [13]:
# MCQ-extraction prompt; the {text} placeholder is filled in via .format().
_mcq_extraction_template = (
    "Extract a multiple-choice question (MCQ) from the following text.\n"
    "If the original text does not provide an answer, "
    "omit the answer field entirely and do not attempt to guess it: {text}"
)
prompt_en_llama = PromptTemplate(_mcq_extraction_template)

In [14]:
# Grade the Gemma output against the reference for the extraction prompt.
query = prompt_en_llama.format(text=test_data['context'])

gemma_text = str(test_data['json_gemma_response'])
reference_text = str(test_data['reference_answer'])
result = correct_evaluator.evaluate(
    query=query,
    response=gemma_text,
    reference=reference_text,
)

# Report the grader's explanation, numeric score and pass/fail verdict.
print(f"feedback: {result.feedback}")
print(f"score: {result.score}")
print(f"passing: {result.passing}")

feedback: The generated MCQ matches the reference exactly in stem and options (only minor non-substantive difference in qid formatting as an integer vs string). It correctly omits an answer field as required.
score: 5.0
passing: True


In [17]:
# Grade the garbled Llama output against the same reference and prompt.
query = prompt_en_llama.format(text=test_data['context'])

llama_text = str(test_data['llama_en_response'])
reference_text = str(test_data['reference_answer'])
result = correct_evaluator.evaluate(
    query=query,
    response=llama_text,
    reference=reference_text,
)

# Report the grader's explanation, numeric score and pass/fail verdict.
print(f"feedback: {result.feedback}")
print(f"score: {result.score}")
print(f"passing: {result.passing}")

feedback: The generated answer is incorrect and largely garbled: the stem text is nonsensical and the options do not match the original question or the reference answer. It fails to extract the correct MCQ fields and provides wrong characters/words, so it is not a valid extraction.
score: 1.0
passing: False


# FaithfulnessEvaluator

In [18]:
FaithfulnessEvaluator?

Init signature:
FaithfulnessEvaluator(
    llm: 'Optional[LLM]' = None,
    raise_error: 'bool' = False,
    eval_template: 'Optional[Union[str, BasePromptTemplate]]' = None,
    refine_template: 'Optional[Union[str, BasePromptTemplate]]' = None,
) -> 'None'
Docstring:
Faithfulness evaluator.

Evaluates whether a response is faithful to the contexts
(i.e. whether the response is supported by the contexts or hallucinated.)

This evaluator only considers the response string and the list of context strings.

Args:
    raise_error(bool): Whether to raise an error when the response is invalid.
        Defaults to False.
    eval_template(Optional[Union[str, BasePromptTemplate]]):
        The template to use for evaluation.
    refine_template(Optional[Union[str, BasePromptTemplate]]):
        The template to use for refining the evalu…

In [19]:
# Faithfulness check: is the response supported by the supplied contexts?
evaluator = FaithfulnessEvaluator(llm=llm)

In [20]:
# IPython introspection: show the signature and docstring of evaluate().
evaluator.evaluate?

Signature:
evaluator.evaluate(
    query: Optional[str] = None,
    response: Optional[str] = None,
    contexts: Optional[Sequence[str]] = None,
    **kwargs: Any,
) -> llama_index.core.evaluation.base.EvaluationResult
Docstring:
Run evaluation with query string, retrieved contexts,
and generated response string.

Subclasses can override this method to provide custom evaluation logic and
take in additional arguments.
File:      ~/miniconda3/envs/rag30/lib/python3.12/site-packages/llama_index/core/evaluation/base.py
Type:      method

In [26]:
# Check the Gemma extraction against the raw question text as sole context.
gemma_text = str(test_data['json_gemma_response'])
eval_result = evaluator.evaluate(
    response=gemma_text,
    contexts=[test_data['context']],
)
print(f"feedback: {eval_result.feedback}, passing: {eval_result.passing}, score: {eval_result.score}")

feedback: YES, passing: True, score: 1.0


In [27]:
# Negative case: add an answer that the source question text does not contain.
# Build a new dict instead of aliasing test_data['json_gemma_response'];
# assigning 'ans' on the alias would mutate the shared test data, so
# re-running any earlier cell would evaluate the polluted record.
fake_response = {**test_data['json_gemma_response'], 'ans': 'C'}
eval_result = evaluator.evaluate(
    contexts=[test_data['context']],
    response=str(fake_response),
)
print(f"feedback: {eval_result.feedback}, passing: {eval_result.passing}, score: {eval_result.score}")

feedback: NO, passing: False, score: 0.0
