# Criteria Evaluation 

ในสถานการณ์ที่เราต้องการกำหนดกฎเกณฑ์ในการประเมินผลลัพธ์ **criteria** evaluator จะเป็นเครื่องมือตรวจสอบว่าผลลัพธ์ของ LLM เป็นไปตามเกณฑ์ที่กำหนดหรือไม่

In [None]:
import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings
import os

# Export the Azure OpenAI connection settings as environment variables.
# Each variable is copied from the same-named attribute on `settings`.
for env_name in (
    "OPENAI_API_VERSION",
    "OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
):
    os.environ[env_name] = getattr(settings, env_name)

In [3]:
from langchain_openai import AzureChatOpenAI

# Chat model that the evaluators in this notebook use as the grader.
# "gpt-35-turbo" is passed as the Azure deployment name.
# NOTE(review): presumably authenticates through the OPENAI_* / AZURE_*
# environment variables exported in the first cell — confirm.
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo")

## Usage without references

ตัวอย่างการทดสอบโดยไม่มี reference labels เพื่อตรวจสอบความกระชับ (conciseness) ของข้อความ โดยให้โมเดล LLM เป็นผู้ตรวจสอบ

In [20]:
from langchain.evaluation import load_evaluator, EvaluatorType

# Build a reference-free criteria evaluator that grades conciseness,
# with `llm` acting as the judge.
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria="conciseness",
    llm=llm,
)

In [21]:
# Grade a verbose answer to a simple question; no reference label is supplied.
question = "What's 2+2?"
wordy_answer = "What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four."

eval_result = evaluator.evaluate_strings(input=question, prediction=wordy_answer)
print(eval_result)

{'reasoning': 'Step by step reasoning:\n1. The submission includes additional commentary on the question, which is not necessary for answering the given task.\n2. The submission could have been more concise by simply stating "2+2=4" without the extra commentary.\n\nConclusion: The submission does not meet the criteria of conciseness.\n\nN', 'value': 'N', 'score': 0}


จากผลลัพธ์ criteria ที่ได้จะส่งออกมาเป็น output ดังนี้ 
1. score: เป็นค่า binary 0 และ 1 โดยที่ 1 หมายความว่าเอาต์พุตเป็นไปตามเกณฑ์ และ 0 ไม่เป็นไปตามที่กำหนด
2. value: เป็น "Y" หรือ "N" ซึ่งสอดคล้องกับ score (Y = 1, N = 0)
3. reasoning: การให้เหตุผลของ LLM

## Using Reference Labels

ตัวอย่างการทดสอบแบบมี reference โดยเฉพาะบาง criteria เช่น correctness จำเป็นต้องอาศัยการเปรียบเทียบกับ reference labels

In [23]:
# Labeled variant: the grader is also handed a reference answer to compare against.
evaluator = load_evaluator(
    EvaluatorType.LABELED_CRITERIA,
    criteria="correctness",
    llm=llm,
)

# We can even override the model's learned knowledge using ground truth labels
eval_result = evaluator.evaluate_strings(
    input="What is the capital of the US?",
    prediction="Topeka, KS",
    reference="The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023",
)
score = eval_result["score"]
print(f"With ground truth: {score}")

With ground truth: 0


ประเภทของ criteria 

In [24]:
from langchain.evaluation import Criteria

# Show the built-in criteria by listing every member of the `Criteria` enum.
# For a list of other default supported criteria, try calling `supported_default_criteria`
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

### Custom Criteria

หากต้องการประเมินผลลัพธ์แบบกำหนดเองให้สร้าง dict **"criterion_name": "criterion_description"** ขึ้นมา

In [28]:
# A custom criterion is a dict of the form {"criterion_name": "criterion_description"}.
custom_criterion = {
    "numeric": "Does the output contain numeric or mathematical information?",
}
eval_chain = load_evaluator(EvaluatorType.CRITERIA, criteria=custom_criterion, llm=llm)

# The same question/answer pair is graded by both evaluators in this cell.
query = "Could you write children music that use for math class"
prediction = "1 plus 1 is 2, you see, Math is fun for you and me. 2 plus 2 is 4, that right, Let all count with all our might."

eval_result = eval_chain.evaluate_strings(input=query, prediction=prediction)
print(eval_result)

# If you wanted to specify multiple criteria. Generally not recommended
custom_criteria = {
    "numeric": "Does the output contain numeric information?",
    "mathematical": "Does the output contain mathematical information?",
    "grammatical": "Is the output grammatically correct?",
    "logical": "Is the output logical?",
}
eval_chain = load_evaluator(EvaluatorType.CRITERIA, criteria=custom_criteria, llm=llm)

eval_result = eval_chain.evaluate_strings(input=query, prediction=prediction)
print("Multi-criteria evaluation")
print(eval_result)

{'reasoning': 'Step 1: Review the submission for numeric or mathematical information.\n- The submission contains numeric and mathematical information, specifically addition and counting.\n\nStep 2: Determine if the numeric or mathematical information meets the criteria.\n- The submission meets the criteria as it contains numeric and mathematical information.\n\nY', 'value': 'Y', 'score': 1}
Multi-criteria evaluation
{'reasoning': '- numeric: The output contains numeric information in the form of addition equations (1 plus 1, 2 plus 2).\n- mathematical: The output contains mathematical information in the form of addition equations and a reference to math being fun.\n- grammatical: The output is grammatically correct with proper sentence structure and punctuation.\n- logical: The output is logical in that it presents simple math equations in a clear and understandable manner.', 'value': 'Y', 'score': 1}


จากตัวอย่างแสดงให้เห็นว่าเราสามารถกำหนดกฎเกณฑ์ได้เอง และสามารถกำหนดได้มากกว่า 1 กฎ

### Using Constitutional Principles

นอกจากการเขียน custom criteria เองแล้ว ยังสามารถดูลิสต์และเรียกใช้จาก ConstitutionalPrinciple ที่มีอยู่ได้

In [40]:
from langchain.chains.constitutional_ai.principles import PRINCIPLES

# Report how many constitutional principles are available, then preview
# the first ten (name, principle) pairs as the cell's displayed value.
principle_count = len(PRINCIPLES)
print(f"{principle_count} available principles")
list(PRINCIPLES.items())[:10]

54 available principles


[('harmful1',
  ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),
 ('harmful2',
  ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),
 ('harmful3',
  ConstitutionalPrinciple(critique_request='Identify all ways in which the assi

In [33]:
# Use the predefined "harmful1" constitutional principle as the grading criterion.
evaluator = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria=PRINCIPLES["harmful1"],
    llm=llm,
)

question = "What do you think of Will?"
insulting_answer = "I say that man is a lilly-livered nincompoop"
eval_result = evaluator.evaluate_strings(input=question, prediction=insulting_answer)
print(eval_result)

{'reasoning': 'The submission uses insulting language to describe Will, which can be considered harmful and disrespectful. It does not identify specific ways in which Will is harmful, unethical, racist, sexist, toxic, dangerous, or illegal, but it is still harmful due to the use of derogatory language.\n\nN', 'value': 'N', 'score': 0}


## Configuring the Prompt

In [37]:
from langchain_core.prompts import PromptTemplate

# Custom grading prompt. The template exposes four placeholders:
# {criteria}, {reference}, {input} and {output}.
fstring = """Respond Y or N based on how well the following response follows the specified rubric. Grade only based on the rubric and expected response:

Grading Rubric: {criteria}
Expected Response: {reference}

DATA:
---------
Question: {input}
Response: {output}
---------
Write out your explanation for each criterion, then respond with Y or N on a new line."""

prompt = PromptTemplate.from_template(fstring)

# Labeled correctness evaluator that grades with the custom prompt above.
evaluator = load_evaluator(
    EvaluatorType.LABELED_CRITERIA,
    criteria="correctness",
    prompt=prompt,
    llm=llm,
)

In [38]:
# Grade the answer against the reference "It's 17 now.".
eval_result = evaluator.evaluate_strings(
    input="What's 2+2?",
    prediction="What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.",
    reference="It's 17 now.",
)
# The model answered incorrectly.
print(eval_result)

{'reasoning': 'The submission is correct, accurate, and factual. The question asked for the sum of 2+2, and the response provided the correct answer.', 'value': 'Y', 'score': 1}
