# HellaSwag

HellaSwag เป็นเกณฑ์การประเมิน LLM โดยการเติมประโยคให้สมบูรณ์ โดยครอบคลุมในหลายสาขาวิชา มากกว่า 10,000 รายการ

### Arguments
- [Optional] tasks: a list of tasks (HellaSwagTask enums), which specifies the subject areas for sentence completion evaluation. By default, this is set to all tasks. The list of HellaSwagTask enums can be found in the DeepEval HellaSwag benchmark documentation.
- [Optional] n_shots: the number of "shots" to use for few-shot learning. This is set to 10 by default and cannot exceed 15.

In [None]:
from langchain_openai import AzureChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings
import os

# Export the Azure OpenAI connection settings from the local config module as
# environment variables (presumably read by the AzureChatOpenAI client created
# later in this notebook — confirm against the langchain_openai docs).
os.environ.update(
    {
        "OPENAI_API_VERSION": settings.OPENAI_API_VERSION,
        "OPENAI_API_KEY": settings.OPENAI_API_KEY,
        "AZURE_OPENAI_ENDPOINT": settings.AZURE_OPENAI_ENDPOINT,
    }
)

class AzureOpenAI(DeepEvalBaseLLM):
    """Adapter that exposes an already-built chat model as a DeepEvalBaseLLM.

    The wrapped object must provide ``invoke`` and ``ainvoke`` methods whose
    results carry the reply text in a ``content`` attribute.
    """

    def __init__(self, model):
        # Keep the caller-supplied chat model; the base-class initializer is
        # intentionally not invoked here (matches the original behaviour).
        self.model = model

    def load_model(self):
        """Return the chat model that was passed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model synchronously and return the reply text."""
        return self.load_model().invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``; awaits the model's ``ainvoke``."""
        reply = await self.load_model().ainvoke(prompt)
        return reply.content

    def get_model_name(self):
        """Return the fixed display name used for this wrapper."""
        return "Custom Azure OpenAI Model"

# Build the LangChain Azure chat client; replace the deployment name with the
# one configured in your own Azure OpenAI resource.
custom_model = AzureChatOpenAI(deployment_name="gpt-35-turbo")
# Wrap the client in the DeepEval-compatible adapter defined above.
azure_openai = AzureOpenAI(model=custom_model)

In [None]:
from deepeval.benchmarks import HellaSwag
from deepeval.benchmarks.tasks import HellaSwagTask

# Define the benchmark restricted to two HellaSwag tasks, using 5 few-shot
# examples per prompt instead of the default.
benchmark = HellaSwag(
    tasks=[HellaSwagTask.TRIMMING_BRANCHES_OR_HEDGES, HellaSwagTask.BATON_TWIRLING],
    n_shots=5,
)

# Evaluate the custom model instance created earlier in this notebook
# (`azure_openai`); substitute your own DeepEvalBaseLLM subclass instance here
# if you are benchmarking a different model.
benchmark.evaluate(model=azure_openai)
# Print the aggregate score computed by the evaluation run.
print(benchmark.overall_score)

The overall_score for this benchmark ranges from 0 to 1, where 1 signifies perfect performance and 0 indicates no correct answers. The model's score, based on exact matching, is calculated by determining the proportion of multiple-choice sentence-completion questions for which the model produces the precise correct letter answer (e.g. 'A') in relation to the total number of questions.

As a result, utilizing more few-shot prompts (n_shots) can greatly improve the model's robustness in generating answers in the exact correct format and boost the overall score.