In [6]:
import logging
import random
import os
from dotenv import load_dotenv

# One-time kernel setup: step up to the parent directory and load the .env file
# (override=True lets values from .env replace variables already in the environment).
# The `dir_changed` flag keeps a re-run of this cell from climbing another level.
if "dir_changed" not in globals():
    os.chdir("..")
    loaded = load_dotenv(override=True)
    dir_changed = True

In [7]:
from prompt_optimization.utils.data import load_summeval_jsonl
from prompt_optimization.prompt_eval.exact_match_evaluator import (
    ExactMatchEvaluatorConfig,
    ExactMatchEvaluator,
)
from prompt_optimization.types.prompt import Prompt
from prompt_optimization.utils.data import (
    load_rewardbench_references,
    CHAT_SUBSETS,
    CHAT_HARD_SUBSETS,
    SAFETY_SUBSETS,
    REASONING_SUBSETS,
)
from prompt_optimization.prompt_eval.dict_similarity_evaluator import (
    DictSimilarityEvaluator,
    DictSimilarityEvaluatorConfig,
)
from prompt_optimization.llm.open_ai_compliant_llm_engine import (
    OpenAICompliantLLMEngine,
)

In [9]:
# Keep library logging quiet: only WARNING and above reach the root logger.
logging.basicConfig(level=logging.WARNING)

# SummEval references, read from the path relative to the current working directory.
references = load_summeval_jsonl("data/summeval.jsonl")

# Fixed seed so the permutation (and therefore both splits) is reproducible.
random.seed(123)
references_shuffled = random.sample(references, k=len(references))

# First 25 shuffled items are the training pool; items 50..149 are the test pool.
# Items 25..49 are not used in this cell.
se_train, se_test = references_shuffled[:25], references_shuffled[50:150]

In [10]:
# Per-subset split sizes: the first `training_examples_per_subset` examples of every
# subset go to rb_train, the following `validations_per_subset` examples go to rb_test.
training_examples_per_subset = 1
validations_per_subset = 8

# RewardBench subsets grouped by category.
subsets_map = {
    "chat": CHAT_SUBSETS,
    "chat_hard": CHAT_HARD_SUBSETS,
    "safety": SAFETY_SUBSETS,
    "reasoning": REASONING_SUBSETS,
}

# Flat list of subset names, in category order (dict insertion order).
subsets_all = [subset for subsets in subsets_map.values() for subset in subsets]

# Load each subset separately so that every subset can be sliced on its own.
subset_data = {subset: load_rewardbench_references([subset]) for subset in subsets_all}

# Exclusive end index of the test slice inside each subset.
test_stop = training_examples_per_subset + validations_per_subset

rb_train = [
    example
    for subset in subsets_all
    for example in subset_data[subset][:training_examples_per_subset]
]

# Iterating `subsets_all` visits the subsets in the same order as walking
# `subsets_map` category by category, so the resulting order is unchanged.
rb_test = [
    example
    for subset in subsets_all
    for example in subset_data[subset][training_examples_per_subset:test_stop]
]

Generating raw split: 100%|██████████| 5123/5123 [00:00<00:00, 93423.56 examples/s]
Generating filtered split: 100%|██████████| 2985/2985 [00:00<00:00, 139506.35 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 62722.89 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:00, 165704.87 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 149826.86 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:00, 131162.63 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 144720.79 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:00, 126172.77 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 140440.65 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:00, 166482.69 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 128386.58 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:00, 179786.86 examples/s]
Filter: 100%|██████████| 5123/5123 [00:00<00:00, 121202.01 examples/s]
Filter: 100%|██████████| 2985/2985 [00:00<00:0

In [12]:
def _llama31_70b_engine(api_key_env, **engine_kwargs):
    """Build an engine for Meta-Llama-3.1-70B-Instruct on the DeepInfra endpoint.

    Args:
        api_key_env: Name of the environment variable that holds the API key.
        **engine_kwargs: Extra keyword arguments forwarded unchanged to
            ``OpenAICompliantLLMEngine`` (e.g. ``temperature``).

    Raises:
        KeyError: If ``DEEPINFRA_BASE_URL`` or ``api_key_env`` is not set.
    """
    return OpenAICompliantLLMEngine(
        base_url=os.environ["DEEPINFRA_BASE_URL"],
        api_keys=[os.environ[api_key_env]],
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        **engine_kwargs,
    )


# Each engine uses its own API key. The two *_eval engines pin temperature to 0.0;
# the other two leave the engine's default temperature untouched.
llama31_70b = _llama31_70b_engine("DEEPINFRA_API_KEY_A1")
llama31_70b_eval = _llama31_70b_engine("DEEPINFRA_API_KEY_A2", temperature=0.0)
llama31_70b_b = _llama31_70b_engine("DEEPINFRA_API_KEY_A3")
llama31_70b_eval_b = _llama31_70b_engine("DEEPINFRA_API_KEY_A4", temperature=0.0)

In [13]:
# System prompt shared by the SummEval evaluation prompts.
se_system = """You are an expert system at evaluating created summaries on the dimensions of coherence, consistency, fluency, and relevance."""

# Long-form SummEval instruction: spells out the four quality dimensions before
# asking for the scores (the wording appears to be taken from the SummEval paper).
# The example output is a well-formed JSON object — each key is quoted on its own
# and followed by its value — so a reply that mirrors it can be parsed as a dict.
se_paper_instructions = """
Consider these dimensions of summary quality:

Coherence - the collective quality of all sentences.
We align this dimension with the DUC quality
question (Dang, 2005) of structure and coherence
whereby "the summary should be well-structured
and well-organized. The summary should not just
be a heap of related information, but should build
from sentence to sentence to a coherent body of
information about a topic."

Consistency - the factual alignment between the
summary and the summarized source. A factually
consistent summary contains only statements that
are entailed by the source document. Annotators
were also asked to penalize summaries that contained hallucinated facts.

Fluency - the quality of individual sentences.
Drawing again from the DUC quality guidelines,
sentences in the summary "should have no formatting problems, capitalization errors or obviously
ungrammatical sentences (e.g., fragments, missing
components) that make the text difficult to read."

Relevance - selection of important content from
the source. The summary should include only
important information from the source document.

You will now evaluate the quality of a given summary, with respect to a reference, on these
four dimensions.

Output your evaluation as a JSON object like this:
{
    "coherence": <float between 1 and 5>,
    "consistency": <float between 1 and 5>,
    "fluency": <float between 1 and 5>,
    "relevance": <float between 1 and 5>
}

Before you output the JSON dict, lay out your reasoning step by step.
"""

# Short SummEval instruction: same task and output format as the long-form prompt,
# without the dimension definitions. The example output is a well-formed JSON
# object (quoted key, colon, value) so a reply that mirrors it parses as a dict.
se_instructions = """
You will evaluate the quality of a given summary, with respect to a reference on four dimensions.

Output your evaluation as a JSON object like this:
{
    "coherence": <float between 1 and 5>,
    "consistency": <float between 1 and 5>,
    "fluency": <float between 1 and 5>,
    "relevance": <float between 1 and 5>
}

Before you output the JSON dict, lay out your reasoning step by step.
"""

# System prompt for the RewardBench pairwise-judging task.
rb_system = (
    "You are a fair judge assistant tasked with providing clear, objective feedback "
    "based on specific criteria, ensuring each assessment reflects the absolute "
    "standards set for performance."
)

# Pairwise-judge instruction, assembled one sentence per entry and joined with
# single spaces. The verdict format offers only "[[A]]" and "[[B]]"; the
# '"[[C]]" for a tie' option was removed.
rb_prompt_v2 = " ".join(
    (
        "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below.",
        "You should choose the assistant that follows the user's instructions and answers the user's question better.",
        "Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses.",
        "Begin your evaluation by comparing the two responses and provide a short explanation.",
        "Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision.",
        "Do not allow the length of the responses to influence your evaluation.",
        "Do not favor certain names of the assistants.",
        "Be as objective as possible.",
        "After providing your explanation, output your final verdict by strictly following this format: "
        '"[[A]]" if assistant A is better, "[[B]]" if assistant B is better.',
    )
)

In [14]:
# SummEval baselines: the long-form (paper-style) instruction first, then the short one.
prompt_set_se = [
    Prompt(prompt=instruction_text)
    for instruction_text in (se_paper_instructions, se_instructions)
]

# RewardBench baseline: a single pairwise-judge instruction.
prompt_set_rb = [
    Prompt(prompt=rb_prompt_v2),
]

In [15]:
conf = DictSimilarityEvaluatorConfig(
    testset=se_test,
    llm=llama31_70b_eval,
    validation=True,
    system_prompt=se_system,
    instruction_execution_prompt="{input}\n\n{instruction}",
)
se_val = DictSimilarityEvaluator(conf)

conf = ExactMatchEvaluatorConfig(
    testset=rb_test,
    llm=llama31_70b_eval_b,
    validation=True,
    output_extraction_regex=r"\[\[.*\]\]",
    system_prompt=rb_system,
    instruction_execution_prompt="{instruction}\n\n{input}",
)
rb_val = ExactMatchEvaluator(conf)

await se_val(prompt_set_se)

In [30]:
# Score the RewardBench baseline prompt with rb_val (configured on rb_test).
# NOTE(review): `sequential=False` presumably lets the evaluator issue its
# requests concurrently — confirm against ExactMatchEvaluator.
await rb_val(prompt_set_rb, sequential=False)

Eval on prompt: Please act as an impartial jud...:   0%|          | 0/176 [00:00<?, ?it/s]{"message": "Cache hit from key: {\"base_url\": \"https://api.deepinfra.com/v1/openai/\", \"max_tokens\": 4096, \"messages\": [{\"content\": \"Y...", "timestamp": "2024-08-19T11:06:04.661376Z", "severity": "INFO", "labels": {"view": "query-processing", "searchId": ""}}
{"message": "Prompt: Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow

[Prompt(prompt='Please act as an impartial judge and evaluate the quality of the respo [...]  assistant B is better.' (len=915), mean_score=None, mean_validation_score=0.7386, zero_score_cases=False)]

In [31]:
# Display the evaluated prompt set; each Prompt's repr includes its mean_validation_score.
prompt_set_rb

[Prompt(prompt='Please act as an impartial judge and evaluate the quality of the respo [...]  assistant B is better.' (len=915), mean_score=None, mean_validation_score=0.7386, zero_score_cases=False)]