# Metrics

In [1]:
from opik.evaluation.metrics.score_result import ScoreResult

In [2]:
# NOTE: what our code calls the "BLEU score" is actually cosine similarity, which (I believe - unconfirmed) is computed with the class and function below.

import math
import re
from abc import ABC, abstractmethod
from collections import Counter
class Comparator(ABC):
    """Interface for objects that score how alike two strings are."""

    @abstractmethod
    def compare(self, string1, string2):
        """Return a similarity score for ``string1`` and ``string2``."""
        pass


class CosineSimilarity(Comparator):
    """Cosine similarity between the word-count vectors of two strings.

    Each string is lowercased and split into ``\\w+`` words; the score is the
    cosine of the angle between the two resulting word-count vectors.
    """

    def compare(self, string1, string2):
        """
        Compute the bag-of-words cosine similarity of two strings.

        Args:
            string1 (str): First string.
            string2 (str): Second string.

        Returns:
            float: A value between 0.0 and 1.0. Returns 0.0 when either string
            contains no words (the cosine is undefined for a zero vector).
        """
        # Tokenize each string exactly once and count words in a single pass.
        counts1 = Counter(self._tokenize(string1))
        counts2 = Counter(self._tokenize(string2))

        # A word absent from either string contributes 0 to the dot product,
        # so iterating over one string's words is sufficient. Counter returns
        # 0 for missing keys without inserting them.
        dot_product = sum(count * counts2[word] for word, count in counts1.items())
        squared_norm1 = sum(count * count for count in counts1.values())
        squared_norm2 = sum(count * count for count in counts2.values())

        if squared_norm1 == 0 or squared_norm2 == 0:
            # Avoid division by zero
            return 0.0

        # Take a single square root of the exact integer product rather than
        # multiplying two rounded square roots; this keeps the score of two
        # identical strings at exactly 1.0.
        return dot_product / math.sqrt(squared_norm1 * squared_norm2)

    def _tokenize(self, string):
        """
        Tokenize the input string into a list of words.

        Args:
            string (str): The string to tokenize.

        Returns:
            list: A list of lowercased words from the string.
        """
        return re.findall(r'\b\w+\b', string.lower())

    def _create_combined_set(self, string1, string2):
        """Return the set of distinct words appearing in either string."""
        return set(self._tokenize(string1)).union(self._tokenize(string2))

    def _vectorize(self, string, combined_set):
        """
        Count how often each word of ``combined_set`` occurs in ``string``.

        Args:
            string (str): The string to vectorize.
            combined_set (iterable): Vocabulary words; the output follows its
                iteration order.

        Returns:
            list: One occurrence count per vocabulary word.
        """
        counts = Counter(self._tokenize(string))
        return [counts[word] for word in combined_set]

In [3]:
# Opik expects the metric function to use exactly these parameter names (`dataset_item`, `llm_output`); otherwise it raises an opaque error that does not even mention that the parameter names are the problem.
def cosine_score(dataset_item, llm_output):
    """
    Opik metric: word-level cosine similarity between the reference answer and the model output.

    The parameter names are part of the contract with Opik and must not be renamed.

    Args:
        dataset_item: Mapping for one dataset row; its ``'answer'`` entry is the reference text.
        llm_output: Text produced by the model for this row.

    Returns:
        ScoreResult: Named "CosineSimilarity". When both texts are strings, ``value`` is the
        similarity and ``scoring_failed`` is False. When either one is not a string,
        ``value`` is 0.0 and ``scoring_failed`` is True, with the cause given in ``reason``.
    """
    reference = dataset_item.get('answer')

    # A missing 'answer' key yields None, and an empty cell may arrive as NaN (a float).
    # Either would otherwise crash inside the tokenizer with an unhelpful AttributeError,
    # so report an explicit scoring failure instead.
    if not isinstance(reference, str) or not isinstance(llm_output, str):
        return ScoreResult(
            name="CosineSimilarity",
            value=0.0,
            reason=(
                "Cosine similarity could not be computed: expected string inputs, got "
                f"answer={type(reference).__name__}, llm_output={type(llm_output).__name__}"
            ),
            scoring_failed=True,
        )

    score = CosineSimilarity().compare(reference, llm_output)
    return ScoreResult(name="CosineSimilarity", value=score, reason=f"Cosine similarity was found to be {score}", scoring_failed=False)

In [4]:
from dotenv import load_dotenv
# Load settings from a local .env file before any client is created
# (presumably the Opik / LLM provider credentials used below - verify).
load_dotenv()

True

## Create Dataset and Prompt

In [5]:
from opik_optimizer import ChatPrompt
from opik import Opik
import pandas as pd

# 'generated answer' is the column produced with the prompt 'given {{context}}, answer the {{question}}'.
# 'Answer Similarity' is the BLEU score between the generated answer and the reference answer.
# Only the columns listed here are kept from the CSV.
data = pd.read_csv('demo-dataset.csv')[['context','question','answer','generated answer', 'Answer Similarity', 'Groundedness']]

# Opik client bound to the 'fi' project.
cli= Opik(project_name='fi')

# Fetch the dataset by name (creating it if needed) and insert the DataFrame rows.
# NOTE(review): the insert runs on every execution of this cell - confirm that Opik de-duplicates rows.
dataset = cli.get_or_create_dataset(name='prompt-opt-test')
dataset.insert_from_pandas(dataframe=data)

# Baseline prompt to be optimized; its {context} and {question} placeholders
# match the column names selected above.
initial_prompt = ChatPrompt(
    name = 'init-qa-prompt',
    user='given {context}, answer the {question}',
    project_name="fi"
)

# Optimizers

In [6]:
from opik_optimizer import MetaPromptOptimizer, EvolutionaryOptimizer, FewShotBayesianOptimizer
from opik_optimizer.mipro_optimizer import MiproOptimizer

# Keyword arguments unpacked into the optimizer constructors in the cells below.
common_opt_params = dict(
    model='openai/gpt-4o-mini',
    temperature=0.5,
    verbose=1,
    n_threads=5,
    max_tokens=1024,
    # seed=47,
)

## Meta Prompt Optimizer
Mainly used for rewording the prompt; not really recommended for complex tasks such as agentic workflows or prompts with few-shot examples.

In [7]:
# Shared settings plus a separate reasoning model and a small search budget.
meta_optimizer = MetaPromptOptimizer(
    **common_opt_params,
    reasoning_model='openai/gpt-4o',  # presumably the model that writes the candidate prompts - verify
    max_rounds = 3,                   # number of optimization rounds
    num_prompts_per_round=3,          # candidate prompts generated and evaluated in each round
)

## Evolutionary Optimizer
The main use case is optimizing for a complex evaluation or a group of evaluations. It uses a much higher number of mutations to explore as many prompt combinations as possible.

In [8]:
# NOTE(review): tournament_size (4) is larger than population_size (3), and elitism_size (3)
# equals population_size. If elitism copies that many individuals into the next generation,
# no slots remain for new offspring - confirm these values are intended.
evo_optimizer = EvolutionaryOptimizer(
    **common_opt_params,
    population_size=3,                 # Number of prompts in each generation
    num_generations=2,                 # Number of iterations the algorithm will run
    mutation_rate=0.2,                  # Probability of mutating an individual
    crossover_rate=0.8,                 # Probability of crossing over two individuals
    tournament_size=4,                  # Size of the tournament for selection (if not MOO)
    elitism_size=3,                     # Number of best individuals to carry over (if not MOO)
    adaptive_mutation=True,             # presumably adjusts the mutation rate during the run - verify
    enable_llm_crossover=True           # prompts are recombined by an LLM during crossover
)

## FewShot Bayesian Optimizer
Used when prompts should contain few-shot examples. It can burn through tokens very easily.

In [9]:
# Built from the shared settings only; no optimizer-specific options are passed.
bayes_optimizer = FewShotBayesianOptimizer(
    **common_opt_params
)

## MIPRO Optimizer
The main use case is complex multi-step reasoning or tool use.

In [10]:
# Only the model name is taken from common_opt_params; the other shared settings
# (temperature, verbose, n_threads, max_tokens) are not passed to this optimizer.
mipro_optimizer = MiproOptimizer(
    model = common_opt_params['model'],
    project_name="fi"
)

## Final Optimize Prompt Run

In [11]:
# Arguments unpacked into optimize_prompt() for every optimizer except MIPRO.
common_opt_run_params = dict(
    prompt=initial_prompt,
    dataset=dataset,
    metric=cosine_score,
    n_samples=7,
)

In [12]:
# Optimizers run by the loop in the next cell; bayes_optimizer and mipro_optimizer are not included.
optims = [meta_optimizer, evo_optimizer]
# The evolutionary optimizer takes far longer to run and is not really recommended; it should be run separately.
# The few-shot Bayesian optimizer resulted in hallucinations.

In [13]:
from opik_optimizer import TaskConfig

# One optimization result per optimizer, in the same order as `optims`.
results = []

for optim in optims:
    print(f"Optimizing using {optim.__class__.__name__}")

    if not isinstance(optim, MiproOptimizer):
        # Every non-MIPRO optimizer takes the shared run arguments as-is.
        res = optim.optimize_prompt(**common_opt_run_params)
    else:
        # MIPRO is called with a TaskConfig built from the prompt text and
        # the dataset field names instead of the ChatPrompt object.
        res = optim.optimize_prompt(
            dataset=dataset,
            metric=cosine_score,
            task_config=TaskConfig(
                instruction_prompt=common_opt_run_params['prompt'].user,
                input_dataset_fields=['question', 'context'],
                output_dataset_field='answer',
                use_chat_prompt=True,
            ),
            num_candidates=1,
        )

    results.append(res)

Optimizing using MetaPromptOptimizer
╭────────────────────────────────────────────────────────────────────╮
│ [32m● [0mRunning Opik Evaluation - [34mMetaPromptOptimizer[0m                    │
│                                                                    │
│ -> View optimization details ]8;id=26225;https://www.comet.com/opik/api/v1/session/redirect/optimizations/?optimization_id=01997687-fd45-7ff1-8199-e297d5bf6b5f&dataset_id=01997609-de6f-707e-ba5c-3c7756e0eb62&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==\in your Opik dashboard]8;;\                │
╰────────────────────────────────────────────────────────────────────╯


> Let's optimize the prompt:

[2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
[2m│[0m                                                                    [2m│[0m
[2m│[0m  given {context}, answer the {question}                            [2m│[0m
[2m│[0m                                  

Output()

[32m  Baseline score was: 0.7537.[0m

> Starting the optimization run
│
│ - Starting optimization round 1 of 3
│    Generating candidate prompts:
[2m│      Successfully generated 3 candidate prompts[0m
│
│    Evaluating candidate prompt 1:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Using the provided {context}, answer the {question} as            [2m│[0m
│         [2m│[0m  accurately as possible. Ensure your response is concise and       [2m│[0m
│         [2m│[0m  directly addresses the question, reflecting the key points from   [2m│[0m
│         [2m│[0m  the context.                                                      [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰──────────────────────────────────────────────

Output()

[32m│          Evaluation score: 0.8394 (11.37%)[0m
│
│
│    Evaluating candidate prompt 2:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Refer to the {context} to provide a precise answer to the         [2m│[0m
│         [2m│[0m  {question}. Your response should be a well-structured sentence    [2m│[0m
│         [2m│[0m  that captures the essence of the context related to the           [2m│[0m
│         [2m│[0m  question.                                                         [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│        

Output()

[32m│          Evaluation score: 0.7661 (1.65%)[0m
│
│
│    Evaluating candidate prompt 3:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Based on the {context}, generate an answer to the {question}      [2m│[0m
│         [2m│[0m  that highlights the main goal or purpose mentioned. Ensure your   [2m│[0m
│         [2m│[0m  answer is succinct and aligns with the context provided.          [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         

Output()

[31m│          Evaluation score: 0.7532 (-0.07%)[0m
│
│
│    Completed optimization round 1 of 3
[32m│    Found a new best performing prompt: 0.8394 (11.37%)[0m
│
│ - Starting optimization round 2 of 3
│    Generating candidate prompts:
[2m│      Successfully generated 3 candidate prompts[0m
│
│    Evaluating candidate prompt 1:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Utilize the given {context} to provide a precise and concise      [2m│[0m
│         [2m│[0m  answer to the {question}. Ensure your response directly reflects  [2m│[0m
│         [2m│[0m  the main points from the context and addresses the question       [2m│[0m
│         [2m│[0m  accurately.                                                       [2m│[0m
│         [2m│[0m                                              

Output()

[31m│          Evaluation score: 0.7531 (-10.28%)[0m
│
│
│    Evaluating candidate prompt 2:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  From the provided {context}, extract and summarize the key        [2m│[0m
│         [2m│[0m  information needed to answer the {question}. Your response        [2m│[0m
│         [2m│[0m  should be concise and directly related to the question.           [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│       

Output()

[31m│          Evaluation score: 0.7854 (-6.43%)[0m
│
│
│    Evaluating candidate prompt 3:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Refer to the {context} to construct a clear and concise answer    [2m│[0m
│         [2m│[0m  to the {question}. Ensure your response highlights the core       [2m│[0m
│         [2m│[0m  elements from the context that are essential for addressing the   [2m│[0m
│         [2m│[0m  question.                                                         [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│        

Output()

[31m│          Evaluation score: 0.7089 (-15.55%)[0m
│
│
│    Completed optimization round 2 of 3
[31m│    No improvement in this optimization round[0m
│
│ - Starting optimization round 3 of 3
│    Generating candidate prompts:
[2m│      Successfully generated 3 candidate prompts[0m
│
│    Evaluating candidate prompt 1:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Using the provided {context}, generate a concise and precise      [2m│[0m
│         [2m│[0m  answer to the {question}. Focus on extracting key points from     [2m│[0m
│         [2m│[0m  the context that directly address the question. Ensure your       [2m│[0m
│         [2m│[0m  response is well-structured and reflects the main ideas           [2m│[0m
│         [2m│[0m  accurately.                                          

Output()

[31m│          Evaluation score: 0.7263 (-13.47%)[0m
│
│
│    Evaluating candidate prompt 2:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  From the given {context}, derive a direct and succinct answer to  [2m│[0m
│         [2m│[0m  the {question}. Highlight the essential elements that are         [2m│[0m
│         [2m│[0m  crucial for addressing the question. Your response should be      [2m│[0m
│         [2m│[0m  clear and reflect the core message of the context.                [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│       

Output()

[31m│          Evaluation score: 0.6867 (-18.18%)[0m
│
│
│    Evaluating candidate prompt 3:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Utilize the provided {context} to construct a concise answer to   [2m│[0m
│         [2m│[0m  the {question}. Ensure your response captures the main points     [2m│[0m
│         [2m│[0m  from the context that are necessary to accurately address the     [2m│[0m
│         [2m│[0m  question. The answer should be structured and directly relevant.  [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│       

Output()

[31m│          Evaluation score: 0.7316 (-12.84%)[0m
│
│
│    Completed optimization round 3 of 3
[31m│    No improvement in this optimization round[0m
│

> Optimization complete

[32m╭─[0m[32m Optimization results [0m[32m────────────────────────────────────────────[0m[32m─╮[0m
[32m│[0m                                                                    [32m│[0m
[32m│[0m  [1;32mPrompt was optimized and improved from 0.7537 to 0.8394 (11.37%)[0m  [32m│[0m
[32m│[0m                                                                    [32m│[0m
[32m│[0m  Optimized prompt:                                                 [32m│[0m
[32m│[0m  [2m╭─[0m[2m system [0m[2m────────────────────────────────────────────────────[0m[2m─╮[0m  [32m│[0m
[32m│[0m  [2m│[0m                                                              [2m│[0m  [32m│[0m
[32m│[0m  [2m│[0m  Using the provided {context}, answer the {question} as      [2m│[0m  [32m│[0m
[32m│[0m 

Output()

[32m  Baseline score was: 0.7537.[0m

> Creating [1;36m2[0m variations of the initial prompt
│
│    Generating [1;36m1[0m fresh prompts based on the task description.
│       [2;32mSuccessfully generated 1 fresh prompts based on the task description.[0m
│
│    Generating [1;36m1[0m variations of the initial prompt.
[2;32m│       Successfully generated 0 variations of the initial prompt).[0m
│
│ Successfully initialized population with [1;36m3[0m prompts.

> Let's now evaluate the initial population


Output()

Output()

Output()

[32m  Prompt 1 score was: 0.7536625117758369.[0m
[2m  Prompt 2 score was: 0.6211151415243.[0m
[2m  Prompt 3 score was: 0.6546858447019043.[0m

> Starting evolutionary algorithm optimization
│   Starting generation 1 of 2
│      Performing crossover - Combining multiple prompts into a new one.
│   [2m      Recombining prompts using an LLM.[0m
│   [2;32m      Crossover successful, prompts have been combined and edited.[0m
[2;32m│[0m
│      Performing mutation - Altering prompts to improve their performance.
│   [2;32m      Mutation successful, 0 prompts have been edited.[0m
[2;32m│[0m
│      Performing evaluation - Assessing 2 prompts' performance.


Output()

[2m│      Performed evaluation for prompt 0 - Score: 0.6007.[0m


Output()

[2m│      Performed evaluation for prompt 1 - Score: 0.6002.[0m
│   Generation 1 completed. No improvement in this generation.
│
│   Starting generation 2 of 2
│      Performing crossover - Combining multiple prompts into a new one.
│   [2m      Recombining prompts using an LLM.[0m
│   [2;32m      Crossover successful, prompts have been combined and edited.[0m
[2;32m│[0m
│      Performing mutation - Altering prompts to improve their performance.
│   [2;32m      Mutation successful, prompt has been edited using an LLM (semantic mutation).[0m
│   [2;32m      Mutation successful, 1 prompts have been edited.[0m
[2;32m│[0m
│      Performing evaluation - Assessing 2 prompts' performance.


Output()

[2m│      Performed evaluation for prompt 0 - Score: 0.6353.[0m


Output()

[2m│      Performed evaluation for prompt 1 - Score: 0.7033.[0m
│   Generation 2 completed. No improvement in this generation.
│


> Optimization complete

[32m╭─[0m[32m Optimization results [0m[32m────────────────────────────────────────────[0m[32m─╮[0m
[32m│[0m                                                                    [32m│[0m
[32m│[0m  [1;2;31mOptimization run did not find a better prompt than the initial [0m   [32m│[0m
[32m│[0m  [1;2;31mone.[0m                                                              [32m│[0m
[32m│[0m  [1;2;31mScore: 0.7537[0m                                                     [32m│[0m
[32m│[0m                                                                    [32m│[0m
[32m│[0m  Optimized prompt:                                                 [32m│[0m
[32m│[0m  [2m╭─[0m[2m user [0m[2m──────────────────────────────────────────────────────[0m[2m─╮[0m  [32m│[0m
[32m│[0m  [2m│[0m                    