# Metrics

In [1]:
from opik.evaluation.metrics.score_result import ScoreResult

In [2]:
# In our code the "BLEU score" is actually cosine similarity, which appears to be computed by the class and function below (unconfirmed).

import math
import re
from abc import ABC, abstractmethod
from collections import Counter
class Comparator(ABC):
    """Interface for objects that score how similar two strings are."""

    @abstractmethod
    def compare(self, string1, string2):
        """Return a similarity score for ``string1`` and ``string2``."""


class CosineSimilarity(Comparator):
    """Bag-of-words cosine similarity between two strings.

    Each string is lowercased and split into word tokens; the score is the
    cosine of the angle between the two token-frequency vectors.
    """

    def compare(self, string1, string2):
        """
        Compute the cosine similarity of two strings.

        Args:
            string1 (str | None): First text. ``None`` is treated as empty.
            string2 (str | None): Second text. ``None`` is treated as empty.

        Returns:
            float | int: A value between 0 and 1; ``0`` when either text
            contains no tokens (this also avoids a division by zero).
        """
        # Tokenize and count each string exactly once.
        counts1 = Counter(self._tokenize(string1))
        counts2 = Counter(self._tokenize(string2))
        # Only tokens present in both strings contribute to the dot product.
        dot_product = sum(
            freq * counts2[token]
            for token, freq in counts1.items()
            if token in counts2
        )
        magnitude_product = self._magnitude(counts1) * self._magnitude(counts2)
        if magnitude_product == 0:
            # Avoid division by zero
            return 0
        return dot_product / magnitude_product

    @staticmethod
    def _magnitude(counts):
        """Return the Euclidean norm of a token-frequency mapping."""
        return math.sqrt(sum(freq ** 2 for freq in counts.values()))

    def _tokenize(self, string):
        """
        Tokenize the input string into a list of words.

        Args:
            string (str | None): The string to tokenize.

        Returns:
            list: A list of lowercased words from the string; empty when
            ``string`` is ``None``.
        """
        if string is None:
            # A missing text (e.g. an absent dataset field) has no tokens.
            return []
        return re.findall(r'\b\w+\b', string.lower())

    def _create_combined_set(self, string1, string2):
        """Return the set of distinct tokens appearing in either string.

        Not used by ``compare`` any more; kept for existing callers.
        """
        return set(self._tokenize(string1)).union(self._tokenize(string2))

    def _vectorize(self, string, combined_set):
        """Return the frequency of each word of ``combined_set`` in ``string``.

        The result follows the iteration order of ``combined_set``. Not used
        by ``compare`` any more; kept for existing callers.
        """
        # One counting pass instead of a list.count() scan per vocabulary word.
        counts = Counter(self._tokenize(string))
        return [counts[word] for word in combined_set]

In [3]:
# Opik expects metric functions to use exactly these parameter names (dataset_item, llm_output); otherwise it raises an unhelpful error that does not mention the parameter names.
def cosine_score(dataset_item, llm_output):
    """Score ``llm_output`` against the dataset item's ``answer`` field.

    Args:
        dataset_item: Mapping for one dataset row; its ``'answer'`` entry is
            read with ``.get``.
        llm_output: Text to compare with the reference answer.

    Returns:
        ScoreResult: Named ``"CosineSimilarity"``, carrying the similarity as
        its value and a short textual reason.
    """
    reference_answer = dataset_item.get('answer')
    similarity = CosineSimilarity().compare(reference_answer, llm_output)
    return ScoreResult(
        name="CosineSimilarity",
        value=similarity,
        reason=f"Cosine similarity was found to be {similarity}",
        scoring_failed=False,
    )

## FutureAGI Metrics

In [4]:
from fi.evals import Evaluator
# Shared evaluator client, built once and reused by every fi_eval call.
# NOTE(review): this runs before load_dotenv() is called in a later cell; if
# Evaluator reads credentials from the environment at construction time,
# values from .env would not be visible yet on a fresh kernel — confirm.
evaluator = Evaluator()

def fi_eval(dataset_item, llm_output):
    """Score ``llm_output`` with the FutureAGI ``context_adherence`` template.

    Args:
        dataset_item: Mapping for one dataset row; its ``"context"`` entry is
            read with ``.get``.
        llm_output: Model output to evaluate against that context.

    Returns:
        ScoreResult: Named ``"FutureAGI Eval"``, built from the output and
        reason of the first evaluation result.
    """
    eval_inputs = {
        "context": dataset_item.get("context"),
        "output": llm_output,
    }
    # `evaluation` rather than `eval`, so the builtin is not shadowed.
    evaluation = evaluator.evaluate(
        eval_templates="context_adherence",
        model_name="turing_flash",
        inputs=eval_inputs,
    )
    # Only the first entry of eval_results is used.
    first_result = evaluation.eval_results[0]
    return ScoreResult(
        name="FutureAGI Eval",
        value=first_result.output,
        reason=first_result.reason,
    )



In [5]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably API credentials for the services used in this notebook — confirm).
load_dotenv()

True

## Create Dataset and Prompt

In [6]:
from opik_optimizer import ChatPrompt
from opik import Opik
import pandas as pd

# The 'generated answer' column was produced with the prompt
# 'given {{context}}, answer the {{question}}'.
# 'Answer Similarity' is the bleu score between 'generated answer' and 'answer'.
data = pd.read_csv('demo-dataset.csv')[
    [
        'context',
        'question',
        'answer',
        'generated answer',
        'Answer Similarity',
        'Groundedness',
    ]
]

# Opik client bound to the 'fi' project.
cli = Opik(project_name='fi')

# Fetch the dataset (creating it when absent) and insert the CSV rows into it.
dataset = cli.get_or_create_dataset(name='prompt-opt-test')
dataset.insert_from_pandas(dataframe=data)

# Starting prompt handed to the optimizers.
initial_prompt = ChatPrompt(
    name='init-qa-prompt',
    user='given {context}, answer the {question}',
    project_name="fi",
)

# Optimizers

In [7]:
from opik_optimizer import MetaPromptOptimizer, EvolutionaryOptimizer, FewShotBayesianOptimizer
from opik_optimizer.mipro_optimizer import MiproOptimizer

# Constructor settings unpacked into the Meta-Prompt, Evolutionary and
# Few-Shot Bayesian optimizers; the MIPRO optimizer reads only the model name.
common_opt_params = dict(
    model='openai/gpt-4o-mini',
    temperature=0.5,
    verbose=1,
    n_threads=5,
    max_tokens=1024,
    # seed=47,
)

## Meta Prompt Optimizer
Mainly used for rewording the prompt; not really recommended for complex tasks such as agentic workflows or prompts with few-shot examples.

In [8]:
# Meta-prompt optimizer: the shared settings plus a separate reasoning model.
# The saved run log reports "round 1 of 3" with 3 candidate prompts generated
# per round, matching max_rounds and num_prompts_per_round.
meta_optimizer = MetaPromptOptimizer(
    **common_opt_params,
    reasoning_model='openai/gpt-4o',
    max_rounds = 3,
    num_prompts_per_round=3,
)

## Evolutionary Optimizer
The main use case is optimizing for complex evaluations or a group of evaluations. It uses a much higher number of mutations to explore as many prompt combinations as possible.

In [9]:
# Evolutionary optimizer: the shared settings plus the genetic-algorithm knobs below.
# NOTE(review): elitism_size (3) equals population_size (3) and tournament_size (4)
# exceeds it — check that this still leaves room for new offspring each generation.
evo_optimizer = EvolutionaryOptimizer(
    **common_opt_params,
    population_size=3,                 # Number of prompts in each generation
    num_generations=2,                 # Number of iterations the algorithm will run
    mutation_rate=0.2,                  # Probability of mutating an individual
    crossover_rate=0.8,                 # Probability of crossing over two individuals
    tournament_size=4,                  # Size of the tournament for selection (if not MOO)
    elitism_size=3,                     # Number of best individuals to carry over (if not MOO)
    adaptive_mutation=True,             # presumably adjusts the mutation rate during the run — confirm
    enable_llm_crossover=True           # presumably lets an LLM perform the crossover step — confirm
)

## FewShot Bayesian Optimizer
Used when prompts should contain few-shot examples. It can burn through tokens very easily.

In [10]:
# Few-shot Bayesian optimizer configured with the shared settings only.
bayes_optimizer = FewShotBayesianOptimizer(
    **common_opt_params
)

## MIPRO Optimizer
The main use case is complex multi-step reasoning or tool use.

In [11]:
# MIPRO optimizer: only the model name is taken from the shared settings
# (temperature, verbose, n_threads and max_tokens are not passed here).
mipro_optimizer = MiproOptimizer(
    model = common_opt_params['model'],
    project_name="fi"
)

## Final Optimize Prompt Run

In [12]:
# Keyword arguments unpacked into every optimizer's optimize_prompt call.
common_opt_run_params = dict(
    prompt=initial_prompt,
    dataset=dataset,
    metric=fi_eval,
    n_samples=7,
)

In [13]:
# Optimizers to run, in this order; mipro_optimizer is not included.
optims = [meta_optimizer, evo_optimizer, bayes_optimizer]
# Notes from earlier runs:
# - The evolutionary optimizer takes far longer than the others; it is not
#   really recommended here and should be run separately.
# - The few-shot Bayesian optimizer results in hallucinations.

In [14]:
from opik_optimizer import TaskConfig

# Optimization results, one entry per optimizer that finishes.
results = []

# Run each optimizer in turn with the shared run arguments. A result is
# appended only after optimize_prompt returns, so an interrupted run leaves
# `results` holding just the optimizers that completed.
for optim in optims:
    print(f"Optimizing using {optim.__class__.__name__}")
    # MIPRO is bugged, so its dedicated call (which uses a TaskConfig instead
    # of the shared run arguments) is kept disabled below.
    # if isinstance(optim, MiproOptimizer):
    #     res = optim.optimize_prompt(
    #         dataset=dataset,
    #         metric=fi_eval,
    #         task_config = TaskConfig(
    #             instruction_prompt = common_opt_run_params['prompt'].user,
    #             input_dataset_fields = ['question','context'],
    #             output_dataset_field = 'answer',
    #             use_chat_prompt = True
    #         ),
    #         num_candidates=1,
    #     )
    # else:
    res = optim.optimize_prompt(
        **common_opt_run_params
    )
    results.append(res)

Optimizing using MetaPromptOptimizer
╭────────────────────────────────────────────────────────────────────╮
│ [32m● [0mRunning Opik Evaluation - [34mMetaPromptOptimizer[0m                    │
│                                                                    │
│ -> View optimization details ]8;id=26225;https://www.comet.com/opik/api/v1/session/redirect/optimizations/?optimization_id=019976db-87ce-7da4-815a-2d92dd071486&dataset_id=01997609-de6f-707e-ba5c-3c7756e0eb62&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==\in your Opik dashboard]8;;\                │
╰────────────────────────────────────────────────────────────────────╯


> Let's optimize the prompt:

[2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
[2m│[0m                                                                    [2m│[0m
[2m│[0m  given {context}, answer the {question}                            [2m│[0m
[2m│[0m                                  

Output()

[32m  Baseline score was: 0.8286.[0m

> Starting the optimization run
│
│ - Starting optimization round 1 of 3
│    Generating candidate prompts:
[2m│      Successfully generated 3 candidate prompts[0m
│
│    Evaluating candidate prompt 1:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Using the provided {context}, accurately answer the {question}    [2m│[0m
│         [2m│[0m  by focusing on the main goal or key information. Ensure your      [2m│[0m
│         [2m│[0m  response is concise and directly related to the question.         [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m───────────────────────────────

Output()

[32m│          Evaluation score: 0.9143 (10.34%)[0m
│
│
│    Evaluating candidate prompt 2:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Given the {context}, provide a clear and concise answer to the    [2m│[0m
│         [2m│[0m  {question}. Ensure the answer is directly supported by the        [2m│[0m
│         [2m│[0m  context and highlights the main objectives or themes discussed.   [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│        

Output()

[31m│          Evaluation score: 0.8000 (-3.45%)[0m
│
│
│    Evaluating candidate prompt 3:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Based on the {context}, answer the {question} by summarizing the  [2m│[0m
│         [2m│[0m  key points related to the question. Your answer should be         [2m│[0m
│         [2m│[0m  concise and directly tied to the context provided.                [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│        

Output()

[31m│          Evaluation score: 0.7714 (-6.90%)[0m
│
│
│    Completed optimization round 1 of 3
[32m│    Found a new best performing prompt: 0.9143 (10.34%)[0m
│
│ - Starting optimization round 2 of 3
│    Generating candidate prompts:
[2m│      Successfully generated 3 candidate prompts[0m
│
│    Evaluating candidate prompt 1:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Using the provided {context}, answer the {question} by            [2m│[0m
│         [2m│[0m  identifying the main goal or key information. Ensure your         [2m│[0m
│         [2m│[0m  response is concise, directly related to the question, and        [2m│[0m
│         [2m│[0m  supported by the context.                                         [2m│[0m
│         [2m│[0m                                              

Output()

[31m│          Evaluation score: 0.8286 (-9.38%)[0m
│
│
│    Evaluating candidate prompt 2:
│         [2m╭─[0m[2m system [0m[2m──────────────────────────────────────────────────────────[0m[2m─╮[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m│[0m  Refer to the provided {context} to accurately answer the          [2m│[0m
│         [2m│[0m  {question}. Focus on extracting the main objectives or themes,    [2m│[0m
│         [2m│[0m  ensuring your response is concise and directly tied to the        [2m│[0m
│         [2m│[0m  context.                                                          [2m│[0m
│         [2m│[0m                                                                    [2m│[0m
│         [2m╰────────────────────────────────────────────────────────────────────╯[0m
│         [2m╭─[0m[2m user [0m[2m────────────────────────────────────────────────────────────[0m[2m─╮[0m
│        

Output()

KeyboardInterrupt: 

In [None]:
# Show the summary of every completed optimization.
# NOTE(review): the saved output below reports the metric `cosine_score`, while
# common_opt_run_params passes `fi_eval` — the output looks stale; re-run to refresh.
for res in results:
    res.display()

[33m╔═[0m[33m════════════════════════════════════════════[0m[33m [0m[1;33mOptimization Complete[0m[33m [0m[33m════════════════════════════════════════════[0m[33m═╗[0m
[33m║[0m                                                                                                                 [33m║[0m
[33m║[0m [2mOptimizer:            [0m[2m [0m[1mMetaPromptOptimizer[0m                                                                      [33m║[0m
[33m║[0m [2mModel Used:           [0m[2m [0mopenai/gpt-4o-mini                                                                       [33m║[0m
[33m║[0m [2mMetric Evaluated:     [0m[2m [0m[1mcosine_score[0m                                                                             [33m║[0m
[33m║[0m [2mInitial Score:        [0m[2m [0m0.7537                                                                                   [33m║[0m
[33m║[0m [2mFinal Best Score:     [0m[2m [0m[1;36m0.8593[0m  