<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f3cf39a94e6a2de6534e1cd33f528f090a49fdaebc98b80fd17aadcf77a24779
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [26]:
from typing import List, Dict
from sentence_transformers import SentenceTransformer, util
import evaluate  # Requires pip install evaluate
import json
import torch # Import torch for cosine similarity calculation
import re

class RecipeConversationEvaluator:
    def __init__(self, reference_recipe: Dict):
        self.reference_steps = reference_recipe['instructions']
        self.ingredients = reference_recipe['ingredients']
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.bleu = evaluate.load("bleu")
        self.rouge = evaluate.load("rouge")

    def evaluate_conversation(self, dialogue: List[Dict]):
        chef_messages = [m['content'] for m in dialogue if m['role'] == 'chef']
        return {
            'ingredient_coverage': self._calculate_ingredient_coverage(chef_messages),
            'step_accuracy': self._measure_step_accuracy(chef_messages),
            'question_handling': self._assess_question_handling(dialogue),
            # Assuming _calculate_semantic_similarity for the overall conversation
            # will be implemented or removed if not needed here.
            # For now, let's remove it from here if its purpose isn't clear
            # or add a basic implementation. Let's add a basic one for messages vs messages.
            'semantic_similarity': self._calculate_semantic_similarity(chef_messages, chef_messages) # Example usage, needs refinement based on desired metric
        }

    # Move the helper methods inside the class
    def _calculate_ingredient_coverage(self, messages: List[str]):
      # Extract core ingredients (remove quantities/units)
      core_ingredients = set()
      for ingredient in self.ingredients:
          # Remove measurements using regex
          base = re.sub(r'^[\d¼½¾/]+[^a-zA-Z]*', '', ingredient, flags=re.IGNORECASE)
          base = re.sub(r'\(.*?\)', '', base).strip().lower()  # Remove parentheticals
          core_ingredients.add(base)

      mentioned = 0
      for core in core_ingredients:
          if any(core in msg.lower() for msg in messages):
              mentioned += 1

      return mentioned / len(core_ingredients) if core_ingredients else 0


    def _measure_step_accuracy(self, messages: List[str]):
        # Ensure the number of chef messages matches the number of reference steps
        # or implement a different logic if they can differ.
        # For now, assuming a direct 1-to-1 correspondence or partial evaluation.
        # Let's handle the case where message count is different from step count.
        # We will compare each chef message to the most semantically similar reference step.
        if not messages or not self.reference_steps:
            return 0.0 # Return 0 if there are no messages or no reference steps

        step_similarities = []
        ref_embeddings = self.model.encode(self.reference_steps)
        msg_embeddings = self.model.encode(messages)

        # Calculate similarity between each message and each reference step
        cosine_scores = util.pytorch_cos_sim(msg_embeddings, ref_embeddings)

        # For each message, find the maximum similarity to any reference step
        max_similarities_per_message = torch.max(cosine_scores, dim=1).values.tolist()

        # Average the maximum similarities
        return sum(max_similarities_per_message) / len(max_similarities_per_message)


    def _assess_question_handling(self, dialogue: List[Dict]):
        questions = [m['content'] for m in dialogue
                    if m['role'] == 'trainee' and '?' in m['content']]
        # Ensure responses are paired correctly with questions.
        # This logic assumes an immediate response after each question.
        # A more robust approach might track conversation flow explicitly.
        responses = []
        for i, m in enumerate(dialogue[:-1]):
            if m['role'] == 'trainee' and '?' in m['content']:
                # Check if the next message exists and is a response
                if i + 1 < len(dialogue):
                    responses.append(dialogue[i+1]['content'])

        # Ensure the number of questions and responses are consistent for paired evaluation
        # If not assuming 1-to-1, adjust metric calculation
        min_len = min(len(questions), len(responses))
        questions_paired = questions[:min_len]
        responses_paired = responses[:min_len]

        return {
            'question_response_ratio': len(responses)/len(questions) if questions else 0,
            # Calculate semantic similarity only for paired questions and responses
            'answer_relevance': self._calculate_semantic_similarity(responses_paired, questions_paired) if questions_paired else 0.0
        }

    # Define the _calculate_semantic_similarity method
    # This method likely calculates the similarity between two lists of strings.
    # A simple implementation could be averaging pairwise similarities.
    def _calculate_semantic_similarity(self, list1: List[str], list2: List[str]):
        if not list1 or not list2:
            return 0.0 # Return 0 if either list is empty

        # Ensure lists have the same length for pairwise comparison, or adjust logic
        min_len = min(len(list1), len(list2))
        list1 = list1[:min_len]
        list2 = list2[:min_len]

        embeddings1 = self.model.encode(list1)
        embeddings2 = self.model.encode(list2)

        # Calculate pairwise cosine similarities
        cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

        # For paired lists, we average the diagonal
        # If lists can be different lengths or non-paired, the logic needs adjustment
        # Assuming paired lists for 'answer_relevance' based on the calling context
        # For 'semantic_similarity' in evaluate_conversation, the interpretation is less clear.
        # Let's assume pairwise average for this helper method for now.
        # If len(list1) == len(list2) and they are meant to be paired:
        pairwise_similarities = [cosine_scores[i][i].item() for i in range(min_len)]

        # If the lists are not necessarily paired 1-to-1, we might average all scores
        # or find the max similarity for each item in list1 against all of list2.
        # Based on how it's called in _assess_question_handling, pairwise seems intended.
        # For the call in evaluate_conversation with chef_messages vs chef_messages,
        # this might not be the desired metric. Let's assume pairwise average is the
        # general helper and the caller needs to provide appropriately paired lists.
        if not pairwise_similarities:
            return 0.0

        return sum(pairwise_similarities) / len(pairwise_similarities)

In [27]:
# Example usage
reference_recipe = {
    "title": "Miso-Butter Roast Chicken With Acorn Squash Panzanella",
    "ingredients": [
        '1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more',
        '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage',
        '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature',
        '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper',
        '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)',
        '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces',
        '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced',
        '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour',
        '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine',
        '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt, freshly ground pepper'
    ],
    "instructions": """
        1. Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine. Let sit at room temperature 1 hour.

        2. Meanwhile, halve squash and scoop out seeds. Run a vegetable peeler along ridges of squash halves to remove skin. Cut each half into ½""-thick wedges; arrange on a rimmed baking sheet.

        3. Combine sage, rosemary, and 6 Tbsp. melted butter in a large bowl; pour half of mixture over squash on baking sheet. Sprinkle squash with allspice, red pepper flakes, and ½ tsp. salt and season with black pepper; toss to coat.

        4. Add bread, apples, oil, and ¼ tsp. salt to remaining herb butter in bowl; season with black pepper and toss to combine. Set aside.
        Place onion and vinegar in a small bowl; season with salt and toss to coat. Let sit, tossing occasionally, until ready to serve.

        5. Place a rack in middle and lower third of oven; preheat to 425°F. Mix miso and 3 Tbsp. room-temperature butter in a small bowl until smooth. Pat chicken dry with paper towels, then rub or brush all over with miso butter. Place chicken in a large cast-iron skillet and roast on middle rack until an instant-read thermometer inserted into the thickest part of breast registers 155°F, 50–60 minutes. (Temperature will climb to 165°F while chicken rests.) Let chicken rest in skillet at least 5 minutes, then transfer to a plate; reserve skillet.

        6. Meanwhile, roast squash on lower rack until mostly tender, about 25 minutes. Remove from oven and scatter reserved bread mixture over, spreading into as even a layer as you can manage. Return to oven and roast until bread is golden brown and crisp and apples are tender, about 15 minutes. Remove from oven, drain pickled onions, and toss to combine. Transfer to a serving dish.

        7. Using your fingers, mash flour and butter in a small bowl to combine.

        8. Set reserved skillet with chicken drippings over medium heat. You should have about ¼ cup, but a little over or under is all good. (If you have significantly more, drain off and set excess aside.) Add wine and cook, stirring often and scraping up any browned bits with a wooden spoon, until bits are loosened and wine is reduced by about half (you should be able to smell the wine), about 2 minutes. Add butter mixture; cook, stirring often, until a smooth paste forms, about 2 minutes. Add broth and any reserved drippings and cook, stirring constantly, until combined and thickened, 6–8 minutes. Remove from heat and stir in miso. Taste and season with salt and black pepper.

        9. Serve chicken with gravy and squash panzanella alongside.
            """
}

with open("/content/llm_conversation.json", "r") as f:
    conversation_data = json.load(f)

evaluator = RecipeConversationEvaluator(reference_recipe)
results = evaluator.evaluate_conversation(conversation_data)
print(results)

{'ingredient_coverage': 0.047619047619047616, 'step_accuracy': 0.3325924141332507, 'question_handling': {'question_response_ratio': 1.0, 'answer_relevance': 0.6048608933176313}, 'semantic_similarity': 1.0}
