<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/llm_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd

class RecipeEvaluator:
    def __init__(self, reference_path):
        self.ref_steps = self._load_steps(reference_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_steps(self, path):
        with open(path) as f:
            data = json.load(f)
        return [msg['content'].split('AWAITING')[0].strip()
                for msg in data if msg['role'] == 'assistant'
                and 'STEP' in msg['content']]

    def compare_to_gpt4(self, model_path):
        candidate_steps = self._load_steps(model_path)

        # Semantic similarity
        ref_emb = self.model.encode(self.ref_steps)
        can_emb = self.model.encode(candidate_steps)
        semantic_sim = util.cos_sim(ref_emb, can_emb).diagonal().mean().item()

        # Step alignment score
        alignment = sum(1 for ref, can in zip(self.ref_steps, candidate_steps)
                       if key_phrases_match(ref, can)) / len(self.ref_steps)

        return {
            'model': model_path.split('_')[-1].split('.')[0],
            'semantic_similarity': round(semantic_sim, 3),
            'step_alignment': round(alignment, 3),
            'missing_steps': len(self.ref_steps) - len(candidate_steps)
        }

def key_phrases_match(ref, can):
    key_terms = ['salt', 'butter', 'oven', 'roast', 'temperature']
    return sum(1 for term in key_terms if term in ref.lower() and term in can.lower()) >= 3

# Usage
evaluator = RecipeEvaluator('cooking_session_combined_gpt-4-turbo.json')
results = []
for model in [
    'cooking_session_combined_meta-llama-Meta-Llama-3-8B-Instruct.json',
    'cooking_session_combined_gpt-3.5-turbo.json'
]:
    results.append(evaluator.compare_to_gpt4(model))

print(pd.DataFrame(results).to_markdown(index=False))


from sklearn.metrics import precision_score, recall_score
import numpy as np

class ExtendedRecipeEvaluator(RecipeEvaluator):
    def __init__(self, reference_path):
        super().__init__(reference_path)
        self.ingredients = {'chicken', 'squash', 'miso', 'butter', 'sage',
                           'rosemary', 'allspice', 'pepper flakes', 'apple'}

    def compare_to_gpt4(self, model_path):
        base_metrics = super().compare_to_gpt4(model_path)
        candidate_steps = self._load_steps(model_path)

        # Precision/Recall for critical cooking elements
        temp_scores = self._temperature_validation(candidate_steps)
        tool_scores = self._tool_correctness(candidate_steps)

        # Hallucination detection
        hallucination = self._detect_ingredient_hallucination(candidate_steps)

        return {
            **base_metrics,
            'temp_accuracy': temp_scores,
            'tool_precision': tool_scores,
            'ingredient_recall': self._ingredient_recall(candidate_steps),
            'hallucination_rate': hallucination,
            'temporal_consistency': self._step_order_validation(candidate_steps)
        }

    def _temperature_validation(self, steps):
        ref_temps = [220, 68, 74]  # Celsius values from GPT-4 steps
        detected = []
        for step in steps:
            if '°C' in step:
                detected.extend([int(s) for s in step.split() if s.isdigit()])
        return len(set(detected) & set(ref_temps)) / len(ref_temps)

    def _tool_correctness(self, steps):
        tools = {'oven', 'skillet', 'peeler', 'thermometer', 'whisk'}
        present = [tool for step in steps
                  for tool in tools if tool in step.lower()]
        return len(set(present)) / len(tools)

    def _ingredient_recall(self, steps):
        present = [ing for step in steps
                  for ing in self.ingredients if ing in step.lower()]
        return len(set(present)) / len(self.ingredients)

    def _detect_ingredient_hallucination(self, steps):
        foreign_ings = set()
        for step in steps:
            words = set(step.lower().split())
            foreign_ings.update(words - self.ingredients)
        return len(foreign_ings) / len(steps)

    def _step_order_validation(self, steps):
        key_verbs = ['pat', 'preheat', 'roast', 'toss', 'serve']
        order_score = sum(1 for gen, ref in zip(steps, self.ref_steps)
                         if any(v in gen.lower() for v in key_verbs))
        return order_score / len(key_verbs)


| model                               |   semantic_similarity |   step_alignment |   missing_steps |
|:------------------------------------|----------------------:|-----------------:|----------------:|
| meta-llama-Meta-Llama-3-8B-Instruct |                 0.774 |            0.091 |               0 |
| gpt-3                               |                 0.771 |            0.091 |               3 |


In [9]:
!pip install levenshtein
import json
import re
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from Levenshtein import ratio as levenshtein_ratio
from sklearn.metrics import precision_score

class RecipeComparator:
    def __init__(self, gpt4_path):
        self.ref_steps = self._load_steps(gpt4_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.ingredients = {
            'chicken', 'squash', 'miso', 'butter', 'sage',
            'rosemary', 'allspice', 'pepper flakes', 'apple'
        }
        self.tools = {'oven', 'skillet', 'peeler', 'thermometer', 'whisk'}
        self.temps = {220, 68, 74}  # Celsius temps from GPT-4 steps

    def _load_steps(self, path):
        with open(path) as f:
            data = json.load(f)
        return [msg['content'].split('AWAITING')[0].strip()
               for msg in data if msg['role'] == 'assistant'
               and 'STEP' in msg['content']]

    def compare_to_gpt4(self, model_path):
        candidate_steps = self._load_steps(model_path)

        return {
            'model': model_path.split('_')[-1].split('.')[0],
            'semantic_similarity': self._semantic_sim(candidate_steps),
            'step_completeness': self._step_completeness(candidate_steps),
            'temp_accuracy': self._temp_validation(candidate_steps),
            'ingredient_recall': self._ingredient_recall(candidate_steps),
            'tool_precision': self._tool_check(candidate_steps),
            'measurement_consistency': self._unit_check(candidate_steps),
            'hallucination_score': self._hallucination(candidate_steps)
        }

    def _semantic_sim(self, steps):
        ref_emb = self.model.encode(self.ref_steps)
        can_emb = self.model.encode(steps)
        return util.cos_sim(ref_emb, can_emb).diagonal().mean().item()

    def _step_completeness(self, steps):
        return len(steps)/len(self.ref_steps) if len(steps) <= len(self.ref_steps) else 1.0

    def _temp_validation(self, steps):
        detected = []
        for step in steps:
            if '°C' in step:
                temps = [int(s.replace('°C','').strip())
                        for s in step.split()
                        if s.startswith('°C') and s[1:].isdigit()]
                detected.extend(temps)
        return len(set(detected) & self.temps)/len(self.temps)

    def _ingredient_recall(self, steps):
        present = {ing for step in steps
                  for ing in self.ingredients if ing in step.lower()}
        return len(present)/len(self.ingredients)

    def _tool_check(self, steps):
        present = {tool for step in steps
                  for tool in self.tools if tool in step.lower()}
        return len(present)/len(self.tools)

    def _unit_check(self, steps):
        metric_units = {'g', 'kg', 'ml', 'l', 'tsp', 'tbsp'}
        return sum(1 for step in steps
                  if any(u in step for u in metric_units))/len(steps)

    def _hallucination(self, steps):
        foreign = set()
        for step in steps:
            words = set(re.findall(r'\b\w+\b', step.lower()))
            foreign.update(words - self.ingredients - self.tools)
        return len(foreign)/len(steps)

# Usage
comparator = RecipeComparator('cooking_session_combined_gpt-4-turbo.json')

results = []
for model in [
    'cooking_session_combined_meta-llama-Meta-Llama-3-8B-Instruct.json',
    'cooking_session_combined_gpt-3.5-turbo.json'
]:
    results.append(comparator.compare_to_gpt4(model))

df = pd.DataFrame(results)
print(df.round(2).to_markdown(index=False))


Collecting levenshtein
  Using cached levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, levenshtein
Successfully installed levenshtein-0.27.1 rapidfuzz-3.13.0
| model                               |   semantic_similarity |   step_completeness |   temp_accuracy |   ingredient_recall |   tool_precision |   