<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/llm_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from google.colab import userdata

# Bind the secret to a name rather than leaving the call as the cell's last
# expression: a bare trailing expression is displayed as the cell result, which
# would write the API key into the saved notebook output.
_openai_api_key = userdata.get('OPENAI_API_KEY')

In [2]:
# !pip install huggingface_hub[hf_xet]
# The pip log for this cell shows `dotenv` pulling in `python-dotenv` as a
# dependency; presumably that is the package behind the later
# `from dotenv import load_dotenv` -- confirm before pinning versions.
!pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0


In [8]:
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import json
import re
import pandas as pd
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from sentence_transformers import SentenceTransformer, util
import os
from dotenv import load_dotenv

# Copy the Colab-stored secret into the process environment; the comparator
# class below refuses to construct unless OPENAI_API_KEY is present in os.environ.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Additionally load variables from a .env file.
# NOTE(review): presumably this does not override the key set above -- confirm
# against the python-dotenv defaults.
load_dotenv()

class EnhancedRecipeComparator:
    """Compare a candidate model's recipe steps against a reference transcript.

    The reference transcript is loaded once at construction time. Each
    candidate transcript is then compared step-by-step (by position) using
    sentence-embedding similarity, term overlap, a few heuristic
    "hallucination" checks and an LLM-as-judge metric.
    """

    # Cosine similarity below which a candidate step is reported as not
    # matching the reference step at the same position.
    SIMILARITY_THRESHOLD = 0.65

    # Tokeniser shared by the term-overlap and foreign-entity checks.
    _WORD_RE = re.compile(r'\b\w+\b')

    # A metric measurement is a number followed by a unit written as a whole
    # token. Matching whole tokens matters: a plain substring test for 'g' or
    # 'l' succeeds on almost every English sentence.
    _MEASUREMENT_RE = re.compile(
        r'\d+(?:[.,]\d+)?\s*'
        r'(?:kg|g|ml|l|tsp|tbsp|°\s*c'
        r'|grams?|kilograms?|millilit(?:er|re)s?|lit(?:er|re)s?'
        r'|teaspoons?|tablespoons?|degrees\s+celsius)\b'
    )

    def __init__(self, gpt4_path):
        """Load the reference steps and set up the embedding model and judge.

        Args:
            gpt4_path: Path to the JSON transcript used as the reference.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
        """
        if 'OPENAI_API_KEY' not in os.environ:
            raise ValueError("Missing OpenAI API key in .env file")

        self.ref_steps = self._load_steps(gpt4_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        self.cooking_metric = GEval(
            name="CookingAccuracy",
            criteria="""
            1. Metric measurement compliance
            2. Thermal parameter accuracy (±3°C)
            3. Technique verb alignment
            4. Ingredient consistency
            """,
            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
        )

    @staticmethod
    def _model_name(model_path):
        """Return the model label encoded after the last ``_`` of the file name.

        Only the final extension is removed, so dotted model names such as
        ``gpt-3.5-turbo`` are kept intact.
        """
        stem, _ext = os.path.splitext(os.path.basename(model_path))
        return stem.split('_')[-1]

    def _load_steps(self, path):
        """Read a transcript and return the text of each assistant step.

        The file is parsed as a JSON list of messages with ``role`` and
        ``content`` keys. Only assistant messages whose content contains
        ``STEP`` are kept, and anything from the first ``AWAITING`` marker
        onward is dropped.
        """
        # Explicit encoding: the transcripts contain non-ASCII text such as '°'.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
        return [msg['content'].split('AWAITING')[0].strip()
                for msg in data
                if msg['role'] == 'assistant' and 'STEP' in msg['content']]

    def compare_to_gpt4(self, model_path):
        """Compute every metric for one candidate transcript.

        Args:
            model_path: Path to the candidate JSON transcript.

        Returns:
            A dict with the keys ``model``, ``semantic_similarity``,
            ``missing_steps``, ``step_alignment``, ``hallucination_score``
            and ``llm_judge_score``.
        """
        candidate_steps = self._load_steps(model_path)

        return {
            'model': self._model_name(model_path),
            'semantic_similarity': self._semantic_sim(candidate_steps),
            'missing_steps': self._identify_missing_steps(candidate_steps),
            'step_alignment': self._step_alignment_analysis(candidate_steps),
            'hallucination_score': self._enhanced_hallucination(candidate_steps),
            'llm_judge_score': self._llm_judge_evaluation(candidate_steps)
        }

    def _semantic_sim(self, steps):
        """Return the mean cosine similarity of position-matched steps.

        Returns 0.0 when either side has no steps to compare.
        """
        if not steps or not self.ref_steps:
            return 0.0
        ref_emb = self.model.encode(self.ref_steps)
        can_emb = self.model.encode(steps)
        # The diagonal of the similarity matrix pairs step i with step i.
        return util.cos_sim(ref_emb, can_emb).diagonal().mean().item()

    def _identify_missing_steps(self, candidate_steps):
        """List reference steps that are absent or poorly matched.

        A reference step with no candidate at the same position is reported
        with candidate ``"MISSING"``; one whose similarity is below
        ``SIMILARITY_THRESHOLD`` is reported together with that similarity.
        """
        missing = []
        for i, ref_step in enumerate(self.ref_steps):
            if i >= len(candidate_steps):
                missing.append({
                    'step': i+1,
                    'reference': ref_step,
                    'candidate': "MISSING"
                })
                continue

            similarity = util.cos_sim(
                self.model.encode(ref_step),
                self.model.encode(candidate_steps[i])
            ).item()
            if similarity < self.SIMILARITY_THRESHOLD:
                missing.append({
                    'step': i+1,
                    'reference': ref_step,
                    'candidate': candidate_steps[i],
                    'similarity': similarity
                })
        return missing

    def _step_alignment_analysis(self, steps):
        """Report shared, missing and extra words for each step pair.

        Words are lower-cased. Each list is sorted so the report is stable
        from run to run.
        """
        report = []
        for idx, (ref, can) in enumerate(zip(self.ref_steps, steps)):
            ref_terms = set(self._WORD_RE.findall(ref.lower()))
            can_terms = set(self._WORD_RE.findall(can.lower()))

            report.append({
                'step': idx+1,
                'shared_terms': sorted(ref_terms & can_terms),
                'missing_terms': sorted(ref_terms - can_terms),
                'extra_terms': sorted(can_terms - ref_terms)
            })
        return report

    def _enhanced_hallucination(self, steps):
        """Average the three heuristic hallucination signals, rounded to 2 dp.

        Returns 0.0 when there are no steps to score.
        """
        if not steps:
            return 0.0
        scores = {
            'foreign_entities': self._foreign_entity_check(steps),
            'contradictions': self._detect_contradictions(steps),
            'unit_errors': 1 - self._validate_measurements(steps)
        }
        return round(sum(scores.values())/len(scores), 2)

    def _foreign_entity_check(self, steps):
        """Return the number of distinct non-whitelisted words per step.

        NOTE(review): this is a count divided by the number of steps, not a
        fraction, so it can exceed 1 and dominate the averaged score.
        """
        if not steps:
            return 0.0
        valid_terms = {'chicken', 'squash', 'miso', 'butter', 'oven', 'salt', 'pepper'}
        foreign = set()
        for step in steps:
            words = set(self._WORD_RE.findall(step.lower()))
            foreign.update(words - valid_terms)
        return len(foreign)/len(steps)

    def _detect_contradictions(self, steps):
        """Return the fraction of steps that mention fry/grill but not roast."""
        if not steps:
            return 0.0
        contradictions = 0
        for step in steps:
            lowered = step.lower()
            if ('fry' in lowered or 'grill' in lowered) and 'roast' not in lowered:
                contradictions += 1
        return contradictions/len(steps)

    def _validate_measurements(self, steps):
        """Return the fraction of steps containing a metric measurement.

        A measurement is a number followed by a metric unit (abbreviated or
        spelled out) written as a whole token, e.g. ``10 g``, ``10 grams``,
        ``220°C``. Returns 0.0 for an empty step list.
        """
        if not steps:
            return 0.0
        hits = sum(1 for step in steps
                   if self._MEASUREMENT_RE.search(step.lower()))
        return hits/len(steps)

    def _llm_judge_evaluation(self, steps):
        """Score each position-matched step pair with the judge metric.

        Returns the mean score rounded to 2 dp, or 0.0 when there are no
        pairs to judge.
        """
        scores = []
        for ref_step, candidate_step in zip(self.ref_steps, steps):
            test_case = LLMTestCase(
                input="Recipe evaluation",
                actual_output=candidate_step,
                expected_output=ref_step,
                context=[ref_step]
            )
            self.cooking_metric.measure(test_case)
            scores.append(self.cooking_metric.score)
        if not scores:
            return 0.0
        return round(sum(scores)/len(scores), 2)

    def generate_report(self, model_path):
        """Print the missing-step, similarity and alignment reports.

        Only the analyses that are printed are computed here. Going through
        ``compare_to_gpt4`` would also re-run the LLM judge for every step
        and discard the result.
        """
        candidate_steps = self._load_steps(model_path)
        missing_steps = self._identify_missing_steps(candidate_steps)
        step_alignment = self._step_alignment_analysis(candidate_steps)

        print(f"\n=== Missing Steps Analysis ===")
        for missing in missing_steps:
            print(f"Step {missing['step']}:")
            print(f"Reference: {missing['reference']}")
            print(f"Candidate: {missing.get('candidate', 'MISSING')}")
            if 'similarity' in missing:
                print(f"Similarity: {missing['similarity']:.2f}")
            print("------------------------")

        print("\n=== Semantic Similarity Justification ===")
        if candidate_steps and self.ref_steps:
            ref_emb = self.model.encode(self.ref_steps)
            can_emb = self.model.encode(candidate_steps)
            similarities = util.cos_sim(ref_emb, can_emb).diagonal()
            # argmax/argmin return tensors; convert so the step number prints
            # as a plain integer rather than "tensor(n)".
            strongest = int(similarities.argmax()) + 1
            weakest = int(similarities.argmin()) + 1
            print(f"Mean Similarity: {similarities.mean().item():.2f}")
            print(f"Strongest Match: Step {strongest} ({similarities.max().item():.2f})")
            print(f"Weakest Match: Step {weakest} ({similarities.min().item():.2f})")
        else:
            print("No comparable steps.")

        print("\n=== Step Alignment Report ===")
        for alignment in step_alignment:
            print(f"Step {alignment['step']}:")
            print(f"Shared Terms: {', '.join(alignment['shared_terms'])}")
            print(f"Missing Terms: {', '.join(alignment['missing_terms'])}")
            print(f"Extra Terms: {', '.join(alignment['extra_terms'])}")
            print("------------------------")

# Usage: score each candidate transcript against the GPT-4 Turbo reference,
# print a per-model report, then tabulate the scalar metrics.
comparator = EnhancedRecipeComparator('cooking_session_combined_gpt-4-turbo.json')

candidate_files = (
    'cooking_session_combined_meta-llama-Meta-Llama-3-8B-Instruct.json',
    'cooking_session_combined_gpt-3.5-turbo.json',
)

results = []
for model in candidate_files:
    print(f"\nAnalyzing {model}...")
    results.append(comparator.compare_to_gpt4(model))
    comparator.generate_report(model)

df = pd.DataFrame(results)
print("\nFinal Comparison:")
# The two list-valued columns are left out of the summary table.
summary = df.drop(columns=['missing_steps', 'step_alignment']).round(2)
print(summary.to_markdown(index=False))



Analyzing cooking_session_combined_meta-llama-Meta-Llama-3-8B-Instruct.json...


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()


=== Missing Steps Analysis ===
Step 2:
Reference: STEP: 1
Pat the chicken dry with paper towels. Season it evenly all over with 10 grams of salt. Tie the legs together with kitchen twine. Let the chicken sit at room temperature for 1 hour to allow the seasoning to penetrate and the meat to temper, ensuring more even cooking.
Candidate: assistant

Let's proceed to Step 2:

**STEP 2:**
Preheat the oven to 425°F (220°C). In a small saucepan, melt 2 tablespoons of unsalted butter over medium heat. Add 2 cloves of minced garlic and cook, stirring occasionally, until fragrant, about 1-2 minutes. Remove from heat and stir in 2 tablespoons of white miso paste until smooth.
Similarity: 0.37
------------------------
Step 5:
Reference: STEP: 4
Position a rack in the middle of your oven and preheat it to 220 degrees Celsius. This will prepare the oven for roasting both the chicken and the squash, ensuring optimal heat distribution for even cooking.
Candidate: assistant

Here is Step 4:

**STEP 4:

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()


=== Missing Steps Analysis ===
Step 2:
Reference: STEP: 1
Pat the chicken dry with paper towels. Season it evenly all over with 10 grams of salt. Tie the legs together with kitchen twine. Let the chicken sit at room temperature for 1 hour to allow the seasoning to penetrate and the meat to temper, ensuring more even cooking.
Candidate: STEP: 2
Preheat the oven to 200°C. Cut the acorn squash into 2cm cubes and place on a baking sheet. Drizzle with olive oil, season with salt and pepper, and roast in the oven for 25-30 minutes until tender.
Similarity: 0.22
------------------------
Step 5:
Reference: STEP: 4
Position a rack in the middle of your oven and preheat it to 220 degrees Celsius. This will prepare the oven for roasting both the chicken and the squash, ensuring optimal heat distribution for even cooking.
Candidate: STEP: 4
Place the chicken on a wire rack set inside a rimmed baking sheet. Rub the remaining herb butter mixture all over the chicken, including under the skin. Roast