<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/test_base_openai_baselne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import userdata
# userdata.get('OPENAI_API_KEY')

In [7]:
import openai
import json
from datetime import datetime
import re
import random
import torch
from transformers import AutoTokenizer, pipeline

# device = 0 if torch.cuda.is_available() else -1

class ModelHandler:
    """Uniform text-generation wrapper over two backends.

    Model names containing "llama" (case-insensitive) are loaded into a local
    Hugging Face ``transformers`` text-generation pipeline; every other name
    is sent to an OpenAI-compatible chat-completions endpoint.
    """

    def __init__(self, model_name, api_key=None):
        """Select and initialise the backend for ``model_name``.

        Args:
            model_name: Model identifier; also decides which backend is used.
            api_key: Optional API key for the OpenAI-compatible backend.
                When omitted, key lookup is left to the OpenAI SDK.
        """
        self.model_name = model_name
        self.api_key = api_key
        # transformers pipeline convention: GPU index 0 when CUDA is available, -1 for CPU.
        self.device = 0 if torch.cuda.is_available() else -1

        if "llama" in model_name.lower():
            self._init_llama()
        else:
            self._init_openai()

    def _init_openai(self):
        """Create the OpenAI-compatible client (DeepInfra endpoint for "grok" models)."""
        self.client = openai.OpenAI(
            # Pass None rather than "" so an empty string is never sent as a
            # credential and the SDK can fall back to its own key lookup
            # (the OPENAI_API_KEY environment variable, per the SDK docs).
            api_key=self.api_key or None,
            base_url="https://api.deepinfra.com/v1/openai" if "grok" in self.model_name else None
        )

    def _init_llama(self):
        """Load the tokenizer and the local text-generation pipeline."""
        # NOTE(review): the tokenizer is always the Llama-3-8B-Instruct one,
        # whatever self.model_name is — confirm this is intended.
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
        self.pipe = pipeline(
            task="text-generation",
            model=self.model_name,
            tokenizer=self.tokenizer,
            device=self.device,
            torch_dtype=torch.bfloat16
        )
        # Reuse EOS as the padding token for the pipeline's tokenizer.
        self.pipe.tokenizer.pad_token = self.pipe.tokenizer.eos_token

    def format_llama_prompt(self, messages):
        """Render chat messages in the Llama 3 instruct prompt format.

        Every message keeps its own role header (system, user or assistant),
        ``<|begin_of_text|>`` is emitted exactly once, and the prompt ends
        with an open assistant header so the model continues as the assistant.

        Args:
            messages: Sequence of dicts with "role" and "content" keys.

        Returns:
            The prompt string to feed to the text-generation pipeline.
        """
        turns = "".join(
            f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
            for msg in messages
        )
        return f"<|begin_of_text|>{turns}<|start_header_id|>assistant<|end_header_id|>\n\n"

    def generate(self, messages, max_tokens=512):
        """Generate one reply for ``messages`` with the configured backend.

        Args:
            messages: Chat history as a list of {"role", "content"} dicts.
            max_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated reply text.
        """
        # Only the OpenAI-compatible backend sets self.client.
        if hasattr(self, 'client'):
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                max_tokens=max_tokens,
                temperature=0.7
            )
            return response.choices[0].message.content
        else:
            prompt = self.format_llama_prompt(messages)
            outputs = self.pipe(prompt, max_new_tokens=max_tokens)
            # The result is sliced as prompt + continuation; drop the echoed prompt.
            return outputs[0]["generated_text"][len(prompt):].strip()

def parse_steps(instructions):
    """Split a numbered instruction text into a list of renumbered steps.

    A step marker is a number followed by a dot and whitespace ("3. "),
    located either at the very start of the text or at the start of a line.
    Text on following lines without a marker stays part of the current step.

    Args:
        instructions: Multi-line string of numbered steps.

    Returns:
        A list of strings "1. <text>", "2. <text>", ... numbered consecutively
        in order of appearance; empty for blank input.
    """
    # Allow the marker at the start of the text as well as after a newline:
    # after strip() the first step has no preceding newline, and would
    # otherwise keep its own number and be rendered as "1. 1. ...".
    pieces = re.split(r'(?:^|\n)\s*\d+\.\s+', instructions.strip())
    # Drop empty pieces before numbering so the numbers never skip.
    cleaned = [piece.strip() for piece in pieces if piece.strip()]
    return [f"{number}. {text}" for number, text in enumerate(cleaned, start=1)]

# def cook_recipe(recipe_data, automated_inputs, model_name="gpt-4-turbo", conversation_type="combined", question_prob=0.0):
#     handler = ModelHandler(model_name)

#     # Define conversation templates
#     conversation_templates = {
#         "combined": {
#             1: [
#                 "Can I use sea salt instead of kosher salt?",
#                 "Why do we let the chicken sit at room temperature?"
#             ],
#             3: [
#                 "Can I substitute rosemary with thyme?",
#                 "What's the purpose of tossing the squash with herbs?"
#             ]
#         },
#         "substitution_questions": {
#             1: ["Can I use sea salt instead of kosher salt?"],
#             3: ["Can I substitute rosemary with thyme?"]
#         },
#         "technique_clarification": {
#             1: ["Why do we let the chicken sit at room temperature?"],
#             3: ["What's the purpose of tossing the squash with herbs?"]
#         }
#     }

#     # Initialize system prompt
#     chat = [{
#         "role": "system",
#         "content": f"""You are a master chef guiding through: {recipe_data['title']}
#         - Begin each step with "STEP: [NUMBER]"
#         - Use metric measurements
#         - End each step with "AWAITING CONFIRMATION"
#         - After final step, say "CONVO-COMPLETE"
#         - Answer questions about specific steps using their numbers"""
#     }]

#     # Process steps
#     steps = parse_steps(recipe_data['instructions'])
#     step_index = 0
#     input_index = 0

#     while step_index < len(steps) and input_index < len(automated_inputs):
#         # Present current step
#         chat.append({
#             "role": "user",
#             "content": f"Present step {step_index+1} clearly: {steps[step_index]}"
#         })

#         # Get model response
#         chef_response = handler.generate(chat)
#         print(f"\nChef: {chef_response}")
#         chat.append({"role": "assistant", "content": chef_response})

#         # Handle user input
#         user_input = automated_inputs[input_index]
#         print(f"\nTrainee: {user_input}")
#         chat.append({"role": "user", "content": user_input})
#         input_index += 1

#         # Inject template questions
#         current_step = step_index + 1
#         if conversation_type in conversation_templates:
#             template = conversation_templates[conversation_type]
#             if current_step in template:
#                 for question in template[current_step]:
#                     if random.random() < question_prob:  # Add this check
#                         print(f"\n[Template Question] Trainee: {question}")

#                     # Get answer
#                     answer = handler.generate(chat, max_tokens=256)
#                     print(f"\nChef: {answer}")
#                     chat.append({"role": "assistant", "content": answer})

#         step_index += 1

#     save_conversation(chat, f"cooking_session_{conversation_type}_{model_name.replace('/', '-')}.json")
#     return chat

def save_conversation(chat_history, filename=None):
    """Dump a chat transcript to a JSON file and report where it went.

    Args:
        chat_history: JSON-serialisable conversation (list of message dicts).
        filename: Destination path. When falsy, a name of the form
            ``cooking_session_<YYYYmmdd_HHMMSS>.json`` is derived from the
            current local time.
    """
    target = filename or f"cooking_session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(target, 'w') as out:
        json.dump(chat_history, out, indent=2)

    print(f"Saved to {target}")

# Recipe used for the demo lesson: a dict with a display title, an ingredient
# list and the numbered instruction text.
recipe = {
    # Read by cook_recipe_llm_managed to build the chef's system prompt.
    "title": "Miso-Butter Roast Chicken With Acorn Squash Panzanella",
    # Free-text ingredient lines; cook_recipe_llm_managed does not read this key.
    "ingredients": [
        '1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more',
        '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage',
        '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature',
        '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper',
        '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)',
        '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces',
        '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced',
        '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour',
        '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine',
        '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt, freshly ground pepper'
    ],
    # Numbered steps ("N. text"); cook_recipe_llm_managed passes this string to
    # parse_steps, which splits it on those number markers.
    "instructions": """
        1. Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine. Let sit at room temperature 1 hour.

        2. Meanwhile, halve squash and scoop out seeds. Run a vegetable peeler along ridges of squash halves to remove skin. Cut each half into ½"-thick wedges; arrange on a rimmed baking sheet.

        3. Combine sage, rosemary, and 6 Tbsp. melted butter in a large bowl; pour half of mixture over squash on baking sheet. Sprinkle squash with allspice, red pepper flakes, and ½ tsp. salt and season with black pepper; toss to coat.

        4. Add bread, apples, oil, and ¼ tsp. salt to remaining herb butter in bowl; season with black pepper and toss to combine. Set aside.
        Place onion and vinegar in a small bowl; season with salt and toss to coat. Let sit, tossing occasionally, until ready to serve.

        5. Place a rack in middle and lower third of oven; preheat to 425°F. Mix miso and 3 Tbsp. room-temperature butter in a small bowl until smooth. Pat chicken dry with paper towels, then rub or brush all over with miso butter. Place chicken in a large cast-iron skillet and roast on middle rack until an instant-read thermometer inserted into the thickest part of breast registers 155°F, 50–60 minutes. (Temperature will climb to 165°F while chicken rests.) Let chicken rest in skillet at least 5 minutes, then transfer to a plate; reserve skillet.

        6. Meanwhile, roast squash on lower rack until mostly tender, about 25 minutes. Remove from oven and scatter reserved bread mixture over, spreading into as even a layer as you can manage. Return to oven and roast until bread is golden brown and crisp and apples are tender, about 15 minutes. Remove from oven, drain pickled onions, and toss to combine. Transfer to a serving dish.

        7. Using your fingers, mash flour and butter in a small bowl to combine.

        8. Set reserved skillet with chicken drippings over medium heat. You should have about ¼ cup, but a little over or under is all good. (If you have significantly more, drain off and set excess aside.) Add wine and cook, stirring often and scraping up any browned bits with a wooden spoon, until bits are loosened and wine is reduced by about half (you should be able to smell the wine), about 2 minutes. Add butter mixture; cook, stirring often, until a smooth paste forms, about 2 minutes. Add broth and any reserved drippings and cook, stirring constantly, until combined and thickened, 6–8 minutes. Remove from heat and stir in miso. Taste and season with salt and black pepper.

        9. Serve chicken with gravy and squash panzanella alongside.
    """
}

# # Automated inputs
# automated_inputs = [
#     "Got it! Ready for step 1.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "Done Continue.",
#     "exit"
# ]

def generate_llm_response(handler, role, chat, prompt, max_tokens=256):
    """Record a prompt in the chat, generate a reply, and record that too.

    ``chat`` is mutated in place: the prompt is appended under ``role`` and
    the reply under the counterpart role ("assistant" when ``role`` is
    "user", otherwise "user").

    Args:
        handler: Object exposing ``generate(chat, max_tokens=...)``.
        role: Role under which ``prompt`` is stored.
        chat: Mutable list of message dicts.
        prompt: Text of the incoming message.
        max_tokens: Generation limit forwarded to the handler.

    Returns:
        The generated reply text.
    """
    chat.append({"role": role, "content": prompt})
    reply = handler.generate(chat, max_tokens=max_tokens)

    if role == "user":
        reply_role = "assistant"
    else:
        reply_role = "user"

    chat.append({"role": reply_role, "content": reply})
    return reply

def manager_decision(manager_handler, chat, step_index, steps, conversation_state):
    """Ask the manager model which action the lesson should take next.

    The instruction is sent as an extra system message after a copy of the
    chat; ``chat`` itself is not modified.

    Args:
        manager_handler: Object exposing ``generate(messages, max_tokens=...)``.
        chat: Conversation so far (list of message dicts).
        step_index: Zero-based index of the current step.
        steps: All recipe steps; only the count is used.
        conversation_state: State object interpolated verbatim into the prompt.

    Returns:
        The manager's reply, stripped of surrounding whitespace and upper-cased.
    """
    progress = f"{step_index+1}/{len(steps)}"
    instruction_parts = [
        "You are the conversation manager for a cooking lesson. ",
        f"Current step: {progress}. ",
        f"Conversation state: {conversation_state}. ",
        "Decide what to do next: ",
        "- Present next step, ",
        "- End the conversation if all steps are done or student is satisfied. ",
        "Reply with one of: PRESENT_STEP, ASK_QUESTION, END.",
    ]
    manager_message = {"role": "system", "content": "".join(instruction_parts)}

    raw_reply = manager_handler.generate(chat + [manager_message], max_tokens=32)
    return raw_reply.strip().upper()

def student_generate(student_handler, chat, step_content):
    """Produce the student's reaction to the step the chef just presented.

    The student persona is supplied as a trailing system message on a copy of
    the chat; ``chat`` itself is left untouched.

    Args:
        student_handler: Object exposing ``generate(messages, max_tokens=...)``.
        chat: Conversation so far (list of message dicts).
        step_content: Text of the step the chef presented, quoted in the prompt.

    Returns:
        Whatever the student handler generates (limited to 128 tokens).
    """
    persona = f"You are a cooking student. The chef just presented: '{step_content}'. "
    guidance = "Respond as a student: If the chef's explanation is clear, reply with a brief confirmation like 'OK' or 'Thanks, I understand.' Only ask a question if you truly need clarification or want to know more.."
    student_message = {"role": "system", "content": persona + guidance}
    return student_handler.generate(chat + [student_message], max_tokens=128)

def cook_recipe_llm_managed(recipe_data, model_names):
    """Run a chef/student/manager cooking lesson between LLMs and save it.

    For every parsed recipe step the chef presents the step, the student asks
    one question, the chef answers, the student confirms (or asks again), and
    the manager decides whether the lesson continues. The transcript is
    written to ``cooking_session_llm_managed.json``.

    Args:
        recipe_data: Dict with a "title" string and an "instructions" string
            of numbered steps.
        model_names: Dict with "chef", "student" and "manager" model names.

    Returns:
        The transcript as a list of {"role", "content"} message dicts.
    """
    chef_handler = ModelHandler(model_names['chef'])
    student_handler = ModelHandler(model_names['student'])
    manager_handler = ModelHandler(model_names['manager'])

    steps = parse_steps(recipe_data['instructions'])
    chat = [{
        "role": "system",
        "content": (
            f"You are a master chef guiding through: {recipe_data['title']}\n"
            "- Begin each step with 'STEP: [NUMBER]'\n"
            "- Use metric measurements\n"
            "- Answer questions about specific steps using their numbers"
        )
    }]

    for step_index, step in enumerate(steps):
        # The presentation request is sent to the chef but not appended to the
        # transcript; only the chef's rendering of the step is recorded.
        step_content = f"Present step {step_index+1} clearly: {step}"
        chef_response = chef_handler.generate(chat + [{"role": "user", "content": step_content}])
        chat.append({"role": "assistant", "content": chef_response})

        # The student asks exactly one question per step before confirming.
        student_prompt = (
            f"You are a curious cooking student. The chef just presented: '{chef_response}'. "
            "Ask a clarifying or substitution question about this step before confirming."
        )
        student_question = student_handler.generate(chat + [{"role": "system", "content": student_prompt}])
        chat.append({"role": "user", "content": student_question})

        # Chef answers
        chef_answer = chef_handler.generate(chat)
        chat.append({"role": "assistant", "content": chef_answer})

        # Student confirms step after question is answered
        student_confirm_prompt = (
            "If the chef's answer is clear, reply with a brief confirmation like 'OK' or 'Understood.' "
            "If not, ask a clarifying question."
        )
        student_confirm = student_handler.generate(chat + [{"role": "system", "content": student_confirm_prompt}])
        chat.append({"role": "user", "content": student_confirm})

        # Manager decides if we should proceed. The offered replies match the
        # branch below: only END stops the lesson, anything else continues.
        manager_prompt = (
            f"You are the conversation manager for a cooking lesson. "
            f"Step: {step_index+1}/{len(steps)}. "
            f"Conversation so far: {chat[-5:]} "
            "Should the lesson move on, or stop here? "
            "Reply with 'CONTINUE' to proceed to the next step or 'END' to finish the lesson."
        )
        # One manager call per step; the reply is kept in a local whose name
        # does not shadow the module-level manager_decision function.
        decision = manager_handler.generate(chat + [{"role": "system", "content": manager_prompt}], max_tokens=16)

        # Case-insensitive whole-word match, so "end" stops the lesson while
        # words that merely contain "END" (e.g. "RECOMMEND") do not.
        if re.search(r"\bEND\b", (decision or "").upper()):
            chat.append({"role": "system", "content": "CONVO-COMPLETE"})
            break

    # NOTE(review): "CONVO-COMPLETE" is only recorded when the manager ends the
    # lesson, not when the steps simply run out — confirm that is intended.
    save_conversation(chat, "cooking_session_llm_managed.json")
    return chat


# Usage: choose a model for each of the three roles, then run the lesson.
# cook_recipe_llm_managed builds one ModelHandler per role from these names,
# saves the transcript to cooking_session_llm_managed.json and returns it.
model_names = {
    "chef": "gpt-4-turbo",
    "student": "gpt-4-turbo",  # or another LLM
    "manager": "gpt-4-turbo"     # or another LLM
}
cook_recipe_llm_managed(recipe, model_names)


Saved to cooking_session_llm_managed.json


[{'role': 'system',
  'content': "You are a master chef guiding through: Miso-Butter Roast Chicken With Acorn Squash Panzanella\n- Begin each step with 'STEP: [NUMBER]'\n- Use metric measurements\n- Answer questions about specific steps using their numbers"},
 {'role': 'assistant',
  'content': 'STEP: 1\nPat the chicken dry with paper towels, then season it evenly all over with 10 grams of salt. Tie the legs together with kitchen twine. Let the chicken sit at room temperature for 1 hour to allow the salt to penetrate and season the meat.'},
 {'role': 'user',
  'content': 'What is the purpose of tying the legs together with kitchen twine in Step 1?'},
 {'role': 'assistant',
  'content': 'In Step 1, tying the legs together with kitchen twine serves a couple of key purposes when roasting a chicken. First, it helps in maintaining a compact shape which allows the chicken to cook more evenly. This prevents the legs from spreading out and cooking faster than the denser breast area, which coul

In [None]:
# # Add to your existing imports
# !pip install sentence_transformers
# from typing import Dict, List
# import pandas as pd
# from sklearn.metrics import accuracy_score, f1_score
# from sentence_transformers import SentenceTransformer, util

# class LLMEvaluator:
#     def __init__(self, baseline_model="gpt-4-turbo", api_key=None):
#         self.baseline_model = baseline_model
#         self.api_key = api_key  # Store the API key
#         self.baseline_conversations = self._load_baseline()
#         # If api_key is provided, set it as the default for the openai library
#         if self.api_key:
#             openai.api_key = self.api_key

#     def _load_baseline(self):
#         """Load pre-generated ChatGPT conversations as baseline"""
#         try:
#             with open(f"cooking_session_combined_{self.baseline_model}.json") as f:
#                 return json.load(f)
#         except FileNotFoundError:
#             raise Exception("Generate baseline conversations first")

#     def evaluate_conversation(self, conversation: List[Dict]) -> Dict:
#         """Evaluate a single conversation using multiple metrics"""
#         # Convert conversation to text format
#         text_convo = [f"{msg['role']}: {msg['content']}" for msg in conversation]

#         return {
#             "num_turns": len(text_convo),
#             "question_ratio": self._question_ratio(text_convo),
#             "step_completion": self._step_completion(text_convo),
#             "semantic_similarity": self._semantic_similarity(text_convo),
#             "fluency_score": self._fluency_score(text_convo),
#             "instruction_following": self._instruction_following(text_convo)
#         }

#     def _question_ratio(self, conversation: List[str]) -> float:
#         learner_msgs = [m for m in conversation if m.startswith("user:")]
#         return sum(1 for msg in learner_msgs if "?" in msg) / len(learner_msgs) if learner_msgs else 0

#     def _step_completion(self, conversation: List[str]) -> float:
#         return 1 if any("CONVO-COMPLETE" in msg for msg in conversation) else 0

#     def _semantic_similarity(self, conversation: List[str]) -> float:
#         # Compare with baseline using sentence transformers
#         baseline_text = [" ".join([msg['content'] for msg in self.baseline_conversations])]
#         eval_text = [" ".join(conversation)]

#         model = SentenceTransformer("all-MiniLM-L6-v2")
#         embeddings = model.encode(baseline_text + eval_text)
#         return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

#     def _fluency_score(self, conversation: List[str]) -> float:
#             # Use GPT-4 as judge
#             prompt = f"""Rate the fluency of this cooking conversation (1-5):

#             {''.join(conversation)}

#             Score: """

#             # Pass the API key to the openai.OpenAI constructor if provided
#             if self.api_key:
#                 client = openai.OpenAI(api_key=self.api_key)
#                 response = client.chat.completions.create(
#                     model="gpt-4",
#                     messages=[{"role": "user", "content": prompt}],
#                     max_tokens=1
#                 )
#             else:
#                 response = openai.chat.completions.create( # Use default API key if not provided
#                     model="gpt-4",
#                     messages=[{"role": "user", "content": prompt}],
#                     max_tokens=1
#                 )
#             return float(response.choices[0].message.content.strip())

#     def _instruction_following(self, conversation: List[str]) -> float:
#         # Use your existing rubric-based evaluation
#         rubric = {
#             "metric_steps": "Does the conversation cover all recipe steps?",
#             "measurements": "Are metric units used consistently?",
#             "qna_quality": "Are questions answered accurately?"
#         }

#         scores = {}
#         for criterion, question in rubric.items():
#             prompt = f"""Does this conversation {question} (Yes/No)?
#             Conversation: {''.join(conversation)}"""

#             response = openai.chat.completions.create(
#                 model="gpt-4",
#                 messages=[{"role": "user", "content": prompt}],
#                 max_tokens=1
#             )
#             scores[criterion] = 1 if "yes" in response.choices[0].message.content.lower() else 0

#         return sum(scores.values()) / len(scores)

# class EvaluationComparator:
#     def __init__(self, evaluator: LLMEvaluator):
#         self.evaluator = evaluator
#         self.results = []

#     def add_model_results(self, model_name: str, conversations: List[List[Dict]]):
#         """Evaluate and store results for a model"""
#         model_scores = []
#         for conv in conversations:
#             score = self.evaluator.evaluate_conversation(conv)
#             score["model"] = model_name
#             model_scores.append(score)

#         self.results.extend(model_scores)

#     def compare_to_baseline(self) -> pd.DataFrame:
#             """Generate comparison report against ChatGPT baseline"""
#             df = pd.DataFrame(self.results)

#             # Calculate deltas from baseline
#             baseline_df = df[df["model"] == self.evaluator.baseline_model]
#             # Exclude the 'model' column from the mean calculation and subtraction
#             comparison = df.groupby("model").mean().drop(columns=['model'], errors='ignore').subtract(baseline_df.drop(columns=['model'], errors='ignore').mean(), axis=1)

#             return comparison[["semantic_similarity", "fluency_score", "instruction_following"]]


In [None]:
# evaluator = LLMEvaluator(api_key="")  # Replace with your actual API key
# comparator = EvaluationComparator(evaluator)

# models_to_compare = ["gpt-4-turbo", "gpt-3.5-turbo"]

# for model in models_to_compare:
#     # Generate conversations
#     conversations = cook_recipe(
#         recipe_data=recipe,
#         automated_inputs=automated_inputs,
#         model_name=model,
#         conversation_type="combined"
#     )

#     # Add to comparator
#     comparator.add_model_results(model, [conversations])

# report = comparator.compare_to_baseline()
# print(report)
