# End of week 1 exercise

# Week 1 — LLM Explainer: Cloud vs Local + Judge

This notebook builds a small tool that:
1. Sends the same technical question to a **cloud OpenAI model** and a **local Ollama model**.
2. Streams both explanations for quick comparison.
3. Uses a **third model (a GPT judge)** to score each answer (0–10) and pick a winner.

In [None]:
# imports
import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [None]:
# Model identifiers used throughout the notebook.

MODEL_CLOUD = "gpt-4.1-nano"    # answered through the default OpenAI client
MODEL_LOCAL = "deepseek-r1:8b"  # answered through the local Ollama endpoint
MODEL_JUDGE = "gpt-4.1-mini"    # scores the two answers

In [None]:
# Load environment variables and build the two API clients.

# override=True lets values from the .env file replace variables already set
# in the process environment.
load_dotenv(override=True)

api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check on the key's shape only; it does not contact the API.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Cloud client uses the library defaults; the local client points at an
# OpenAI-compatible endpoint on localhost with a placeholder API key.
client_cloud = OpenAI()
client_local = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

In [None]:
# The task text sent to both models: a short instruction followed by the
# Python snippet to explain. Built line by line; the leading and trailing
# empty entries reproduce the newline at the start and end of the text.
question = "\n".join([
    "",
    "Please explain what this code does and why:",
    "",
    "def make_badge(text):",
    "    width = len(text) + 4",
    '    top_bottom = "*" * width',
    '    middle = f"* {text} *"',
    '    return f"{top_bottom}\\n{middle}\\n{top_bottom}"',
    "",
    'print(make_badge("Golden rule: Do unto others as you would have them do unto you"))',
    "",
])


In [None]:
# System prompt: persona, accuracy rules, and the mandatory self-introduction
# each model must open with. One list entry per line of the final prompt.
system_prompt = "\n".join([
    "You are a senior Python engineer and predictive/generative AI specialist.",
    "Your top priority is factual accuracy.",
    "DO NOT lie, guess, or invent information. DO NOT hallucinate.",
    "If you are unsure about any detail, say explicitly: \"I don't know\" and ask a clarifying question before continuing.",
    "Base your explanations ONLY on the code and context provided.",
    "Explain things didactically and clearly, using small examples when helpful.",
    "",
    "MANDATORY START OF YOUR RESPONSE:",
    "1) Introduce yourself in 1–2 sentences.",
    "2) State the exact LLM model identity you are running as.",
    "3) State your context window size and number of parameters ONLY if you know them with certainty.",
    "   If you do NOT know either with certainty, write exactly: 'not publicly available'.",
    "",
    "After that mandatory intro, proceed with the task.",
])

# User prompt: task instructions followed by the text held in `question`.
user_prompt = "\n".join([
    "First follow the mandatory intro from the system message.",
    "Then explain the following Python code in a simple, step-by-step way.",
    "Add a short comment for EACH line explaining what it does.",
    "Do not add new functionality or rewrite the code unless I explicitly ask.",
    "",
    "Code to explain:",
    f"{question}",
])

# The same message list is sent to every model being compared.
messages = [
    {"role": role, "content": content}
    for role, content in (("system", system_prompt), ("user", user_prompt))
]


In [None]:
def stream_answer(API, modelo, mensaje):
    """
    Stream a chat completion and render it live as Markdown.

    Args:
        API: Client object exposing ``chat.completions.create``.
        modelo: Name of the model to query.
        mensaje: List of chat messages to send.

    Returns:
        None. The accumulated text is only displayed, re-rendered in a single
        display area each time new text arrives.
    """
    stream = API.chat.completions.create(
        model=modelo,
        messages=mensaje,
        stream=True,
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A chunk may arrive with an empty `choices` list (for example a
        # usage-only chunk); indexing choices[0] on it would raise IndexError.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        if not delta_text:
            # No new text in this chunk, so re-rendering would be redundant.
            continue
        response += delta_text
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:

# Non-streaming helper: it hands back one complete string, which is what the
# judge needs as input.

def get_full_answer(client, model_name, messages):
    """
    Request a single chat completion without streaming and return its text.

    Args:
        client: Client object exposing ``chat.completions.create``.
        model_name: Name of the model to query.
        messages: Chat messages to send; pass the same list to every model
            for a fair A/B comparison.

    Returns:
        The ``content`` of the first choice's message.
    """
    completion = client.chat.completions.create(model=model_name, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content



In [None]:

def judge_answers(client_judge, judge_model, question,
                  answer_a, answer_b,
                  model_a_name, model_b_name):
    """
    Ask a judge model to score two answers (0–10) and pick a winner.

    Args:
        client_judge: Client object exposing ``chat.completions.create``.
        judge_model: Name of the model acting as judge.
        question: The original question both answers respond to.
        answer_a: Full text of answer A.
        answer_b: Full text of answer B.
        model_a_name: Name of the model that produced answer A.
        model_b_name: Name of the model that produced answer B.

    Returns:
        dict parsed from the judge's JSON reply. ``model_A`` and ``model_B``
        are always set from ``model_a_name`` / ``model_b_name`` (not from
        whatever the judge echoed back); the remaining keys (``score_A``,
        ``score_B``, ``winner``, ``reason``) are whatever the judge returned.

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
        ValueError: If the reply is valid JSON but not a JSON object.
    """

    judge_system_prompt = (
        "You are an impartial judge evaluating two LLM answers.\n"
        "Score each answer from 0 to 10 based on:\n"
        "1) Factual correctness (no invented info)\n"
        "2) Didactic clarity\n"
        "3) Completeness of the answer\n"
        "4) Coherence and structure\n"
        "Return ONLY valid JSON."
    )

    # We explicitly tell the judge which model produced each answer.
    # The judge should not change these names; they are metadata.
    judge_user_prompt = f"""
Original question:
{question}

Answer A (model: {model_a_name}):
{answer_a}

Answer B (model: {model_b_name}):
{answer_b}

Evaluate both answers and respond with JSON EXACTLY in this schema:
{{
  "model_A": "{model_a_name}",
  "model_B": "{model_b_name}",
  "score_A": <number 0-10>,
  "score_B": <number 0-10>,
  "winner": "A" or "B" or "tie",
  "reason": "brief concrete explanation citing criteria"
}}
"""

    response = client_judge.chat.completions.create(
        model=judge_model,
        messages=[
            {"role": "system", "content": judge_system_prompt},
            {"role": "user", "content": judge_user_prompt}
        ],
        response_format={"type": "json_object"}
    )

    verdict_text = response.choices[0].message.content
    verdict = json.loads(verdict_text)
    if not isinstance(verdict, dict):
        raise ValueError(
            f"Judge reply must be a JSON object, got {type(verdict).__name__}"
        )

    # The names are metadata owned by the caller: overwrite whatever the judge
    # echoed so a paraphrased or missing name can never reach the result.
    verdict["model_A"] = model_a_name
    verdict["model_B"] = model_b_name
    return verdict




In [None]:

# Answer A: stream the cloud model's explanation into this cell.
stream_answer(API=client_cloud, modelo=MODEL_CLOUD, mensaje=messages)

In [None]:

# Answer B: stream the local model's explanation into this cell.
stream_answer(API=client_local, modelo=MODEL_LOCAL, mensaje=messages)

In [None]:


# Fetch one complete, non-streamed answer per model. These are new requests,
# so the text may differ from what the streaming cells displayed.
answer_cloud = get_full_answer(client=client_cloud, model_name=MODEL_CLOUD, messages=messages)
answer_local = get_full_answer(client=client_local, model_name=MODEL_LOCAL, messages=messages)

# The judge runs on the cloud client; A is the cloud answer, B the local one.
verdict = judge_answers(
    client_cloud,
    MODEL_JUDGE,
    question,
    answer_cloud,
    answer_local,
    MODEL_CLOUD,
    MODEL_LOCAL,
)


In [None]:
# Report the verdict: one section per model, then the winner, each followed
# by a blank line.
print("=== MODEL A (cloud) ===", MODEL_CLOUD)
print("=== SCORE A ===", verdict["score_A"], end="\n\n")
print("=== MODEL B (local) ===", MODEL_LOCAL)
print("=== SCORE B ===", verdict["score_B"], end="\n\n")
print("=== WINNER ===", verdict["winner"], end="\n\n")

# Show the judge's reasoning with a line break after each ". " so every
# sentence starts on its own line.
reason = verdict["reason"].strip().replace(". ", ".\n")
print(reason)