<a href="https://colab.research.google.com/github/chuahwb/FNB-Imagery-AI-Tool/blob/main/notebooks/mllm_image_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!git clone https://github.com/chuahwb/FNB-Imagery-AI-Tool.git

Cloning into 'FNB-Imagery-AI-Tool'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 32 (delta 10), reused 4 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (32/32), 44.48 KiB | 3.18 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [1]:
# -*- coding: utf-8 -*-
"""
IPython Notebook for Phase 2: Evaluating Multimodal LLMs for F&B Image Recreation

This notebook connects to OpenRouter, processes local images, sends them with
prompts to selected multimodal LLMs, and retrieves structured descriptions
using the 'instructor' library for comparison.
"""

# @title Setup: Install Libraries and Import Modules
# Install necessary libraries
!pip install instructor openai python-dotenv pillow pandas tqdm -q

import os
import base64
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from PIL import Image
from io import BytesIO
from typing import List, Optional
import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import time

# @title Configure API Key and OpenRouter Client

# --- IMPORTANT ---
# Set your OpenRouter API key.
# Option 1: Create a .env file in the same directory as this notebook
#           with the line: OPENROUTER_API_KEY="your-key-here"
# Option 2: Set it as an environment variable in your system.
# Option 3: Replace os.getenv("OPENROUTER_API_KEY") below with your actual key string
#           (less secure, not recommended for shared notebooks).
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

if not OPENROUTER_API_KEY:
    print("⚠️ OpenRouter API Key not found.")
    print("Please set the OPENROUTER_API_KEY environment variable or in a .env file.")
    # You might want to raise an error or use input() here in a real script
    # OPENROUTER_API_KEY = input("Enter your OpenRouter API Key: ")


# Configure the Instructor client to use OpenRouter
# Patch the OpenAI client to add structured response capabilities
client = instructor.patch(
    OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=OPENROUTER_API_KEY,
        default_headers={ # Optional, but good practice for OpenRouter
            "HTTP-Referer": "http://localhost:8888", # Replace with your app URL if deployed
            "X-Title": "F&B Image Eval", # Replace with your app name
        },
        timeout=600 # Increase timeout for potentially long image processing
    ),
    mode=instructor.Mode.MD_JSON # Use Markdown JSON mode for better compatibility
)

print("✅ OpenAI client patched with Instructor and configured for OpenRouter.")

# @title Define Pydantic Model for Structured Description
# This model mirrors the 8 points requested in the prompts

class FnbImageDescription(BaseModel):
    """Structured description of an F&B social media image."""
    primary_subject: str = Field(..., description="Detailed description of the main food, drink, person, or element, including ingredients, preparation, presentation, and actions.")
    composition_framing: str = Field(..., description="Description of layout (e.g., centered, rule of thirds), camera angle (e.g., eye-level, overhead), and framing (e.g., close-up, medium shot).")
    background_environment: str = Field(..., description="Details of the setting, surfaces, other objects, and depth of field (e.g., blurred background).")
    lighting_color: str = Field(..., description="Description of light source, style (e.g., natural, studio), direction, shadows, highlights, dominant colors, and temperature.")
    texture_materials: str = Field(..., description="Specific textures visible (e.g., glossy sauce, crispy batter, smooth ceramic, condensation).")
    text_branding: str = Field(..., description="Accurate transcription of visible text and detailed description of logos or branding elements.")
    mood_atmosphere: str = Field(..., description="Overall feeling conveyed by the image (e.g., cozy, vibrant, elegant, casual).")
    overall_style: str = Field(..., description="Characterization of the image style (e.g., photorealistic, cinematic, flat lay, illustration).")

    # Optional: Add a validator to ensure fields are not empty
    @field_validator('*', mode='before')
    def check_not_empty(cls, value):
        if isinstance(value, str) and not value.strip():
            return "(Not specified)" # Provide a default if empty
        return value

print("✅ Pydantic model 'FnbImageDescription' defined.")

# @title Define Image Handling Function

def encode_image_to_base64(image_path: str, max_size=(1024, 1024)) -> str:
    """Loads an image, resizes if needed, and encodes it to base64."""
    try:
        with Image.open(image_path) as img:
            # Convert image to RGB if it's not (e.g., RGBA, P)
            if img.mode != 'RGB':
                img = img.convert('RGB')

            # Optional: Resize image to prevent exceeding token limits
            # img.thumbnail(max_size) # Uncomment if needed

            buffered = BytesIO()
            img.save(buffered, format="JPEG") # Save as JPEG
            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
            return img_str
    except FileNotFoundError:
        print(f"❌ Error: Image file not found at {image_path}")
        return None
    except Exception as e:
        print(f"❌ Error encoding image {image_path}: {e}")
        return None

print("✅ Image encoding function 'encode_image_to_base64' defined.")

# @title Define Prompt Construction Function

# Baseline Prompt (as defined previously)
BASELINE_PROMPT = """
Analyze the provided F&B image in meticulous detail. Generate a comprehensive description suitable for recreating this exact image using a text-to-image AI. Describe the following elements:

1.  **Primary Subject(s):** Identify and describe the main food, drink, person, or element. Include details like ingredients, preparation style (e.g., grilled, fried, steamed), presentation, specific actions (e.g., pouring, eating).
2.  **Composition & Framing:** Describe the layout (e.g., centered, rule of thirds, asymmetrical), camera angle (e.g., eye-level, overhead shot, low angle, Dutch tilt), and framing (e.g., extreme close-up, close-up, medium shot, full shot, wide shot).
3.  **Background & Environment:** Detail the setting (e.g., restaurant table, kitchen counter, outdoor picnic, abstract background), surfaces (e.g., wooden table, marble countertop, checkered tablecloth), other objects present (e.g., cutlery, napkins, other dishes, decor), and depth of field (e.g., sharp focus on subject with heavily blurred background, deep focus with everything sharp).
4.  **Lighting & Color:** Describe the light source and style (e.g., bright natural daylight from window, warm indoor ambient light, dramatic studio flash, soft diffused light), direction of light, presence and softness of shadows, highlights, dominant color palette, and overall color temperature (e.g., warm tones, cool tones, vibrant, muted).
5.  **Texture & Materials:** Mention specific textures visible (e.g., glossy sauce, crispy batter, fluffy bread, smooth ceramic plate, rough wooden board, condensation on glass, metallic sheen of cutlery).
6.  **Text & Branding:** Accurately transcribe any visible text (e.g., on packaging, menus, signs). Describe any logos, specific brand colors used prominently, or recognizable branding elements in detail.
7.  **Mood & Atmosphere:** Describe the overall feeling conveyed by the image (e.g., cozy and intimate, bright and energetic, rustic and homely, elegant and sophisticated, casual and fun, busy and dynamic).
8.  **Overall Style:** Characterize the image style (e.g., photorealistic, cinematic, food photography style, candid shot, flat lay, vector illustration, graphic design with photo elements).
"""

# Category-Specific Emphasis (as defined previously)
CATEGORY_EMPHASIS = {
    "Product Shot": "Emphasis for Product Shot: Pay extra attention to the details of the food/drink item itself – texture, color accuracy, freshness indicators (e.g., steam, droplets), plating details, garnishes, and how the lighting highlights the product's appeal. Describe the dishware/glassware precisely.",
    "Lifestyle Shot": "Emphasis for Lifestyle Shot: Focus on the people involved – their expressions, actions, interactions with the product or each other, clothing style, and body language. Describe how the product is integrated into the scene and the overall narrative suggested (e.g., friends enjoying brunch, family dinner, solo coffee break).",
    "Menu Displays": "Emphasis for Menu Display: Prioritize accurate transcription of all visible text, including item names, descriptions, and prices. Describe the menu's layout, typography (font style, size, weight), color scheme, any graphical elements (lines, boxes, icons), and the material/context if it's a physical menu photo (e.g., chalkboard, printed paper, digital screen). Note the overall readability and design style.",
    "Promotional Graphics": "Emphasis for Promotional Graphic: Accurately transcribe all promotional text (offer details, dates, calls to action). Describe the graphic design elements used (e.g., background color/gradient, shapes, icons, font styles). If it combines photos and graphics, describe how they are integrated. Detail the overall visual hierarchy and intended message.",
    "Branding Elements": "Emphasis for Branding Element: Focus intensely on the specific branding element shown (e.g., logo, packaging detail, unique sign). Describe its colors, shapes, typography, and material precisely. Explain its context within the image and how it contributes to the overall brand identity.",
    "Location/Ambience Shots": "Emphasis for Location/Ambience: Describe the key features of the space – decor style (e.g., modern, rustic, industrial), furniture, lighting fixtures, color scheme, materials (wood, brick, metal), sense of space (cozy, spacious), cleanliness, and overall atmosphere it creates for a customer. Mention specific details like wall art, plants, table settings if visible.",
    "Event Promotions": "Emphasis for Event Promotion: Accurately transcribe all event details (name, date, time, location, description, contact info, price). Describe any specific imagery related to the event theme (e.g., musical instruments, wine bottles, specific food). Detail the overall design style of the flyer/poster/graphic and its call to action.",
    "Behind-the-Scenes (BTS)": "Emphasis for BTS: Describe the action taking place (e.g., cooking, plating, ingredient prep, staff interaction). Detail the environment (e.g., kitchen equipment, staff uniforms, raw ingredients) and the sense of activity or focus. Capture the candid, authentic feel typical of BTS shots.",
    "Default": "" # For categories not listed or if no emphasis is needed
}

def construct_prompt(category: Optional[str] = None, use_category_emphasis: bool = False) -> str:
    """Constructs the prompt, optionally adding category-specific emphasis."""
    prompt = BASELINE_PROMPT
    if use_category_emphasis and category:
        emphasis = CATEGORY_EMPHASIS.get(category, CATEGORY_EMPHASIS["Default"])
        if emphasis:
            prompt += "\n\n" + emphasis
    return prompt

print("✅ Prompt construction function 'construct_prompt' defined.")

# @title Define Core Inference Function

def get_structured_description(
    model_name: str,
    image_base64: str,
    prompt: str
) -> Optional[FnbImageDescription]:
    """Sends image and prompt to a model via OpenRouter and gets a structured description."""
    try:
        print(f"   Querying {model_name}...")
        start_time = time.time()

        response = client.chat.completions.create(
            model=model_name,
            response_model=FnbImageDescription,
            max_retries=1, # Retry once on failure
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_base64}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=2048, # Adjust as needed
            temperature=0.1, # Lower temperature for more deterministic descriptions
        )
        end_time = time.time()
        print(f"   ✅ Success for {model_name} in {end_time - start_time:.2f} seconds.")
        return response
    except Exception as e:
        print(f"   ❌ Error querying {model_name}: {e}")
        # Consider more specific error handling (e.g., for API errors, validation errors)
        return None

print("✅ Core inference function 'get_structured_description' defined.")


# @title Define Main Processing Workflow

# --- Configuration ---
# List of OpenRouter model identifiers to test (Update based on Step 1.2 and availability)
# Ensure these models support vision input on OpenRouter
MODELS_TO_TEST = [
    "openai/gpt-4o",
    "anthropic/claude-3.7-sonnet-2025XXXX", # Replace XXXX with actual date string if needed
    "google/gemini-pro-vision", # Or "google/gemini-1.5-pro-latest" if available and preferred
    "meta-llama/llama-3.1-405b-instruct", # Example, check OpenRouter for Llama 4 / 3.2 vision models
    "qwen/qwen-max-longcontext", # Example, check OpenRouter for specific Qwen VL models
    # Add other models here
]

# --- Input Data ---
# List of images to process. Each item is a tuple: (image_id, image_path, category)
# Replace with your actual image paths and categories from Step 1.1
IMAGES_TO_PROCESS = [
    ("prod_001", "path/to/your/product_shot_1.jpg", "Product Shot"),
    ("life_001", "path/to/your/lifestyle_shot_1.png", "Lifestyle Shot"),
    ("menu_001", "path/to/your/menu_display_1.jpg", "Menu Displays"),
    ("promo_001", "path/to/your/promo_graphic_1.jpeg", "Promotional Graphics"),
    # Add all other images from your dataset here...
]

# --- Workflow Execution ---

results_list = []

# Use tqdm for progress bar
for image_id, image_path, category in tqdm(IMAGES_TO_PROCESS, desc="Processing Images"):
    print(f"\nProcessing Image: {image_id} ({category}) - {image_path}")

    # 1. Encode Image
    image_base64 = encode_image_to_base64(image_path)
    if not image_base64:
        print(f"   Skipping image {image_id} due to encoding error.")
        continue

    # 2. Construct Prompt (Choose whether to use category emphasis)
    # Set use_category_emphasis=True to add specific instructions
    use_category_emphasis_flag = False # Or True
    prompt_text = construct_prompt(category, use_category_emphasis=use_category_emphasis_flag)

    # 3. Iterate through models
    for model_name in tqdm(MODELS_TO_TEST, desc=f"  Models for {image_id}", leave=False):
        description_obj = get_structured_description(model_name, image_base64, prompt_text)

        # Store results
        result_data = {
            "Image ID": image_id,
            "Category": category,
            "Model": model_name,
            "Prompt Type": "Category-Specific" if use_category_emphasis_flag and category else "Baseline",
        }

        if description_obj:
            # Add structured fields to the result dictionary
            result_data.update(description_obj.model_dump())
            result_data["Status"] = "Success"
        else:
            # Add empty fields if the description failed
            for field in FnbImageDescription.model_fields:
                 result_data[field] = "ERROR"
            result_data["Status"] = "Error"

        results_list.append(result_data)

print("\n✅ Workflow finished.")

# @title Display Results in a DataFrame

if results_list:
    results_df = pd.DataFrame(results_list)

    # Set display options for better readability
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 150) # Adjust width as needed

    print("\n--- Comparison Results ---")
    # Display the DataFrame - Transpose might be useful for comparing fields across models for ONE image
    # For comparing models across images, the standard view is better.
    display(results_df)

    # Example: To compare descriptions for a specific image ID side-by-side
    # specific_image_id = "prod_001" # Change to the ID you want to inspect
    # display(results_df[results_df["Image ID"] == specific_image_id].set_index('Model').T)

    # --- Optional: Save results to CSV ---
    # results_df.to_csv("fnb_llm_evaluation_results.csv", index=False)
    # print("\n✅ Results saved to fnb_llm_evaluation_results.csv")

else:
    print("\nNo results generated.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.6/345.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m⚠️ OpenRouter API Key not found.
Please set the OPENROUTER_API_KEY environment variable or in a .env file.


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

# New Section