In [None]:
# import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image

###############################################################################
# 1) SETUP: MODEL & PROCESSOR
###############################################################################
# Location of the checkpoint; both the processor and the model are loaded
# from this same path.
model_path = "qwen_model"

processor = AutoProcessor.from_pretrained(model_path)

# Loading options kept together in one mapping. device_map="auto" distributes
# the model across GPUs automatically if available; torch_dtype="auto" leaves
# the dtype choice to the library.
_model_load_options = {
    "torch_dtype": "auto",
    "device_map": "auto",
}
model = AutoModelForImageTextToText.from_pretrained(
    model_path, **_model_load_options
)

###############################################################################
# 2) EXAMPLE CASES: TEXT-ONLY CHAIN OF THOUGHT
###############################################################################


# Reference cases shown to the model before the real case. Only the three text
# fields are rendered into the prompt text built below; the image paths are
# stored with each case but are not used by that rendering.
examples = [
    {
        "buyer_image_path": "Buyer_image_1.jpg",
        "seller_image_path": "Seller_image_1.jpg",
        "buyer_review": "Example buyer review",
        "seller_description": "example seller description",
        "expected_output": "example expected output",
    },
    {
        "buyer_image_path": "Buyer_image_2.jpg",
        "seller_image_path": "Seller_image_2.jpg",
        "buyer_review": "Example buyer review",
        "seller_description": "example seller description",
        "expected_output": "example expected output",
    },
    # Add more examples as needed
]

###############################################################################
# 3) BUILD THE EXAMPLE TEXT
###############################################################################

# Horizontal rule that closes every rendered example.
_EXAMPLE_SEPARATOR = "-" * 40


def _render_example(number, case):
    """Return the text block for one example, numbered with *number*.

    The block lists the buyer review, seller description and expected output
    under their headings and ends with the separator line plus a newline.
    """
    lines = [
        f"Example {number}:",
        "Buyer Review:",
        case["buyer_review"],
        "",
        "Seller Description:",
        case["seller_description"],
        "",
        "Expected Output:",
        case["expected_output"],
        _EXAMPLE_SEPARATOR,
        "",  # yields the trailing newline after the separator
    ]
    return "\n".join(lines)


# One rendered block per example, numbered from 1.
example_text_blocks = [
    _render_example(position, case)
    for position, case in enumerate(examples, start=1)
]

# Combine all example blocks into a single string (blank line between blocks).
examples_text_combined = "\n".join(example_text_blocks)

###############################################################################
# 4) ACTUAL CASE INPUT (WITH 2 REAL IMAGES)
###############################################################################
buyer_image_path = "buyerimage.jpg"
seller_image_path = "sellerimage.jpg"


def _load_rgb_resized(path, size=(524, 524)):
    """Open the image at *path*, convert it to RGB and resize it to *size*."""
    opened = Image.open(path)
    rgb = opened.convert("RGB")
    return rgb.resize(size)


try:
    buyer_image = _load_rgb_resized(buyer_image_path)
    seller_image = _load_rgb_resized(seller_image_path)
except Exception as load_error:
    # Report the failure, then let the original exception propagate so the
    # script does not continue without its images.
    print(f"Error loading actual images: {load_error}")
    raise

# Text inputs for the case under analysis: the buyer's review, the seller's
# description, and the final instruction appended to the message.
actual_buyer_review = """ Actual buyer review

"""

actual_seller_description = """Actual seller reveiw

"""

analysis_prompt = "Actual prompt"

###############################################################################
# 5) BUILD THE FINAL MULTI-MODAL MESSAGE
###############################################################################
def _text_part(text):
    """Wrap *text* as a text entry of a chat message's content list."""
    return {"type": "text", "text": text}


def _image_part(image):
    """Wrap *image* as an image entry of a chat message's content list."""
    return {"type": "image", "image": image}


# Opening text: the rendered reference examples followed by the hand-off line
# that introduces the new case.
_intro_text = (
    "Below are multiple chain-of-thought examples for reference:\n\n"
    f"{examples_text_combined}\n"
    "Now, here is the new case to analyze:\n\n"
)

# A single user turn: intro, then each image followed by its text, then the
# analysis instruction.
_user_content = [
    _text_part(_intro_text),
    _image_part(buyer_image),
    _text_part(f"Buyer Review:\n{actual_buyer_review}\n\n"),
    _image_part(seller_image),
    _text_part(f"Seller Description:\n{actual_seller_description}\n\n"),
    _text_part(analysis_prompt),
]

messages = [{"role": "user", "content": _user_content}]

###############################################################################
# 6) CONSTRUCT THE FINAL TEXT & PREPARE MODEL INPUT
###############################################################################
# Render the chat messages to a prompt, run the processor, generate a reply and
# print it. Any failure is reported and then re-raised.
try:
    # Produce the prompt as a string (not token ids); add_generation_prompt
    # appends the marker after which the model writes its reply.
    text_input = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    if text_input is None:
        raise ValueError("Generated text from apply_chat_template is None.")

    # Images are listed in the same order as the image entries in `messages`
    # (buyer first, then seller).
    inputs = processor(
        text=[text_input],
        images=[buyer_image, seller_image],
        return_tensors="pt",
        padding=True,
    )

    # Move the inputs to the device the model itself reports. The model was
    # loaded with device_map="auto", so its placement is decided at load time;
    # following model.device keeps inputs and model together and avoids
    # depending on `torch`, whose import is commented out in this file.
    inputs = inputs.to(model.device)

    ###############################################################################
    # 7) GENERATE OUTPUT (CHAIN OF THOUGHT RESPONSE)
    ###############################################################################
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1500,
        # temperature / top_k / top_p only influence decoding when sampling is
        # enabled, so enable it explicitly instead of relying on whatever the
        # checkpoint's generation config happens to specify.
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.1,
        top_k=50,
        top_p=0.9,
    )

    # generate() returns prompt + continuation for each sequence; drop the
    # prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]

    # Decode the final text
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print("Model's Chain-of-Thought Analysis:\n", output_text[0])

except Exception as e:
    # Report the failure and re-raise, matching the image-loading handler, so
    # a failed run is not mistaken for a successful one.
    print(f"Error during processing or inference: {e}")
    raise
