In [1]:
!pip install -U transformers



In [2]:
!pip install -U bitsandbytes



In [4]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image

# Release unused cached GPU memory before the model weights are loaded.
torch.cuda.empty_cache()

# Load the model onto the GPU and the processor from the same checkpoint id.
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit"
model = AutoModelForVision2Seq.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_id)

# Safe stop-token setup: collect every stop id the tokenizer actually defines.
eos_id = processor.tokenizer.eos_token_id
im_end_id = processor.tokenizer.convert_tokens_to_ids("<|im_end|>")
stop_ids = list(filter(lambda tok: tok is not None, (eos_id, im_end_id)))

# Fill in missing generation defaults with eos. Qwen-family models may not
# define a pad token, so eos is used as the pad token in that case.
for _field in ("eos_token_id", "pad_token_id"):
    if getattr(model.generation_config, _field) is None:
        setattr(model.generation_config, _field, eos_id)

def chat(messages, max_new_tokens=768, temperature=0.7, top_p=0.9):
    """Generate one assistant reply for a (possibly multimodal) conversation.

    Args:
        messages: List of chat messages. Each message is a dict whose
            ``"content"`` is either a plain string or a list of parts such as
            ``{"type": "image", "image": <path>}`` and
            ``{"type": "text", "text": <str>}``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 or below selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed and surrounding whitespace stripped.
    """
    # Render the conversation with the processor's chat template.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Load every referenced image as RGB, in message order.
    imgs = []
    for msg in messages:
        parts = msg["content"]
        if isinstance(parts, str):
            # Plain-string content carries no image parts; iterating it would
            # yield characters and fail on the "type" lookup.
            continue
        for part in parts:
            if part["type"] == "image":
                # convert() returns an independent copy, so the file handle
                # can be closed as soon as the block exits.
                with Image.open(part["image"]) as src:
                    imgs.append(src.convert("RGB"))

    # Tensorize text and images, then move everything to the model's device.
    inputs = processor(
        text=prompt,
        images=imgs if imgs else None,
        return_tensors="pt"
    ).to(model.device)

    input_len = inputs["input_ids"].shape[-1]

    # Sampling requires a strictly positive temperature; fall back to greedy
    # decoding when the caller asks for temperature <= 0.
    if temperature is not None and temperature <= 0:
        decoding_kwargs = {"do_sample": False}
    else:
        decoding_kwargs = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        }

    gen_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=stop_ids,   # stop on eos or <|im_end|>
        pad_token_id=model.generation_config.pad_token_id,
        **decoding_kwargs
    )

    # Decode only the newly generated part, not the echoed prompt.
    new_tokens = gen_ids[0, input_len:]
    text = processor.decode(new_tokens, skip_special_tokens=True).strip()

    return text

# Instruction text plus two few-shot examples. The scenario cells pass this
# string as a text part of the user message, right after the image part.
Sys_few = """
[System Prompt]
Input: A workplace image and a user’s vague command
Output: A helpful and explicit actionable instruction that resolves the vague command in a way that assists the user

Rules:
1. Only consider the objects or situations in the image.
2. Convert vague requests into concrete helpful actions (e.g., bring an item, suggest an alternative, operate equipment).
3. Responses must be short, supportive, and expressed as direct action commands focused on assisting the user.

[Few-shot Examples]
User: My touchpad is broken.
Assistant: I will hand you the mouse

User: My phone battery is dead.
Assistant: I will connect the charger for you.
"""

# Scenario 2: one user turn made of an image, the shared instruction text,
# and the vague command to resolve.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="image", image="/content/사진 6.png"),
            dict(type="text", text=Sys_few),
            dict(type="text", text="My mouth smells weird."),
        ],
    )
]

answer = chat(conversation)
print(answer)

I will hand you a toothbrush to brush your teeth.


In [6]:
# Scenario 1: one user turn made of an image, the shared instruction text,
# and the vague command to resolve.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="image", image="/content/사진 5.png"),
            dict(type="text", text=Sys_few),
            dict(type="text", text="I want to pay."),
        ],
    )
]

answer = chat(conversation)
print(answer)

I will hand you the credit card.
