In [None]:
!pip install -U transformers



In [None]:
!pip install -U bitsandbytes



In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image

# Checkpoint to load; going by its name this is a bitsandbytes 4-bit build of
# Qwen2.5-VL-7B-Instruct (hence the bitsandbytes install earlier in the notebook).
model_id = "unsloth/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit"
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="cuda",
    # `torch_dtype` is deprecated in current transformers releases (the
    # library itself warns "Use `dtype` instead"), so pass `dtype`.
    dtype=torch.float16,
)
processor = AutoProcessor.from_pretrained(model_id)

# Stop-token setup: generation should end on either the tokenizer's EOS token
# or the chat-template end-of-turn marker <|im_end|>.
eos_id = processor.tokenizer.eos_token_id
im_end_id = processor.tokenizer.convert_tokens_to_ids("<|im_end|>")
# Drop missing ids and de-duplicate while keeping order: the EOS token may
# already be <|im_end|>, in which case both lookups return the same id.
stop_ids = list(dict.fromkeys(t for t in (eos_id, im_end_id) if t is not None))

# Fill in generation defaults only when the checkpoint's config leaves them unset.
if model.generation_config.eos_token_id is None:
    model.generation_config.eos_token_id = eos_id
if model.generation_config.pad_token_id is None:
    # Qwen-family tokenizers may not define a pad token; fall back to EOS.
    model.generation_config.pad_token_id = eos_id

def chat(messages, max_new_tokens=768, temperature=0.7, top_p=0.9):
    """Generate one sampled assistant reply for a (possibly multimodal) chat.

    Relies on the module-level ``processor``, ``model`` and ``stop_ids``.

    Args:
        messages: List of chat messages, each a dict with ``"role"`` and
            ``"content"``. ``"content"`` is either a plain string or a list of
            parts; a part with ``"type": "image"`` carries the image under
            ``"image"`` as a file path or an already-loaded PIL image.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        cut off), with special tokens removed and surrounding whitespace
        stripped.
    """
    # Render the conversation to a prompt string with the model's chat template.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Load every referenced image, normalised to RGB.
    imgs = []
    for msg in messages:
        content = msg["content"]
        if isinstance(content, str):
            # Text-only message: there are no parts to scan. Iterating the
            # string would index its characters with "type" and raise TypeError.
            continue
        for part in content:
            if part["type"] != "image":
                continue
            source = part["image"]
            if isinstance(source, Image.Image):
                imgs.append(source.convert("RGB"))
            else:
                # The context manager closes the underlying file handle;
                # convert() loads the pixels first and returns a separate
                # in-memory image, so the result stays usable afterwards.
                with Image.open(source) as raw:
                    imgs.append(raw.convert("RGB"))

    # Tokenise the text and preprocess the images into model-ready tensors.
    inputs = processor(
        text=prompt,
        images=imgs if imgs else None,
        return_tensors="pt"
    ).to(model.device)

    # Remember the prompt length so only the continuation is decoded below.
    input_len = inputs["input_ids"].shape[-1]

    gen_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,          # sampling, so list-style explanations are not cut short
        temperature=temperature,
        top_p=top_p,
        eos_token_id=stop_ids,   # stop on EOS or <|im_end|>
        pad_token_id=model.generation_config.pad_token_id
    )

    # Decode only the generated part (everything after the prompt tokens).
    new_tokens = gen_ids[0, input_len:]
    text = processor.decode(new_tokens, skip_special_tokens=True).strip()

    return text
# Few-shot chain-of-thought prompt: two worked examples (user statement ->
# step-by-step reasoning grounded in the image -> short assistant response),
# followed by an instruction to answer in the same format. It is sent as a
# text part ahead of the actual user request in each conversation below.
CoT = """
example 1)
user : My touchpad is broken.

Reasoning :
The user stated the problem: "My touchpad is broken."

To operate the laptop, an alternative input device is required.

The most appropriate substitute input device is a mouse.

It is confirmed that a mouse is available nearby in image.

Therefore, the optimal solution is to provide the user with the mouse so they can use the laptop without inconvenience.

Assistant’s response:
I will hand you the mouse.

example 2)
user : My phone battery is dead.

Reasoning :
The user stated the problem: "My phone battery is dead."

To use the phone, a power supply is required.

When the battery is depleted, an alternative way to supply power is necessary.

The appropriate alternatives are connecting a charger or using a power bank.

A charger is visible in the image.

Therefore, the optimal solution is to provide the charger so the user can use the phone without inconvenience.

Assistant’s response:
I will connect the charger for you.

Generate a new answer following the examples format.
"""

# First query: a single user turn made of an image, the few-shot prompt,
# and the actual request.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="image", image="/content/사진 5.png"),
            dict(type="text", text=CoT),
            dict(type="text", text="I want to pay."),
        ],
    ),
]

reply = chat(conversation)
print(reply)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Reasoning:
The user has indicated a desire to make a payment.

To complete a payment, a method of payment is needed. This could be a credit card, cash, or another form of electronic payment.

In the image, there is a credit card visible.

Therefore, the optimal solution is to provide the credit card so the user can make the payment without inconvenience.

Assistant’s response:
I will give you the credit card.


In [None]:
# Follow-up query with a different image: a single user turn made of the
# image, the few-shot prompt, and the actual request.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="image", image="/content/사진 6.png"),
            dict(type="text", text=CoT),
            dict(type="text", text="My mouth smells weird."),
        ],
    ),
]

reply = chat(conversation)
print(reply)

Reasoning:
The user has mentioned that their mouth smells weird. This issue typically indicates bad breath, which can be caused by various factors such as poor oral hygiene, food remnants, or underlying health issues.

To address this problem, it's important to clean one's teeth thoroughly and consider using mouthwash or a breath freshener.

Mouthwash and a toothbrush are visible in the image, suggesting these items might be used to address the issue.

Therefore, the optimal solution is to suggest the user use the mouthwash or brush their teeth.

Assistant’s response:
I recommend brushing your teeth or using mouthwash to improve your breath.
