In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
# Seed torch's RNG before anything is generated.
torch.manual_seed(1234)

# Tokenizer and weights are both read from the same local checkpoint directory.
base_checkpoint = "/data1/dxw_data/llm/Qwen-VL_model"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    device_map="cuda",
    trust_remote_code=True,
)
model = model.eval()

# Multimodal prompt: one image followed by the text instruction.
query = tokenizer.from_list_format([
    {'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},  # Either a local path or an url
    {'text': 'Generate the caption in English with grounding:'},
])

# Tokenize the prompt and move the resulting tensors onto the model's device.
inputs = tokenizer(query, return_tensors='pt').to(model.device)
pred = model.generate(**inputs)

# Special tokens are kept in the decoded text; presumably the <ref>/<box>
# markup is needed by the box-drawing call below -- TODO confirm.
response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
print(response)
# Sample output:
# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>

# Draw the predicted boxes; a falsy result is reported as "no box".
image = tokenizer.draw_bbox_on_latest_picture(response)
if not image:
    print("no box")
else:
    image.save('2.jpg')

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 10/10 [00:16<00:00,  1.66s/it]


Picture 1: <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>
Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(729,806)</box> playing with<ref> her dog</ref><box>(217,423),(582,897)</box> on the beach<|endoftext|>


In [None]:
# -------------------- Actual test run ---------------------- #

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
# Seed torch's RNG before the chat model is used.
torch.manual_seed(1234)

# Tokenizer, weights and generation config all come from this local directory.
chat_checkpoint = "/data1/dxw_data/llm/Qwen-VL-Chat"

# Note: The default behavior now has injection attack prevention off.
tokenizer = AutoTokenizer.from_pretrained(chat_checkpoint, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    chat_checkpoint,
    device_map="cuda",
    trust_remote_code=True,
)
model = model.eval()

# Specify hyperparameters for generation
model.generation_config = GenerationConfig.from_pretrained(
    chat_checkpoint,
    trust_remote_code=True,
)

In [None]:
import json
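# `model` and `tokenizer` must already be loaded before calling the function below, e.g.:
# tokenizer = AutoTokenizer.from_pretrained("your-model-name", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("your-model-name", trust_remote_code=True)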

def generate_captions(model, tokenizer, image_folder, num_images, output_file):
    """Caption the images ``1.png`` .. ``<num_images>.png`` and save them as JSON.

    For every index ``i`` from 1 to ``num_images`` (inclusive) a query is built
    from ``{image_folder}/{i}.png`` plus a fixed English prompt, sent to
    ``model.chat`` with no prior history, and the reply is printed and
    collected. The collected list is then written to ``output_file``.

    Args:
        model: Object exposing ``chat(tokenizer, query=..., history=...)``
            that returns a ``(response, history)`` pair.
        tokenizer: Object exposing ``from_list_format(list_of_dicts)``.
        image_folder: Directory prefix of the numbered ``.png`` files.
        num_images: Number of images to process; values below 1 produce an
            empty JSON list.
        output_file: Path of the JSON file to (over)write.

    Returns:
        None. The output is a JSON list of
        ``{"image": "<i>.png", "caption": <response>}`` objects.
    """
    responses = []

    for i in range(1, num_images + 1):  # image files are numbered starting at 1
        query = tokenizer.from_list_format([
            {'image': f'{image_folder}/{i}.png'},
            {'text': 'Generate some descriptions of the image'},
        ])
        # history=None: no earlier turns are passed along with this image.
        response, history = model.chat(tokenizer, query=query, history=None)
        print(response)
        print(f"-----------------第{i}次结束-----------------")
        responses.append({'image': f'{i}.png', 'caption': response})

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the platform-default encoding may be unable
    # to encode them (UnicodeEncodeError) or may yield a non-UTF-8 JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(responses, f, ensure_ascii=False, indent=4)

In [None]:
# Example usage: where the numbered PNGs live, how many there are, and where
# the resulting captions are written.
output_file = 'captions.json'
num_images = 55
image_folder = '/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/input'

# Generate the captions and save them
generate_captions(
    model=model,
    tokenizer=tokenizer,
    image_folder=image_folder,
    num_images=num_images,
    output_file=output_file,
)