In [1]:
from transformers import Qwen3ForCausalLM, Qwen3Config, AutoTokenizer
from transformers.generation import GenerationConfig

In [2]:
# Local checkpoint directory. Forward slashes are accepted on Windows as well as
# on POSIX systems, and match the other 'model_save/...' paths in this notebook.
model_path = 'model_save/Qwen/Qwen3-0___6B'
device = 'cpu'
# Tokenizer and reference language model both come from the same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model1 = Qwen3ForCausalLM.from_pretrained(model_path).to(device)

In [3]:
def test_model(model: Qwen3ForCausalLM, prompt: str = '介绍一下你自己', max_new_tokens: int = 32):
    """Greedy-decode a short chat reply from ``model`` and print it.

    The prompt is wrapped as a single user turn, rendered with the chat
    template in non-thinking mode, and decoded deterministically
    (``do_sample=False``), so two models holding the same weights print the
    same text.

    Relies on the notebook-global ``tokenizer`` (and ``GenerationConfig``);
    whatever ``tokenizer`` is bound to at call time is the one used.

    Args:
        model: Causal LM to generate with; inputs are moved to ``model.device``.
        prompt: User message content. Defaults to the original smoke-test prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The decoded continuation is printed.
    """
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # non-thinking mode (the template default is True)
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(do_sample=False, max_new_tokens=max_new_tokens),
        use_model_defaults=False,
    )
    # The generated sequence starts with the prompt tokens; keep only what follows.
    prompt_len = model_inputs.input_ids.shape[1]
    output_ids = generated_ids[0][prompt_len:].tolist()

    # Thinking mode is disabled above, so this is printed as the plain reply.
    print(tokenizer.decode(output_ids, skip_special_tokens=True))
# Baseline: generation from the checkpoint loaded with from_pretrained.
test_model(model1)

您好！我是你的虚拟助手，可以为您提供帮助和支持。如果您有任何问题或需要帮助，请随时告诉我！


In [None]:
# config = Qwen3Config.from_pretrained('Qwen3-0___6B')


In [None]:
# import re
# for k, v in config.__dict__.items():
#     if isinstance(v, int|float|str|bool):
#         t = re.findall(r'\'(.*)\'', str(type(v)))[0]
#         print(f"{k}: {t} = {v}")

In [5]:
from dataclasses import dataclass

@dataclass
class VLMCOFIG:
    """Scalar hyper-parameters handed to ``Qwen3Config`` as keyword arguments.

    Every field has a default, so ``VLMCOFIG()`` yields a complete set of values.
    The values were presumably copied from the Qwen3-0.6B checkpoint config
    (see the commented-out dump cell) -- verify before changing any of them.
    """

    # Vocabulary and context length
    vocab_size: int = 151_936
    max_position_embeddings: int = 40_960

    # Transformer dimensions
    hidden_size: int = 1_024
    intermediate_size: int = 3_072
    num_hidden_layers: int = 28
    num_attention_heads: int = 16

    # Sliding-window settings
    use_sliding_window: bool = False
    max_window_layers: int = 28

    # Attention head layout
    num_key_value_heads: int = 8
    head_dim: int = 128

    # Activation, initialisation and normalisation
    hidden_act: str = "silu"
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6

    use_cache: bool = True
    rope_theta: int = 1_000_000

    # Attention projection options
    attention_bias: bool = False
    attention_dropout: float = 0.0

    tie_word_embeddings: bool = True
    chunk_size_feed_forward: int = 0

    # Encoder/decoder flags, all disabled
    is_encoder_decoder: bool = False
    is_decoder: bool = False
    add_cross_attention: bool = False
    tie_encoder_decoder: bool = False

    # Special token ids
    bos_token_id: int = 151_643
    eos_token_id: int = 151_645


In [6]:
# Unpack the dataclass fields as keyword arguments for the Hugging Face config.
new_config = Qwen3Config(**vars(VLMCOFIG()))

# Build the model from the config object only; no checkpoint path is involved here.
new_model = Qwen3ForCausalLM(new_config)

In [7]:
# new_model was constructed from the config alone (no weights loaded yet), so
# the text printed here is not expected to be meaningful.
test_model(new_model)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries_countries都知道عمارعمارعمارعمارعمارعمارعمارعمار


In [8]:
# Copy the pretrained model's state dict into the config-built model. If the
# hand-written config matches the checkpoint, the output should equal the baseline.
new_model.load_state_dict(model1.state_dict())
test_model(new_model)


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


您好！我是你的虚拟助手，可以为您提供帮助和支持。如果您有任何问题或需要帮助，请随时告诉我！


In [10]:
from transformers import SiglipVisionModel, AutoModel, SiglipTextModel

In [12]:
# Forward slashes keep this local path usable on non-Windows systems too and
# match the other 'model_save/...' paths in this notebook.
visionModel = SiglipVisionModel.from_pretrained('model_save/google/siglip2-base-patch16-256')
# Parameter count expressed in units of 2**20 (not 10**6).
visionModel.num_parameters() / (1024**2)

88.625244140625

In [None]:
# Display the vision model's configuration object.
visionModel.config

In [None]:
import re
# Print each scalar config attribute as a ready-to-paste `name: type = value` line.
for k, v in visionModel.config.__dict__.items():
    # A tuple of types works on every Python 3 version; `int | float` inside
    # isinstance() requires Python 3.10+.
    if isinstance(v, (int, float, str, bool)):
        # Read the type name directly instead of regex-parsing "<class 'int'>".
        t = type(v).__name__
        # !r quotes and escapes strings (safe even when the value contains
        # quotes or backslashes); ints, floats and bools render as before.
        print(f"{k}: {t} = {v!r}")

In [16]:
# model = SiglipTextModel.from_pretrained(r'model_save\google\siglip2-base-patch16-256')
# model.num_parameters() / (1024**2)

In [17]:
# model

In [9]:
from models.config import VLMConfig
from models.vision_language_model import VisionLanguageModel
from models.processors import get_tokenizer, get_image_processor, vlm_input_apply_chat_template
import torch
from PIL import Image
import os
# Point Hugging Face Hub traffic at a mirror endpoint.
# NOTE(review): huggingface_hub appears to read HF_ENDPOINT when it is first
# imported, and transformers is already imported in the first cell -- this
# assignment may therefore come too late to take effect. Confirm, and if so
# set it before any Hugging Face import.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# From here on, newly created floating-point tensors default to bfloat16.
torch.set_default_dtype(torch.bfloat16)

In [10]:
config = VLMConfig()
# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(device)`.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [11]:
# NOTE: this rebinds the notebook-global `tokenizer` that test_model() reads;
# calling test_model() after this cell uses this tokenizer, not the Qwen3 one.
tokenizer = get_tokenizer(config.lm_pretrain_path, config.vlm_extra_tokens, config.lm_chat_template)
# 256 is presumably the input resolution of the siglip2-base-patch16-256 vision
# model loaded earlier -- TODO confirm against get_image_processor.
image_processor = get_image_processor(256)
# Build the vision-language model with load_from_pretrained=True and move it to `device`.
model = VisionLanguageModel(config, load_from_pretrained=True).to(device)

Loading weights from pretrained


In [12]:
# A single user turn: a text instruction followed by an image entry.
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "描述一下这张图片"},
        {
            "type": "image",
            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
        },
    ],
}]

# Render the conversation straight to token ids, ending with the assistant header.
encoded_prompt = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)

# Same ids as a tensor created directly on the target device.
tokens = torch.tensor(encoded_prompt, device=device)
print(encoded_prompt)

[151644, 872, 198, 53481, 100158, 108893, 45930, 151652, 151655, 151653, 151645, 198, 151644, 77091, 198]


In [9]:
# Same conversation rendered as text (tokenize=False) to inspect where the
# vision placeholder tokens sit inside the chat template.
print(tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
))

<|im_start|>user
描述一下这张图片<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant



In [None]:
# from transformers import Qwen2_5_VLProcessor
# from qwen_vl_utils import process_vision_info
# ps = Qwen2_5_VLProcessor.from_pretrained('model_save/qwen25_vl_ps')

# text = ps.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )
# print(text)
# image_inputs, video_inputs = process_vision_info(messages)
# print(image_inputs, type(image_inputs[0]))
# inputs = ps(
#     text=[text],
#     images=image_inputs,
#     videos=video_inputs,
#     padding=True,
#     return_tensors="pt",
# )
# print('\n')
# print(ps.decode(inputs['input_ids'][0]))
# inputs['input_ids']

In [None]:
# Tokenized chat prompt for the VLM, moved to the target device.
inputs = vlm_input_apply_chat_template(tokenizer, messages).to(device)

# The picture fed to the model is this local file, not the URL inside `messages`.
img = Image.open('assets/image.png').convert("RGB")
# unsqueeze(0) adds a leading axis of size 1 before the tensor is moved to the device.
img_t = image_processor(img).unsqueeze(0).to(device)

In [12]:
# Inference only: run under no_grad, as the model2 generation cell does, so no
# autograd state is kept while generating.
with torch.no_grad():
    for i in range(5):
        gen = model.generate(inputs['input_ids'], img_t, max_new_tokens=20)
        # batch_decode returns one string per sequence; the batch holds a single prompt.
        out = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
        print(f"  >> Generation {i+1}: {out}")

  >> Generation 1: 1. **图片的描述**：  
图片显示了一个清晰、直观的描述，能够帮助
  >> Generation 2: 1. 请描述这张图片的内容。
  >> Generation 3: 请描述一下这张图片的内容。
  >> Generation 4: 1. **图片中的场景**：  
- 一个宁静的公园，湖面平静，
  >> Generation 5: 1. **图片描述**  
1. **图片描述**  
1. **图片描述**


In [13]:
# Write the assembled model to a local directory.
save_path = 'model_save/my_vlm_model'
model.save_pretrained(save_path)

In [24]:
from models.vision_language_model import VisionLanguageModel
# NOTE(review): this loads 'model_save/my_model', which is NOT the
# 'model_save/my_vlm_model' directory written by the save cell -- presumably a
# separately produced checkpoint; confirm the path is intended.
# `save_path` is rebound here as a side effect.
save_path = 'model_save/my_model'
model2 = VisionLanguageModel.from_pretrained(save_path).to(device)

In [25]:
# Parameter count expressed in units of 2**20 (not 10**6).
model2.num_parameters() / 1024**2

681.062744140625

In [26]:
# Inference only, so autograd tracking is switched off.
with torch.no_grad():
    for i in range(5):
        # The five recorded outputs differ from one another, so generate()
        # presumably samples; no seed is set in this notebook -- TODO confirm.
        gen = model2.generate(inputs['input_ids'], img_t, max_new_tokens=20)
        # Keep the first (only) decoded sequence of the batch.
        out = tokenizer.batch_decode(gen, skip_special_tokens=True)[0]
        print(f">> Generation {i+1}: {out}")

>> Generation 1: 好的，以下是关于您这张图片的描述：

* 该图片展示了一幅关于现代中国
>> Generation 2: 好的，我来描述一下这张图片。图片中有一张红色的圆点，大小为
>> Generation 3: 好的，我来帮你描述这张图片。如果你是在寻找一些关于这个图片的描述，我可以
>> Generation 4: 好的，以下是针对您上传图片的描述：

我注意到您上传了图片，可能与您
>> Generation 5: 好的，我现在会根据你的要求来描述这张图片。如果你愿意的话，可以告诉我图片中的
