In [1]:
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
import json
import datetime
import torch

In [3]:
model_name_or_path = './qwen/Qwen1.5-7B-Chat'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [4]:
messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "你好"}
    ]

# 调用模型进行对话生成
input_ids = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
input_ids

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好<|im_end|>\n<|im_start|>assistant\n'

In [5]:
model_inputs = tokenizer([input_ids], return_tensors="pt").to('cuda')
model_inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 108386, 151645,    198, 151644,
          77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [6]:
model_inputs.input_ids

tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 108386, 151645,    198, 151644,
          77091,    198]], device='cuda:0')

In [7]:
generated_ids = model.generate(model_inputs.input_ids,max_new_tokens=512)
generated_ids

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 108386, 151645,    198, 151644,
          77091,    198, 108386,   6313, 112169, 106184,  99553, 100364,   1773,
         104139,  86119, 100631,  85106, 100703,   9370, 101037,  11319, 151645]],
       device='cuda:0')

In [8]:
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
generated_ids

[tensor([108386,   6313, 112169, 106184,  99553, 100364,   1773, 104139,  86119,
         100631,  85106, 100703,   9370, 101037,  11319, 151645],
        device='cuda:0')]