In [None]:
import os
# Set VLLM_USE_V1=0 in the process environment. This must happen before vllm
# is imported, which is why it sits above the remaining imports.
# Intent, per the original author's note: disable the V1 engine's
# FlashAttention so that vLLM falls back to the older attention backend that
# supports cross-attention.
# NOTE(review): confirm this fallback against the installed vLLM version.
os.environ['VLLM_USE_V1'] = '0'

import torch
from transformers import AutoConfig

# 导入 vLLM 的分布式初始化工具和模型组件
from vllm.model_executor.models.bart import BartDecoder
from vllm.distributed import init_distributed_environment, initialize_model_parallel

# Model identifier passed to AutoConfig.from_pretrained further down.
model_name = "ByteDance/Dolphin"

# ---- Manually initialise a mock distributed environment ----
# Mimic a single-device setup: one participant, which is rank 0 both
# globally and locally.
world_size, rank, local_rank = 1, 0, 0
# TCP rendezvous endpoint; make sure the port is not already in use.
master_addr, master_port = "localhost", "29501"

init_distributed_environment(
    distributed_init_method=f"tcp://{master_addr}:{master_port}",
    local_rank=local_rank,
    rank=rank,
    world_size=world_size,
)

# Create the tensor-parallel group with a tensor-parallel size of 1.
initialize_model_parallel(tensor_model_parallel_size=1)
# -----------------------------------------


# By this point the environment variable and the distributed / model-parallel
# initialisation earlier in the script have both been applied.
print("vLLM distributed environment initialized. Creating BartDecoder with fallback attention backend...")
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# Build the vLLM BartDecoder from the `decoder` sub-config, with no cache
# config and no quantisation config. The original author expected this to
# succeed once the setup above has run.
decoder = BartDecoder(config=config.decoder, cache_config=None, quant_config=None)

# Print the banner followed by the module's representation on the next line.
print("\n------ BartDecoder from vLLM (instantiated successfully) ------", decoder, sep="\n")