
Commit e38137d (parents: 860c924 + 7051ce6)

Committed by 白永斌

Merge branch 'model_register' of https://github.com/dsxsteven/vllm_splitPR into model_register

* 'model_register' of https://github.com/dsxsteven/vllm_splitPR: (138 commits)
  Retrieve `sliding_window` from text config in Gemma3 MM (vllm-project#25085)
  [Docs] Fix API Reference (vllm-project#25140)
  [Kernel] Better inf handling for grouped topk cu (vllm-project#24886)
  [CLI] Use streaming in CLI chat and completion commands (vllm-project#23769)
  [benchmark] add peak throughput metrics and plot (vllm-project#23867)
  [Spec Decode] Efficient padded speculation (vllm-project#24539)
  [V0 Deprecation] Remove more V0 tests (vllm-project#25117)
  [EPLB] Add EPLB support for hunyuan_v1 (vllm-project#23078)
  [XPU] Whisper model support on XPU Platform (vllm-project#25123)
  Mark prompt logprobs as incompatible with prompt embeds at API level (vllm-project#25077)
  [Model] enable data parallel for InternVL vision encoder (vllm-project#23909)
  [Kernels] Overlap shared experts with combine instead of dispatch (vllm-project#24254)
  [Bugfix][Qwen3-Next] add prefixes to shared_expert in qwen3-next and mlp in qwen2moe to successfully load ignored params in quantized models (vllm-project#24960)
  [Core][MM] Cleanup `MultiModalCache` (vllm-project#25006)
  [Docs] Clean up the contributing README (vllm-project#25099)
  [MM Encoder] Apply DP ViT for Qwen3-VL model series (vllm-project#24955)
  [Kernels] Enable DeepGEMM by default (vllm-project#24462)
  [V0 Deprecation] Skip PP test (vllm-project#25128)
  [V0 Deprecation] Remove misc V0 tests (vllm-project#25118)
  [V0 Deprecation] Remove V0 Tracing & Metrics tests (vllm-project#25115)
  ...

File tree: 526 files changed (+20833 / −29953 lines)


.buildkite/nightly-benchmarks/nightly-descriptions.md — 1 addition, 1 deletion

```diff
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
```

.buildkite/release-pipeline.yaml — 4 additions, 12 deletions

```diff
@@ -1,24 +1,22 @@
 steps:
   # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -30,12 +28,8 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
   - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
@@ -102,8 +96,6 @@ steps:
     depends_on:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
```

.buildkite/scripts/annotate-release.sh — 22 additions, 7 deletions

```diff
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
```
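The annotation now documents publishing one image per architecture and stitching the per-arch tags into multi-arch manifest lists with `docker manifest create`/`push`. A minimal sketch of sanity-checking the result after the pushes above (not part of the script; assumes a Docker CLI with manifest support):

```bash
# List the architectures advertised by the manifest list; both amd64
# (x86_64) and arm64 (aarch64) entries should appear.
docker manifest inspect vllm/vllm-openai:latest |
    grep -E '"architecture": "(amd64|arm64)"'

# Pulling the multi-arch tag then resolves to the image matching the
# host's architecture, so one tag serves both platforms.
docker pull vllm/vllm-openai:latest
```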

.buildkite/scripts/hardware_ci/run-cpu-test.sh — 0 additions, 1 deletion

```diff
@@ -66,7 +66,6 @@ function cpu_tests() {
 
   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
-    --ignore=tests/models/multimodal/generation/test_mllama.py \
     --ignore=tests/models/multimodal/generation/test_pixtral.py \
     -m cpu_model"
```

.buildkite/test-pipeline.yaml — 31 additions, 35 deletions

```diff
@@ -46,24 +46,18 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/mq_llm_engine
-  - tests/async_engine
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
   - tests/utils_
-  - tests/worker
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
   commands:
   - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
-  - pytest -v -s async_engine # AsyncLLMEngine
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s utils_ # Utils
-  - pytest -v -s worker # Worker
   - pytest -v -s transformers_utils # transformers_utils
 
 - label: Python-only Installation Test # 10min
@@ -84,25 +78,12 @@ steps:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Core Test # 22min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
-  commands:
-  - pytest -v -s core
 
 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
@@ -230,16 +211,14 @@ steps:
   num_gpus: 2
   source_file_dependencies:
   - vllm/
-  - tests/metrics
   - tests/v1/tracing
   commands:
-  - pytest -v -s metrics
   - "pip install \
     'opentelemetry-sdk>=1.26.0' \
     'opentelemetry-api>=1.26.0' \
     'opentelemetry-exporter-otlp>=1.26.0' \
     'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s tracing
+  - pytest -v -s v1/tracing
 
 ##### fast check tests #####
 ##### 1 GPU test #####
@@ -394,6 +373,7 @@ steps:
   - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
+  - pytest -v -s compile/test_noop_elimination.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -548,15 +528,6 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: Encoder Decoder tests # 12min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/encoder_decoder
-  commands:
-  - pytest -v -s encoder_decoder
-
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
@@ -817,7 +788,7 @@ steps:
   # Quantization
   - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
   - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-  - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -829,6 +800,20 @@ steps:
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 
+- label: GPT-OSS Eval (Blackwell)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true # disable while debugging
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+  - uv pip install --system 'gpt-oss[eval]==0.0.5'
+  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
 
@@ -954,7 +939,6 @@ steps:
   commands:
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py
-  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 
 - label: LoRA TP Test (Distributed) # 17 min
   timeout_in_minutes: 30
@@ -1028,9 +1012,21 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distrubted Tests (H200) # optional
   gpu: h200
   optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+  - pytest -v -s tests/distributed/test_context_parallel.py
```
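One move above that affects local workflows: the tracing suite now lives under `tests/v1/tracing`, with its OpenTelemetry dependencies installed inline by the step. A minimal sketch of reproducing the relocated step locally (pins copied from the pipeline; running from the repository's `tests/` directory mirrors the CI invocation, which is an assumption about the step's working directory):

```bash
# Install the OpenTelemetry packages the CI step pins, then run the
# relocated tracing tests.
pip install 'opentelemetry-sdk>=1.26.0' 'opentelemetry-api>=1.26.0' \
    'opentelemetry-exporter-otlp>=1.26.0' \
    'opentelemetry-semantic-conventions-ai>=0.4.1'
pytest -v -s v1/tracing
```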

.coveragerc — 32 additions, 0 deletions

```diff
@@ -0,0 +1,32 @@
+[run]
+source = vllm
+omit =
+    */tests/*
+    */test_*
+    */__pycache__/*
+    */build/*
+    */dist/*
+    */vllm.egg-info/*
+    */third_party/*
+    */examples/*
+    */benchmarks/*
+    */docs/*
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
```
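The new `.coveragerc` is a standard coverage.py configuration: `[run]` scopes measurement to the `vllm` package and omits tests, build artifacts, examples, and docs; `[report] exclude_lines` keeps defensive and abstract code (e.g. `raise NotImplementedError`, `@abstractmethod`) out of the report; and `[html]`/`[xml]` fix the output locations. A minimal usage sketch (the test selection is illustrative; assumes coverage.py is installed):

```bash
# coverage.py picks up .coveragerc from the working directory automatically.
python -m coverage run -m pytest tests/v1/core -q  # measure per [run]
python -m coverage report   # terminal summary, filtered per [report]
python -m coverage html     # writes htmlcov/ per [html]
python -m coverage xml      # writes coverage.xml per [xml]
```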

.github/CODEOWNERS — 17 additions, 8 deletions

```diff
@@ -2,24 +2,27 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/attention @LucasWilkinson
 /vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/distributed/kv_transfer @NickLucche
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -30,30 +33,33 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @heheda12345
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @heheda12345
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
 
 # Docs
 /docs @hmellor
@@ -101,4 +107,7 @@ mkdocs.yaml @hmellor
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
-/vllm/tests/v1/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
```