In [None]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv
load_dotenv()

from clients.prompt_client import PromptClient
import json

# Build the prompt client from environment settings (load_dotenv() was called above).
# Fail fast on missing settings: os.getenv() returns None for an unset variable, and
# the f-string below would otherwise silently produce a URL such as "None/None/v1".
_missing_settings = [name for name in ("BASE_URL", "APPLICATION_NAME") if not os.getenv(name)]
if _missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_settings)}. "
        "Define them in the environment or in the .env file."
    )

client = PromptClient(
    # Endpoint layout: <BASE_URL>/<APPLICATION_NAME>/v1
    base_url=f"{os.getenv('BASE_URL')}/{os.getenv('APPLICATION_NAME')}/v1",
    # API_TOKEN is passed through as-is and may be None.
    # NOTE(review): presumably acceptable for unauthenticated servers — confirm.
    api_key=os.getenv('API_TOKEN')
)

## Of course, you can also stream the response

In [None]:
import time

# Single-request (batch 1) streaming benchmark: print the generated text as it
# arrives and record the throughput between successive element-count milestones.
response = client.stream(
    input_data={'topic': "guacamole"},
    route="/prompts/speedbench"
)

# Cumulative stream-element counts at which a throughput figure is reported.
intervals = [64, 128, 256, 512, 1024, 2048]
report = ""
interval_index = 0
t0 = None

# NOTE(review): `i` counts every streamed element, not only the 'text' ones; the
# "tokens/sec" label assumes one element is one token — TODO confirm.
for i, elt in enumerate(response):
    if elt['format'] == 'text':
        print(elt['content'], end="")

    if i == 0:
        # Start the clock on the first streamed element, so the wait before the
        # first element is not counted in the first interval.
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        t0 = time.perf_counter()

    # Check if we've reached the next interval
    if interval_index < len(intervals) and i == intervals[interval_index]:
        t1 = time.perf_counter()
        lower = intervals[interval_index - 1] if interval_index > 0 else 0
        upper = intervals[interval_index]
        tokens_in_interval = upper - lower
        delta = t1 - t0
        # Guard against a zero-length interval (elements delivered within one
        # clock tick), which previously raised ZeroDivisionError.
        rate = tokens_in_interval / delta if delta > 0 else float("inf")
        report += f"{lower}-{upper}: {rate:.2f} tokens/sec\n"
        t0 = t1  # Reset timer for next interval
        interval_index += 1

print(report)

In [None]:
import time

# Batch benchmark: 16 distinct subjects repeated 8 times, i.e. 128 prompts in total.
topics = 8 * [
    "guacamole", "artificial intelligence", "quantum computing", "sustainable energy",
    "space exploration", "neuroscience", "blockchain technology", "virtual reality",
    "climate change", "biotechnology", "robotics", "cybersecurity",
    "renewable resources", "machine learning", "genetic engineering", "autonomous vehicles",
]

# One input payload per prompt, all sent to the same route as the streaming test.
# NOTE(review): n_jobs is presumably the number of concurrent requests and verbose
# the progress-logging level — confirm against PromptClient.batch.
response = client.batch(
    input_data=[dict(topic=subject) for subject in topics],
    route="/prompts/speedbench",
    n_jobs=32,
    verbose=50,
)

In [None]:
gpt-oss-120b
0-64: 182.14 tokens/sec
64-128: 147.11 tokens/sec
128-256: 158.66 tokens/sec
256-512: 143.20 tokens/sec
512-1024: 154.57 tokens/sec
1024-2048: 148.10 tokens/sec

batch_4: ~403-409 tokens/sec
batch_8: ~770-776 tokens/sec
batch_16: ~1294-1302 tokens/sec
batch_32: ~1986-2146 tokens/sec


gpt-oss-20b
0-64: 196.09 tokens/sec
64-128: 199.98 tokens/sec
128-256: 214.26 tokens/sec
256-512: 198.01 tokens/sec
512-1024: 196.56 tokens/sec
1024-2048: 194.38 tokens/sec

batch_4: ~564-624 tokens/sec
batch_8: ~1054-1117 tokens/sec
batch_16: ~1887-1912 tokens/sec
batch_32: ~2904-2911 tokens/sec


Qwen3-32B-AWQ
0-64: 60.47 tokens/sec
64-128: 68.94 tokens/sec
128-256: 62.53 tokens/sec
256-512: 62.36 tokens/sec
512-1024: 61.99 tokens/sec

batch_4: ~227-233 tokens/sec
batch_8: ~447-452 tokens/sec
batch_16: ~920-936 tokens/sec
batch_32: ~1448-1482 tokens/sec


Mistral-Small-3.2-24B-Instruct-hf-AWQ
0-64: 89.39 tokens/sec
64-128: 95.77 tokens/sec
128-256: 89.29 tokens/sec
256-512: 87.29 tokens/sec
512-1024: 86.95 tokens/sec
1024-2048: 86.59 tokens/sec

batch_4: ~288-336 tokens/sec
batch_8: ~631-646 tokens/sec
batch_16: ~1109-1153 tokens/sec
batch_32: ~1714-1790 tokens/sec


Qwen3-4B-Instruct-2507-GPTQ
0-64: 208.21 tokens/sec
64-128: 205.15 tokens/sec
128-256: 223.60 tokens/sec
256-512: 210.72 tokens/sec
512-1024: 211.67 tokens/sec
1024-2048: 207.49 tokens/sec

batch_4: ~721-743 tokens/sec
batch_8: ~1158-1377 tokens/sec
batch_16: ~2044-2236 tokens/sec
batch_32: ~2400-2666 tokens/sec


Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit
0-64: 179.42 tokens/sec
64-128: 176.71 tokens/sec
128-256: 176.01 tokens/sec
256-512: 175.81 tokens/sec
512-1024: 175.44 tokens/sec
1024-2048: 172.64 tokens/sec

batch_4: ~490-510 tokens/sec
batch_8: ~950-1000 tokens/sec
batch_16: ~1520-1602 tokens/sec
batch_32: ~2200-2400 tokens/sec

## Post

# RTX PRO 6000 Blackwell for LLM

I just received my brand-new Blackwell card, so I ran a quick benchmark to help the community grasp its pros and cons.

## Setup Details:
GPU : RTX PRO 6000 Max-Q Workstation Edition, about 20% less performance than the full-power edition, but with half the power draw. 2 slots
CPU : Ryzen 9 3950X, 24 PCIe lanes, 16 cores / 32 threads
RAM : 128 GB DDR4 @ 3600 MT/s

GPU1 : RTX 3090 24gb blower edition. 2 slots, unused here
GPU2 : RTX 3090 24gb founder edition. 3 slots, unused here

## Software details
### OS
Ubuntu 22.04
nvidia drivers : 770 open
Cuda toolkit 13

### Env
```bash
conda create --name vllm python=3.12
conda activate vllm

uv pip install flashinfer-python --prerelease=allow --upgrade --extra-index-url https://download.pytorch.org/whl/nightly/cu128
uv pip install vllm --torch-backend=cu128
```

## Training Benchmark
Two things set this card apart for training:
- the number of tensor cores is outstanding, about 60% more than a single B100 GPU
- the 96 GB of VRAM is a game changer for training, enabling very large batches, and therefore faster and smoother training

### Experiment:
Pretraining of a SLM with 35M parameters, based on GQA architecture with 8 layers, trained with pytorch lightning.
Training dataset is TinyStories, with a budget of 1B tokens (2 epochs), a sequence length of 256 tokens, and a virtual batch size of 100k tokens.
Models are trained in mixed bf16 precision (additional improvement could be expected from using Blackwell fp8 training).

### Results:
- 1 x 4090 Laptop (similar perf as a 3090 Desktop) : ~2.5 hours to complete the training run
- 1 x RTX 6000 pro maxq workstation : ~20 min to complete the training run

### Conclusion
With proper optimization, the card can single-handedly deliver the training compute of 7.5 RTX 3090 cards, while pulling only 300 W of electricity (and being very quiet).

## Inference Benchmark
In inference, memory bandwidth can be the bottleneck, especially in batch 1 inference.

Let's assess the results at batch sizes 1, 4, 8, 16 and 32 to see how many tokens we can squeeze out of the card.

### Launch
```bash
# Generic vLLM launch used for the inference benchmark.
# NOTE(review): the model configured below is listed under "Failed Test" later in
# this post — swap MODEL_NAME for one of the tested models to reproduce the numbers.

# Thread/job counts (16, same as the CPU core count listed in the setup section).
export NVCC_THREADS=16
export MAX_JOBS=16
export OMP_NUM_THREADS=16
# Select FlashInfer as the vLLM attention backend.
export VLLM_ATTENTION_BACKEND=FLASHINFER
# NOTE(review): presumably enable the NVFP4 kernels on SM120 (Blackwell) and the
# FlashInfer FP4 MoE path — confirm against the vLLM environment-variable docs.
export ENABLE_NVFP4_SM120=1
export VLLM_USE_FLASHINFER_MOE_FP4=1
export MODEL_NAME="DeepSeek-R1-0528-Qwen3-8B-FP4"
# Serve the model under the alias "gpt-4" on port 5000, with an fp8 KV cache.
vllm serve "$MODEL_NAME" \
--served-model-name gpt-4 \
--port 5000 \
--max-model-len 16000 \
--gpu-memory-utilization 0.9 \
--trust_remote_code \
--max-seq-len-to-capture 8196 \
--enable-chunked-prefill  \
--kv-cache-dtype fp8 \
--compilation-config '{"pass_config":{"enable_fusion":true,"enable_noop":true},"cudagraph_mode":1,"max_capture_size":2048}'
```

### Launch >20B Active
On larger models, tensor cores can do wonders, so above 20B active parameters, the following additional environment variables can provide a small speed increase, especially for batching.
```bash
export VLLM_USE_TRTLLM_ATTENTION=1
export VLLM_USE_TRTLLM_FP4_GEMM=1
export VLLM_FLASHINFER_FORCE_TENSOR_CORES=1
```

Note: I ran every speed test without these flags, but with them Mistral Small, for example, would give around 95 t/s on batch 1 and 1950 t/s on batch 32.

### Launch Qwen MoE
Add the flag `--enable-expert-parallel` to the launch command.

### Launch GPT-OSS
GPT-OSS relies on MXFP4 quantization (because why would they do like everyone else, huh?), a hybrid format that will most likely disappear once NVFP4 is fully supported.
They also rely on their own library for prompt formatting, which is not really compatible with vLLM, so don't expect to get good output from these models. I am just testing the speed here; most of the time they only return blank tokens, which is not really impressive.

#### DOWNLOADS
You'll need to download the following files to make vLLM work with this special-snowflake tokenizer and not break on start:
```bash
sudo wget -O /etc/encodings/o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
sudo wget -O /etc/encodings/cl100k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
```

#### Launch Command
```bash
# vLLM launch for GPT-OSS, served under the alias "gpt-4" on port 5000.
export ENABLE_NVFP4_SM120=1
export VLLM_USE_TRTLLM_ATTENTION=1
export OMP_NUM_THREADS=16
# Directory holding the *.tiktoken files fetched in the DOWNLOADS step.
export TIKTOKEN_ENCODINGS_BASE=/etc/encodings
# NOTE(review): presumably select the FlashInfer MXFP4 MoE kernels — confirm
# against the vLLM environment-variable docs.
export VLLM_USE_FLASHINFER_MXFP4_BF16_MOE=1
export VLLM_USE_FLASHINFER_MXFP4_MOE=1
export VLLM_ATTENTION_BACKEND=FLASHINFER
export MODEL_NAME="gpt-oss-120b"
# The final option must not end with a backslash: a dangling line continuation
# makes the shell wait for (or swallow) the next line when the command is pasted.
vllm serve "$MODEL_NAME" \
--async-scheduling \
--served-model-name gpt-4 \
--port 5000 \
--max-model-len 16000 \
--gpu-memory-utilization 0.9 \
--trust_remote_code \
--max-seq-len-to-capture 8196 \
--compilation-config '{"pass_config":{"enable_fusion":true,"enable_noop":true},"cudagraph_mode":1,"max_capture_size":2048}'
```

### Model Tested:
- Qwen3-Coder-30B-A3B-Instruct-GPTQ-4bit
- Qwen3-4B-Instruct-2507-GPTQ
- Qwen3-32B-AWQ
- Mistral-Small-3.2-24B-Instruct-hf-AWQ
- gpt-oss-20b
- gpt-oss-120b
- Llama-4-Scout-17B-16E-Instruct-AWQ

### Failed tests (I was not able to run the NVFP4 formats properly, for reasons not yet identified)
- DeepSeek-R1-0528-Qwen3-8B-FP4
- Qwen3-32B-FP4

### Results


### Conclusion
No surprise: in batch 1, the performance is good but not outstanding, limited by the 1.7 TB/s of GDDR7 memory bandwidth.
The Blackwell optimizations allow squeezing out a bit more performance though (which might explode once Flash Attention 4 is released), and the card just slightly beats the speed of 2 x 3090 with tensor parallelism.

The game changer is at batch 32, with an almost linear scaling of the number of tokens delivered with batch size, so the card might be really useful for small-scale serving and multi-agent deployments.

So far, support is still not completely ready, but sufficient to play with some models.

## Code to reproduce the results
Training scripts can be found on this repo for pretraining: https://github.com/gabrielolympie/ArchiFactory
Speed Benchmark for inference + used prompts can be found in : https://github.com/gabrielolympie/PromptServer





