From e61643a49be03065ae77bbd3abef14305517e099 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 2 Nov 2025 10:44:06 +0200
Subject: [PATCH] benches : add folder with benchmarks

---
 benches/dgx-spark.md | 275 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 benches/dgx-spark.md

diff --git a/benches/dgx-spark.md b/benches/dgx-spark.md
new file mode 100644
index 0000000000000..3c23738904132
--- /dev/null
+++ b/benches/dgx-spark.md
@@ -0,0 +1,275 @@

## System info

```bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

nvidia-smi
Sun Nov 2 10:43:25 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   35C    P8              4W /  N/A  |      Not Supported     |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
```
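
All numbers below were collected with the CUDA backend at build `2f68ce7cf` (6918), as noted under each table. The build configuration is not part of the original record; a typical setup for this machine would look roughly like the following (the flags are an assumption, not taken from the benchmark logs):

```bash
# Assumed build steps - not recorded in the original benchmark run.
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
git checkout 2f68ce7cf
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j 20
```
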
## ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

- `llama-batched-bench`

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.390 | 1311.72 | 0.395 | 81.11 | 0.785 | 693.12 |
| 512 | 32 | 2 | 1088 | 0.273 | 3754.20 | 0.674 | 95.00 | 0.946 | 1149.52 |
| 512 | 32 | 4 | 2176 | 0.517 | 3965.01 | 0.843 | 151.90 | 1.359 | 1601.00 |
| 512 | 32 | 8 | 4352 | 1.025 | 3994.63 | 1.008 | 254.09 | 2.033 | 2140.79 |
| 512 | 32 | 16 | 8704 | 2.054 | 3988.71 | 1.277 | 400.94 | 3.331 | 2613.19 |
| 512 | 32 | 32 | 17408 | 4.110 | 3986.29 | 1.666 | 614.82 | 5.776 | 3014.04 |
| 4096 | 32 | 1 | 4128 | 1.166 | 3513.60 | 0.410 | 77.97 | 1.576 | 2619.03 |
| 4096 | 32 | 2 | 8256 | 2.316 | 3537.64 | 0.734 | 87.18 | 3.050 | 2707.09 |
| 4096 | 32 | 4 | 16512 | 4.653 | 3521.53 | 0.915 | 139.90 | 5.567 | 2965.81 |
| 4096 | 32 | 8 | 33024 | 9.277 | 3532.17 | 1.181 | 216.70 | 10.458 | 3157.66 |
| 4096 | 32 | 16 | 66048 | 18.624 | 3518.87 | 1.649 | 310.48 | 20.273 | 3257.89 |
| 4096 | 32 | 32 | 132096 | 37.217 | 3521.82 | 2.424 | 422.45 | 39.641 | 3332.30 |
| 8192 | 32 | 1 | 8224 | 2.473 | 3313.11 | 0.444 | 72.07 | 2.917 | 2819.71 |
| 8192 | 32 | 2 | 16448 | 4.926 | 3326.23 | 0.771 | 82.98 | 5.697 | 2887.14 |
| 8192 | 32 | 4 | 32896 | 9.851 | 3326.52 | 1.006 | 127.28 | 10.856 | 3030.15 |
| 8192 | 32 | 8 | 65792 | 19.635 | 3337.67 | 1.338 | 191.33 | 20.973 | 3136.95 |
| 8192 | 32 | 16 | 131584 | 39.290 | 3336.01 | 1.952 | 262.36 | 41.242 | 3190.56 |
| 8192 | 32 | 32 | 263168 | 78.571 | 3336.40 | 3.002 | 341.16 | 81.572 | 3226.19 |
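
The `main:` line above lists the parameters the batched benchmark was run with. The exact command is not recorded; an invocation along these lines reproduces the same configuration (the model path is a placeholder and the flag spellings are assumptions), and the same pattern applies to the other models below:

```bash
# Assumed invocation, reconstructed from the reported parameters.
# -npp = prompt sizes (PP column), -ntg = generated tokens per sequence (TG),
# -npl = number of parallel sequences (B). The model path is hypothetical.
./build/bin/llama-batched-bench \
    -m models/gpt-oss-20b-mxfp4.gguf \
    -c 270336 -b 2048 -ub 2048 -fa 1 \
    -npp 512,4096,8192 -ntg 32 -npl 1,2,4,8,16,32
```
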
- `llama-bench`

| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3782.76 ± 11.95 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.10 ± 0.52 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3030.29 ± 8.25 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 77.36 ± 0.13 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2852.09 ± 11.44 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.00 ± 0.64 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2470.31 ± 13.96 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 68.85 ± 0.11 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1899.37 ± 11.28 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 59.85 ± 0.12 |

build: 2f68ce7cf (6918)
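
Each `llama-bench` row maps directly to command-line arguments: `pp2048`/`tg32` come from the prompt and generation sizes, and the `@ dN` variants from the KV-cache depth sweep. A command along these lines matches the columns of the tables in this file; it is a reconstruction, not the recorded command, and the model path is a placeholder:

```bash
# Assumed invocation matching the table columns:
# ngl -> -ngl, n_ubatch -> -ub, fa -> -fa, mmap -> -mmp,
# pp2048/tg32 -> -p 2048 -n 32, "@ dN" -> -d N. The model path is hypothetical.
./build/bin/llama-bench \
    -m models/gpt-oss-20b-mxfp4.gguf \
    -ngl 99 -ub 2048 -fa 1 -mmp 0 \
    -p 2048 -n 32 -d 0,4096,8192,16384,32768
```
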
## ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

- `llama-batched-bench`

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.605 | 846.15 | 0.559 | 57.28 | 1.164 | 467.46 |
| 512 | 32 | 2 | 1088 | 0.585 | 1749.57 | 1.061 | 60.35 | 1.646 | 661.06 |
| 512 | 32 | 4 | 2176 | 1.026 | 1995.16 | 1.346 | 95.08 | 2.373 | 917.12 |
| 512 | 32 | 8 | 4352 | 2.051 | 1997.01 | 1.746 | 146.65 | 3.797 | 1146.26 |
| 512 | 32 | 16 | 8704 | 4.122 | 1987.35 | 2.327 | 220.02 | 6.449 | 1349.63 |
| 512 | 32 | 32 | 17408 | 8.262 | 1983.16 | 3.276 | 312.56 | 11.538 | 1508.79 |
| 4096 | 32 | 1 | 4128 | 2.245 | 1824.57 | 0.591 | 54.18 | 2.835 | 1455.83 |
| 4096 | 32 | 2 | 8256 | 4.484 | 1827.03 | 1.122 | 57.02 | 5.606 | 1472.68 |
| 4096 | 32 | 4 | 16512 | 8.967 | 1827.22 | 1.474 | 86.83 | 10.441 | 1581.50 |
| 4096 | 32 | 8 | 33024 | 17.902 | 1830.43 | 1.997 | 128.22 | 19.898 | 1659.64 |
| 4096 | 32 | 16 | 66048 | 35.801 | 1830.56 | 2.901 | 176.49 | 38.702 | 1706.58 |
| 4096 | 32 | 32 | 132096 | 71.607 | 1830.43 | 4.433 | 230.99 | 76.040 | 1737.18 |
| 8192 | 32 | 1 | 8224 | 4.688 | 1747.28 | 0.627 | 51.01 | 5.316 | 1547.11 |
| 8192 | 32 | 2 | 16448 | 9.342 | 1753.75 | 1.182 | 54.13 | 10.525 | 1562.81 |
| 8192 | 32 | 4 | 32896 | 18.697 | 1752.56 | 1.605 | 79.74 | 20.302 | 1620.30 |
| 8192 | 32 | 8 | 65792 | 37.442 | 1750.33 | 2.228 | 114.91 | 39.670 | 1658.48 |
| 8192 | 32 | 16 | 131584 | 74.863 | 1750.82 | 3.419 | 149.74 | 78.283 | 1680.89 |
| 8192 | 32 | 32 | 263168 | 149.598 | 1752.33 | 5.209 | 196.60 | 154.806 | 1699.98 |

- `llama-bench`

| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1937.54 ± 7.91 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 59.02 ± 0.17 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1653.89 ± 4.68 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.95 ± 0.52 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1580.60 ± 3.10 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 51.85 ± 0.12 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1411.42 ± 3.75 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 48.03 ± 0.11 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1155.40 ± 2.98 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 41.35 ± 0.09 |

build: 2f68ce7cf (6918)

## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

- `llama-batched-bench`

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.401 | 1275.89 | 0.555 | 57.63 | 0.957 | 568.70 |
| 512 | 32 | 2 | 1088 | 0.355 | 2884.04 | 0.998 | 64.11 | 1.353 | 803.94 |
| 512 | 32 | 4 | 2176 | 0.614 | 3333.11 | 1.255 | 102.01 | 1.869 | 1164.11 |
| 512 | 32 | 8 | 4352 | 1.215 | 3371.40 | 1.653 | 154.91 | 2.868 | 1517.67 |
| 512 | 32 | 16 | 8704 | 2.424 | 3379.01 | 2.198 | 232.98 | 4.622 | 1883.15 |
| 512 | 32 | 32 | 17408 | 4.845 | 3381.63 | 2.968 | 345.01 | 7.813 | 2228.08 |
| 4096 | 32 | 1 | 4128 | 1.319 | 3104.74 | 0.657 | 48.67 | 1.977 | 2088.30 |
| 4096 | 32 | 2 | 8256 | 2.636 | 3107.30 | 1.133 | 56.48 | 3.770 | 2190.15 |
| 4096 | 32 | 4 | 16512 | 5.290 | 3097.27 | 1.494 | 85.69 | 6.784 | 2434.13 |
| 4096 | 32 | 8 | 33024 | 10.554 | 3104.75 | 2.136 | 119.87 | 12.690 | 2602.40 |
| 4096 | 32 | 16 | 66048 | 21.162 | 3096.80 | 3.150 | 162.55 | 24.312 | 2716.66 |
| 4096 | 32 | 32 | 132096 | 42.380 | 3092.81 | 4.928 | 207.80 | 47.307 | 2792.29 |
| 8192 | 32 | 1 | 8224 | 2.838 | 2886.60 | 0.752 | 42.56 | 3.590 | 2290.94 |
| 8192 | 32 | 2 | 16448 | 5.678 | 2885.69 | 1.267 | 50.51 | 6.945 | 2368.42 |
| 8192 | 32 | 4 | 32896 | 11.342 | 2889.18 | 1.763 | 72.59 | 13.105 | 2510.17 |
| 8192 | 32 | 8 | 65792 | 22.650 | 2893.42 | 2.656 | 96.39 | 25.306 | 2599.86 |
| 8192 | 32 | 16 | 131584 | 45.366 | 2889.21 | 4.209 | 121.64 | 49.575 | 2654.23 |
| 8192 | 32 | 32 | 263168 | 90.690 | 2890.56 | 7.158 | 143.06 | 97.847 | 2689.58 |

- `llama-bench`

| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3175.62 ± 15.26 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.92 ± 0.17 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2704.06 ± 6.21 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.52 ± 0.35 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2381.88 ± 8.79 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 48.58 ± 0.11 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1900.91 ± 8.51 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 40.81 ± 0.06 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1292.19 ± 7.09 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 30.80 ± 0.02 |

build: 2f68ce7cf (6918)

## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

- `llama-batched-bench`

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.215 | 2379.83 | 1.095 | 29.24 | 1.310 | 415.37 |
| 512 | 32 | 2 | 1088 | 0.424 | 2416.31 | 1.175 | 54.46 | 1.599 | 680.46 |
| 512 | 32 | 4 | 2176 | 0.880 | 2328.41 | 1.227 | 104.33 | 2.106 | 1033.02 |
| 512 | 32 | 8 | 4352 | 1.738 | 2356.86 | 1.308 | 195.67 | 3.046 | 1428.66 |
| 512 | 32 | 16 | 8704 | 3.472 | 2359.65 | 1.528 | 335.01 | 5.000 | 1740.80 |
| 512 | 32 | 32 | 17408 | 6.956 | 2355.21 | 1.726 | 593.27 | 8.683 | 2004.95 |
| 4096 | 32 | 1 | 4128 | 1.813 | 2259.55 | 1.126 | 28.42 | 2.939 | 1404.78 |
| 4096 | 32 | 2 | 8256 | 3.601 | 2274.88 | 1.246 | 51.35 | 4.848 | 1703.13 |
| 4096 | 32 | 4 | 16512 | 7.203 | 2274.48 | 1.364 | 93.86 | 8.567 | 1927.37 |
| 4096 | 32 | 8 | 33024 | 14.403 | 2275.04 | 1.570 | 163.10 | 15.973 | 2067.50 |
| 4096 | 32 | 16 | 66048 | 28.797 | 2275.78 | 2.025 | 252.80 | 30.822 | 2142.85 |
| 4096 | 32 | 32 | 132096 | 57.630 | 2274.38 | 2.716 | 377.05 | 60.346 | 2188.99 |
| 8192 | 32 | 1 | 8224 | 3.746 | 2186.60 | 1.164 | 27.50 | 4.910 | 1674.94 |
| 8192 | 32 | 2 | 16448 | 7.473 | 2192.42 | 1.323 | 48.39 | 8.796 | 1870.03 |
| 8192 | 32 | 4 | 32896 | 14.905 | 2198.45 | 1.511 | 84.71 | 16.416 | 2003.89 |
| 8192 | 32 | 8 | 65792 | 29.837 | 2196.49 | 1.843 | 138.88 | 31.680 | 2076.77 |
| 8192 | 32 | 16 | 131584 | 59.700 | 2195.53 | 2.609 | 196.21 | 62.309 | 2111.80 |
| 8192 | 32 | 32 | 263168 | 119.441 | 2194.75 | 3.818 | 268.20 | 123.259 | 2135.08 |

- `llama-bench`

| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2276.28 ± 4.92 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 29.30 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2090.96 ± 6.11 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 28.27 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1913.47 ± 5.86 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 27.42 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1643.88 ± 8.45 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 25.93 ± 0.01 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1283.36 ± 1.00 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 22.70 ± 0.02 |

build: 2f68ce7cf (6918)

## ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

- `llama-batched-bench`

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.098 | 5236.19 | 0.408 | 78.38 | 0.506 | 1074.95 |
| 512 | 32 | 2 | 1088 | 0.171 | 5984.52 | 0.507 | 126.21 | 0.678 | 1604.23 |
| 512 | 32 | 4 | 2176 | 0.342 | 5981.45 | 0.554 | 231.11 | 0.896 | 2427.90 |
| 512 | 32 | 8 | 4352 | 0.674 | 6077.05 | 0.692 | 369.93 | 1.366 | 3185.88 |
| 512 | 32 | 16 | 8704 | 1.340 | 6115.09 | 0.930 | 550.47 | 2.270 | 3834.78 |
| 512 | 32 | 32 | 17408 | 2.676 | 6123.63 | 1.277 | 801.81 | 3.953 | 4404.13 |
| 4096 | 32 | 1 | 4128 | 0.711 | 5756.92 | 0.454 | 70.44 | 1.166 | 3541.03 |
| 4096 | 32 | 2 | 8256 | 1.404 | 5835.14 | 0.571 | 112.12 | 1.975 | 4180.78 |
| 4096 | 32 | 4 | 16512 | 2.786 | 5880.66 | 0.668 | 191.68 | 3.454 | 4780.71 |
| 4096 | 32 | 8 | 33024 | 5.547 | 5907.20 | 0.891 | 287.46 | 6.438 | 5129.79 |
| 4096 | 32 | 16 | 66048 | 11.097 | 5905.68 | 1.315 | 389.48 | 12.412 | 5321.43 |
| 4096 | 32 | 32 | 132096 | 22.178 | 5910.07 | 2.055 | 498.18 | 24.233 | 5451.03 |
| 8192 | 32 | 1 | 8224 | 1.424 | 5752.81 | 0.467 | 68.51 | 1.891 | 4348.89 |
| 8192 | 32 | 2 | 16448 | 2.819 | 5812.28 | 0.630 | 101.53 | 3.449 | 4768.59 |
| 8192 | 32 | 4 | 32896 | 5.626 | 5824.22 | 0.791 | 161.74 | 6.418 | 5125.96 |
| 8192 | 32 | 8 | 65792 | 11.238 | 5831.66 | 1.163 | 220.10 | 12.401 | 5305.36 |
| 8192 | 32 | 16 | 131584 | 22.426 | 5844.66 | 1.857 | 275.70 | 24.283 | 5418.77 |
| 8192 | 32 | 32 | 263168 | 44.834 | 5847.04 | 3.146 | 325.48 | 47.980 | 5484.98 |

- `llama-bench`

| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5713.05 ± 19.74 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 78.97 ± 0.15 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5165.17 ± 27.83 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 68.94 ± 0.99 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4885.85 ± 46.27 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 67.81 ± 0.84 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4417.88 ± 52.14 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 64.35 ± 0.66 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3667.74 ± 50.02 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 58.38 ± 0.05 |

build: 2f68ce7cf (6918)