From bf21397ae5ea7c73d3494db3b91505599909227d Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Wed, 17 Jul 2024 08:59:21 +0200
Subject: [PATCH 1/7] Improvements for Windows with Snapdragon X

---
 cmake/arm64-windows-llvm.cmake |  3 ++-
 docs/build.md                  |  9 ++++++++-
 ggml/src/ggml-aarch64.c        | 12 ++++++------
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
index 8023796800683..82ca42d2096f3 100644
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,8 @@ set( CMAKE_CXX_COMPILER  clang++ )
 set( CMAKE_C_COMPILER_TARGET   ${target} )
 set( CMAKE_CXX_COMPILER_TARGET ${target} )
 
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+# march for Snapdragon X should be 8.7-a, but this currently breaks Q_4_0_4_4 acceleration, 8.5 works
+set( arch_c_flags "-march=armv8.5-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
 set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
 
 set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
diff --git a/docs/build.md b/docs/build.md
index 916fcf22d7924..dfff7cb0574b3 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
 
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -45,6 +45,13 @@ In order to build llama.cpp you have four different options.
     - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For Windows:
+      - Install cmake e.g. via `winget install cmake`:
+      - As alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via Visual Studio 2022 Community Edition).
+      - For Windows on ARM you need MSVC installed and _additonally_:
+        - Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
+        - For using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` which defaults to MSVC).
+        - Note: Building for ARM can also just be done with MSVC (without installing clang or using the preset), but this e.g. does not support Q_4_0_4_4 acceleration, because the MSVC frontend cannot inline ARM assembly-code.
     - For debug builds, there are two cases:
 
       1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 26535b1c432ba..0c22e816a87a1 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(_MSC_VER)
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;

From 820665f3a10b3730d06ef0443843cb1e2fc1f3e4 Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Wed, 17 Jul 2024 09:56:54 +0200
Subject: [PATCH 2/7] Revert "Improvements for Windows with Snapdragon X"

This reverts commit bf21397ae5ea7c73d3494db3b91505599909227d.
---
 cmake/arm64-windows-llvm.cmake |  3 +--
 docs/build.md                  |  9 +--------
 ggml/src/ggml-aarch64.c        | 12 ++++++------
 3 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
index 82ca42d2096f3..8023796800683 100644
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -9,8 +9,7 @@ set( CMAKE_CXX_COMPILER  clang++ )
 set( CMAKE_C_COMPILER_TARGET   ${target} )
 set( CMAKE_CXX_COMPILER_TARGET ${target} )
 
-# march for Snapdragon X should be 8.7-a, but this currently breaks Q_4_0_4_4 acceleration, 8.5 works
-set( arch_c_flags "-march=armv8.5-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
 set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
 
 set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
diff --git a/docs/build.md b/docs/build.md
index dfff7cb0574b3..916fcf22d7924 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
 
-  - On Windows (x86/x64 only, arm64 requires cmake):
+  - On Windows:
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -45,13 +45,6 @@ In order to build llama.cpp you have four different options.
     - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For Windows:
-      - Install cmake e.g. via `winget install cmake`:
-      - As alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via Visual Studio 2022 Community Edition).
-      - For Windows on ARM you need MSVC installed and _additonally_:
-        - Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
-        - For using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` which defaults to MSVC).
-        - Note: Building for ARM can also just be done with MSVC (without installing clang or using the preset), but this e.g. does not support Q_4_0_4_4 acceleration, because the MSVC frontend cannot inline ARM assembly-code.
     - For debug builds, there are two cases:
 
       1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 0c22e816a87a1..26535b1c432ba 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
+#elif defined(__ARM_NEON) && defined(__aarch64__)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && ! defined(_MSC_VER)
+#if defined(__ARM_FEATURE_SVE)
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
+#elif defined(__ARM_NEON) && defined(__aarch64__)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;

From 1289e3516e22b5daac9f39c03eb3624344a1a8f5 Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Wed, 17 Jul 2024 11:54:53 +0200
Subject: [PATCH 3/7] Improvements for Windows with Snapdragon X

---
 docs/build.md           |  9 ++++++++-
 ggml/src/ggml-aarch64.c | 12 ++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 916fcf22d7924..34a8cde091b41 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
 
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -31,6 +31,13 @@ In order to build llama.cpp you have four different options.
     - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For Windows:
+      - Install cmake e.g. via `winget install cmake`:
+      - As alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via Visual Studio 2022 Community Edition).
+      - For Windows on ARM you need MSVC installed and _additonally_:
+        - Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
+        - For using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` which defaults to MSVC).
+        - Note: Building for ARM can also be done just with MSVC (without installing clang or using the preset), but this does not support e.g. the accelerated Q_4_0_4_4/Q_4_0_4_8 kernels (a 2-2.5x prompt-processing speed improvement on the CPU), because the MSVC frontend cannot inline ARM assembly-code.
     - For debug builds, run `make LLAMA_DEBUG=1`
 
 - Using `CMake`:
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 26535b1c432ba..af53dea172459 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;

From bfc0e0c9233671134900742d7d0a4780a549497d Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Thu, 18 Jul 2024 14:04:04 +0200
Subject: [PATCH 4/7] WOA build clarifications

---
 docs/build.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 34a8cde091b41..47b3579016692 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -31,13 +31,6 @@ In order to build llama.cpp you have four different options.
     - For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
     - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
     - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For Windows:
-      - Install cmake e.g. via `winget install cmake`:
-      - As alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via Visual Studio 2022 Community Edition).
-      - For Windows on ARM you need MSVC installed and _additonally_:
-        - Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
-        - For using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` which defaults to MSVC).
-        - Note: Building for ARM can also be done just with MSVC (without installing clang or using the preset), but this does not support e.g. the accelerated Q_4_0_4_4/Q_4_0_4_8 kernels (a 2-2.5x prompt-processing speed improvement on the CPU), because the MSVC frontend cannot inline ARM assembly-code.
     - For debug builds, run `make LLAMA_DEBUG=1`
 
 - Using `CMake`:
@@ -67,6 +60,15 @@ In order to build llama.cpp you have four different options.
       cmake -B build -G "Xcode"
       cmake --build build --config Debug
       ```
+    - Building for Windows on ARM:
+      - Install MSVC / Visual Studio 2022 (VS2022), e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). The latest VS2022 automatically installs the required CMake and Ninja tools.
+      - Also install the clang compiler + tools (LLVM). Either via direct download [clang/LLVM for Windows](https://github.com/llvm/llvm-project/releases/) (use the file "LLVM-\<version\>-woa64.exe"), or alternatively install via the [Chocolatey (choco) package manager](https://chocolatey.org/install) with `choco install llvm`. This LLVM for Windows requires an installed MSVC, and therefore always use it in a Develoer Command Prompt / PowerShell.
+      - Then build _in a Developer Command Prompt / PowerShell for VS2022_ with:
+        ```bash
+        cmake --preset arm64-windows-llvm-release
+        cmake --build build-arm64-windows-llvm-release
+        ```
+      - Note: Building for ARM can also be done just with MSVC (without installing clang and the preset above), but this does not support e.g. the accelerated Q_4_0_4_4/Q_4_0_4_8 kernels (a 2-2.5x prompt-processing speed improvement on the CPU), because the MSVC frontend cannot inline ARM assembly-code.
 
 -   Using `gmake` (FreeBSD):
 

From 028eebf1806377bd49046648b833f28c16b0eb6c Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Thu, 18 Jul 2024 14:11:58 +0200
Subject: [PATCH 5/7] Windows on ARM build clarifications

---
 docs/build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/build.md b/docs/build.md
index 47b3579016692..c9de085e2e6b4 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -61,7 +61,7 @@ In order to build llama.cpp you have four different options.
       cmake --build build --config Debug
       ```
     - Building for Windows on ARM:
-      - Install MSVC / Visual Studio 2022 (VS2022), e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). The latest VS2022 automatically installs the required CMake and Ninja tools.
+      - Install MSVC / Visual Studio 2022 (VS2022), e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer select the workload "Desktop-development with C++". This also automatically installs the required toola CMake and Ninja.
       - Also install the clang compiler + tools (LLVM). Either via direct download [clang/LLVM for Windows](https://github.com/llvm/llvm-project/releases/) (use the file "LLVM-\<version\>-woa64.exe"), or alternatively install via the [Chocolatey (choco) package manager](https://chocolatey.org/install) with `choco install llvm`. This LLVM for Windows requires an installed MSVC, and therefore always use it in a Develoer Command Prompt / PowerShell.
       - Then build _in a Developer Command Prompt / PowerShell for VS2022_ with:
         ```bash

From 6f2366210d8d5a5a148c7e00541ee0a9c8e7cc5f Mon Sep 17 00:00:00 2001
From: AndreasKunar <andreaskmsn.com>
Date: Thu, 18 Jul 2024 18:45:45 +0200
Subject: [PATCH 6/7] cmake build for Windows clarifications

---
 docs/build.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index c9de085e2e6b4..83ba30538b767 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -60,15 +60,17 @@ In order to build llama.cpp you have four different options.
       cmake -B build -G "Xcode"
       cmake --build build --config Debug
       ```
-    - Building for Windows on ARM:
-      - Install MSVC / Visual Studio 2022 (VS2022), e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer select the workload "Desktop-development with C++". This also automatically installs the required toola CMake and Ninja.
-      - Also install the clang compiler + tools (LLVM). Either via direct download [clang/LLVM for Windows](https://github.com/llvm/llvm-project/releases/) (use the file "LLVM-\<version\>-woa64.exe"), or alternatively install via the [Chocolatey (choco) package manager](https://chocolatey.org/install) with `choco install llvm`. This LLVM for Windows requires an installed MSVC, and therefore always use it in a Develoer Command Prompt / PowerShell.
-      - Then build _in a Developer Command Prompt / PowerShell for VS2022_ with:
+    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools such as CMake):
+        - Tab Workload: Desktop-development with C++
+        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+      - For Wndows on ARM (arm64, WoA) build with:
         ```bash
-        cmake --preset arm64-windows-llvm-release
+        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
         cmake --build build-arm64-windows-llvm-release
         ```
-      - Note: Building for ARM can also be done just with MSVC (without installing clang and the preset above), but this does not support e.g. the accelerated Q_4_0_4_4/Q_4_0_4_8 kernels (a 2-2.5x prompt-processing speed improvement on the CPU), because the MSVC frontend cannot inline ARM assembly-code.
+        Note: Building for arm64 can also be done with MSVC alone (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). However, MSVC does not support inline ARM assembly code, which is used e.g. by the accelerated Q4_0_4_8 CPU kernels.
 
 -   Using `gmake` (FreeBSD):
 

From 14dbd9273fad4e593a06d74efa916537315bd385 Mon Sep 17 00:00:00 2001
From: "Andreas (Andi) Kunar" <andreask@msn.com>
Date: Tue, 23 Jul 2024 10:43:09 +0200
Subject: [PATCH 7/7] Update docs/build.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 docs/build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/build.md b/docs/build.md
index 83ba30538b767..d9d12c46707bd 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -65,7 +65,7 @@ In order to build llama.cpp you have four different options.
         - Tab Workload: Desktop-development with C++
         - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
       - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
-      - For Wndows on ARM (arm64, WoA) build with:
+      - For Windows on ARM (arm64, WoA) build with:
         ```bash
         cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
         cmake --build build-arm64-windows-llvm-release