From cd96be73a375bea8ec5d8dd48ff3b5299dae6f9e Mon Sep 17 00:00:00 2001
From: Sam Malayek <malayek@gmail.com>
Date: Sun, 12 Oct 2025 12:42:17 -0700
Subject: [PATCH 1/2] Add --embd-output-format raw for plain numeric embedding
 output

This new option prints each embedding on its own line as space-separated floats, without JSON or 'embedding N:' prefixes. This is useful for downstream vector pipelines and scripting.
---
 examples/embedding/embedding.cpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 388908bc4d70a..11b44857a9856 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -4,6 +4,7 @@
 #include "llama.h"
 
 #include <ctime>
+#include <cstdio>
 #include <algorithm>
 
 #if defined(_MSC_VER)
@@ -70,6 +71,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     }
 }
 
+// Print embeddings to stdout in a plain, pipe-friendly form: one row per line, values separated by single spaces, no prefixes.
+static void print_raw_embeddings(const float * emb,            // flat buffer, read with a row stride of n_embd floats
+                                 int n_embd_count,             // number of rows (output lines) to print
+                                 int n_embd,                   // row stride; also the column count unless rank pooling is used
+                                 const llama_model * model,    // only queried via llama_model_n_cls_out()
+                                 enum llama_pooling_type pooling_type,
+                                 int embd_normalize) {         // 0 selects the no-decimals format, any other value 7 decimals
+    const uint32_t n_cls_out = llama_model_n_cls_out(model);
+    const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
+    const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd; // rank pooling: print at most n_cls_out values per row
+
+    for (int j = 0; j < n_embd_count; ++j) {
+        for (int i = 0; i < cols; ++i) {
+            if (embd_normalize == 0) { // NOTE(review): presumably values are integer-scaled in this mode - confirm against the normalization code
+                printf("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : "")); // separator is omitted after the last column
+            } else {
+                printf("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            }
+        }
+        printf("\n"); // end of row j
+    }
+}
+
 int main(int argc, char ** argv) {
     common_params params;
 
@@ -259,6 +283,10 @@ int main(int argc, char ** argv) {
     float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
+    if (params.embd_out == "raw") {
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
+    }
+
     if (params.embd_out.empty()) {
         LOG("\n");
 

From c66712074ccf7a409ecbacf31f070035d9b46fc3 Mon Sep 17 00:00:00 2001
From: Sam Malayek <malayek@gmail.com>
Date: Mon, 13 Oct 2025 11:33:41 -0700
Subject: [PATCH 2/2] Move raw output handling into format handling section

---
 examples/embedding/embedding.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 11b44857a9856..8b25fcdb4fe7a 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -283,10 +283,6 @@ int main(int argc, char ** argv) {
     float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
-    if (params.embd_out == "raw") {
-        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
-    }
-
     if (params.embd_out.empty()) {
         LOG("\n");
 
@@ -402,6 +398,10 @@ int main(int argc, char ** argv) {
         if (notArray) LOG("\n}\n");
     }
 
+    if (params.embd_out == "raw") {
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
+    }
+
     LOG("\n");
     llama_perf_context_print(ctx);