From 8354b7d2adfb90a556da459760f7cb1781710784 Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Sat, 17 Feb 2024 11:32:04 -0800
Subject: [PATCH 1/7] server: initial working LLaVA 1.6 support

---
 Makefile                   |  2 +-
 examples/server/server.cpp | 59 ++++++++++++++------------------------
 2 files changed, 22 insertions(+), 39 deletions(-)
diff --git a/Makefile b/Makefile
index 0a2070b539df8..083a639c52ac2 100644
--- a/Makefile
+++ b/Makefile
@@ -691,7 +691,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970b83a9..d002db9b985fb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -5,6 +5,7 @@
 #include "oai.hpp"
 
 #include "../llava/clip.h"
+#include "../llava/llava.h"
 
 #include "stb_image.h"
 
@@ -31,6 +32,14 @@
 
 using json = nlohmann::json;
 
+// TODO should be in clip.h?
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
 struct server_params
 {
     std::string hostname = "127.0.0.1";
@@ -702,11 +711,12 @@ struct llama_server_context
                     slot_image img_sl;
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
                     img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
-                    {
-                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
-                        return false;
-                    }
+                    img_sl.img_data->buf = image_buffer;
+                    // if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    // {
+                    //     LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                    //     return false;
+                    // }
                     LOG_TEE("slot %i - loaded image\n", slot->id);
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
@@ -983,43 +993,16 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32_batch img_res_v;
-            img_res_v.size = 0;
-            img_res_v.data = nullptr;
-            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
-            {
-                LOG_TEE("Error processing the given image");
-                clip_free(clp_ctx);
-                clip_image_f32_batch_free(img_res_v);
-                return false;
-            }
-            if (img_res_v.size == 0)
-            {
-                LOG_TEE("Error processing the given image");
-                return false;
-            }
-
-            // note: assumes only one image was returned by clip_image_preprocess
-            clip_image_f32 * img_res = img_res_v.data;
 
-            img.image_tokens = clip_n_patches(clp_ctx);
-            img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-            if (!img.image_embedding)
-            {
-                LOG_TEE("Unable to allocate memory for image embeddings\n");
-                clip_image_f32_batch_free(img_res_v);
-                clip_free(clp_ctx);
-                return false;
-            }
-            LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-            if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
-            {
-                LOG_TEE("Unable to encode image\n");
-                clip_image_f32_batch_free(img_res_v);
+            // TODO call encode_image_with_clip instead?
+            llava_image_embed * embed = llava_image_embed_make_with_bytes(clp_ctx, params.n_threads, img.img_data->buf.data(), img.img_data->buf.size());
+            if (!embed) {
+                LOG_TEE("Error processing the given image");
                 return false;
             }
 
-            clip_image_f32_batch_free(img_res_v);
+            img.image_embedding = embed->embed;
+            img.image_tokens = embed->n_image_pos;
 
             img.request_encode_image = false;
         }

From 6d9fea6aae11a51a488edf9e38e338cdbf70cb4c Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Sat, 17 Feb 2024 12:40:34 -0800
Subject: [PATCH 2/7] move clip_image to header

---
 examples/llava/clip.cpp    | 17 -----------------
 examples/llava/clip.h      | 18 ++++++++++++++++++
 examples/llava/llava.cpp   | 17 -----------------
 examples/server/server.cpp |  8 --------
 4 files changed, 18 insertions(+), 42 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 98d512f67a0e2..0a26cf62ce643 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -34,23 +34,6 @@
 
 //#define CLIP_DEBUG_FUNCTIONS
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index e5bd54924a9c8..33f83490ca4bc 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -3,6 +3,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <vector>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -26,6 +27,23 @@ extern "C" {
 
 struct clip_ctx;
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4cb65a07b6740..9cb962936d3c6 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -9,23 +9,6 @@
 #include <vector>
 #include <numeric>
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 struct clip_image_grid_shape {
     int first;
     int second;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d002db9b985fb..4afcdb86f73ba 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -32,14 +32,6 @@
 
 using json = nlohmann::json;
 
-// TODO should be in clip.h?
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
 struct server_params
 {
     std::string hostname = "127.0.0.1";

From 2f2973de78276ce7ca026568aba7204dc653e85d Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Sat, 17 Feb 2024 12:42:24 -0800
Subject: [PATCH 3/7] remove commented code

---
 examples/server/server.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 4afcdb86f73ba..0645386dae3a9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -704,11 +704,6 @@ struct llama_server_context
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
                     img_sl.img_data = clip_image_u8_init();
                     img_sl.img_data->buf = image_buffer;
-                    // if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
-                    // {
-                    //     LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
-                    //     return false;
-                    // }
                     LOG_TEE("slot %i - loaded image\n", slot->id);
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);

From e801037de60e70172c4f69823f0fa284effadf8f Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Sun, 18 Feb 2024 13:25:49 -0800
Subject: [PATCH 4/7] move C++ struct definitions back out of clip.h

---
 examples/llava/clip.cpp    | 17 +++++++++++++++++
 examples/llava/clip.h      | 18 ------------------
 examples/llava/llava.cpp   | 17 +++++++++++++++++
 examples/server/server.cpp | 17 +++++++++++++++++
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 0a26cf62ce643..98d512f67a0e2 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -34,6 +34,23 @@
 
 //#define CLIP_DEBUG_FUNCTIONS
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 33f83490ca4bc..e5bd54924a9c8 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -3,7 +3,6 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <vector>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -27,23 +26,6 @@ extern "C" {
 
 struct clip_ctx;
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 9cb962936d3c6..4cb65a07b6740 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -9,6 +9,23 @@
 #include <vector>
 #include <numeric>
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 struct clip_image_grid_shape {
     int first;
     int second;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0645386dae3a9..f5667fc7ab017 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -43,6 +43,23 @@ struct server_params
     int32_t write_timeout = 600;
 };
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 bool server_verbose = false;
 
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)

From fb16cc9e9f6107dc47fba553a76433840a902995 Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Sun, 18 Feb 2024 13:30:08 -0800
Subject: [PATCH 5/7] remove todo

---
 examples/server/server.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f5667fc7ab017..8bbb28f85b906 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -998,7 +998,6 @@ struct llama_server_context
                 continue;
             }
 
-            // TODO call encode_image_with_clip instead?
             llava_image_embed * embed = llava_image_embed_make_with_bytes(clp_ctx, params.n_threads, img.img_data->buf.data(), img.img_data->buf.size());
             if (!embed) {
                 LOG_TEE("Error processing the given image");

From 0ffba498a191712271bcc14fb2d2d614511fb9f9 Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Mon, 19 Feb 2024 07:30:36 -0800
Subject: [PATCH 6/7] expose llava_image_embed_make_with_clip_img

---
 examples/llava/llava.cpp   |  2 +-
 examples/llava/llava.h     |  2 ++
 examples/server/server.cpp | 28 ++++++----------------------
 3 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4cb65a07b6740..1a1cf7c78bf34 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     return true;
 }
 
-static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
         fprintf(stderr, "Unable to allocate memory for image embeddings\n");
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index 9e9466a5d1726..2d40f3f1d5f84 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -31,6 +31,8 @@ struct llava_image_embed {
 /** sanity check for clip <-> llava embed size match */
 LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
 
+LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8bbb28f85b906..c4cd325f5231f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -43,23 +43,6 @@ struct server_params
     int32_t write_timeout = 600;
 };
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 bool server_verbose = false;
 
 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
@@ -720,7 +703,11 @@ struct llama_server_context
                     slot_image img_sl;
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
                     img_sl.img_data = clip_image_u8_init();
-                    img_sl.img_data->buf = image_buffer;
+                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    {
+                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                        return false;
+                    }
                     LOG_TEE("slot %i - loaded image\n", slot->id);
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
@@ -998,14 +985,11 @@ struct llama_server_context
                 continue;
             }
 
-            llava_image_embed * embed = llava_image_embed_make_with_bytes(clp_ctx, params.n_threads, img.img_data->buf.data(), img.img_data->buf.size());
-            if (!embed) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                 LOG_TEE("Error processing the given image");
                 return false;
             }
 
-            img.image_embedding = embed->embed;
-            img.image_tokens = embed->n_image_pos;
 
             img.request_encode_image = false;
         }

From ad60bece9c4e3d420229c32276c69e5ca5fbf6c9 Mon Sep 17 00:00:00 2001
From: CJ Pais <cj@cjpais.com>
Date: Tue, 20 Feb 2024 07:50:43 -0800
Subject: [PATCH 7/7] fix zig build

---
 build.zig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/build.zig b/build.zig
index 699738f3dd509..c0af454dc9e92 100644
--- a/build.zig
+++ b/build.zig
@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }