Add support for full CUDA GPU offloading (#105)
Signed-off-by: mudler <mudler@mocaccino.org>
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
mudler and dependabot[bot] committed Jun 16, 2023
1 parent a796025 commit 35a3c99
Showing 8 changed files with 155 additions and 24 deletions.
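For orientation, here is a minimal usage sketch of the new GPU-offloading options from the Go side. The import path and the EnabelLowVRAM option come from this repository, but SetContext, SetGPULayers, SetTokens, Predict and Free are assumed from the surrounding codebase rather than this diff, and the model path is a placeholder; treat it as an illustration, not the commit's own example.

package main

import (
	"fmt"
	"log"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Offload 32 layers to the GPU; EnabelLowVRAM (added in this commit, note
	// the spelling in options.go) trades some speed for a smaller VRAM footprint.
	model, err := llama.New(
		"/path/to/ggml-model-q4_0.bin", // placeholder model path
		llama.SetContext(512),
		llama.SetGPULayers(32), // assumed option name, not part of this diff
		llama.EnabelLowVRAM,
	)
	if err != nil {
		log.Fatal(err)
	}
	defer model.Free()

	out, err := model.Predict("Hello", llama.SetTokens(32))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(out)
}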
14 changes: 2 additions & 12 deletions .github/workflows/test.yaml
@@ -27,14 +27,9 @@ jobs:
with:
submodules: true

- name: Dependencies
run: |
brew update
brew install sdl2
- name: Test
run: |
make test
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
macOS-metal-latest:
runs-on: macOS-latest
@@ -45,12 +40,7 @@ jobs:
with:
submodules: true

- name: Dependencies
run: |
brew update
brew install sdl2
- name: Test
run: |
make BUILD_TYPE=metal test
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make BUILD_TYPE=metal test
CGO_LDFLAGS="-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go build -o testbuild ./examples
120 changes: 120 additions & 0 deletions 1902.patch
@@ -0,0 +1,120 @@
From 200892a3a54323eb65ca9c8d8afb6043ca2d8944 Mon Sep 17 00:00:00 2001
From: mudler <mudler@mocaccino.org>
Date: Fri, 16 Jun 2023 23:43:36 +0200
Subject: [PATCH] Pass pointer to params in llama_init_from_file

Especially with golang bindings, calling by value has the side-effect of
values not being copied correctly. This has been observed with the
bindings in https://github.com/go-skynet/go-llama.cpp/pull/105.
---
examples/common.cpp | 2 +-
examples/quantize-stats/quantize-stats.cpp | 2 +-
examples/save-load-state/save-load-state.cpp | 4 ++--
examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +-
llama.cpp | 3 ++-
llama.h | 2 +-
tests/test-tokenizer-0.cpp | 2 +-
7 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 055383beff9..7cf48e82158 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -555,7 +555,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;

- llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+ llama_context * lctx = llama_init_from_file(params.model.c_str(), &lparams);

if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee284..a7c1e873a92 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
lparams.f16_kv = false;
lparams.use_mlock = false;

- ctx = llama_init_from_file(params.model.c_str(), lparams);
+ ctx = llama_init_from_file(params.model.c_str(), &lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index da4d37ad03d..07ee6750d4c 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

// init
- auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+ auto ctx = llama_init_from_file(params.model.c_str(), &lparams);
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);

@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);

// load new model
- auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+ auto ctx2 = llama_init_from_file(params.model.c_str(), &lparams);

// Load state (rng, logits, embedding and kv_cache) from file
{
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7ec85951adc..1c7a06c21be 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,7 @@ int main(int argc, char ** argv) {
struct llama_context_params llama_params = llama_context_default_params();
llama_params.vocab_only = true;

- struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+ struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, &llama_params);

struct llama_vocab vocab;
{
diff --git a/llama.cpp b/llama.cpp
index 81f047ed298..0629e873886 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2618,8 +2618,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

struct llama_context * llama_init_from_file(
const char * path_model,
- struct llama_context_params params) {
+ const struct llama_context_params * params_ptr) {
ggml_time_init();
+ struct llama_context_params params = *params_ptr;

llama_context * ctx = new llama_context;

diff --git a/llama.h b/llama.h
index 1241ba6c0ec..faf2675f125 100644
--- a/llama.h
+++ b/llama.h
@@ -142,7 +142,7 @@ extern "C" {
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
- struct llama_context_params params);
+ const struct llama_context_params * params);

// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index ab1538a0cf3..b405df8e687 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {

lparams.vocab_only = true;

- ctx = llama_init_from_file(fname.c_str(), lparams);
+ ctx = llama_init_from_file(fname.c_str(), &lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
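The rationale in the patch above — parameter structs not surviving a by-value call when llama.cpp is driven from the Go bindings — is the usual argument for passing a pointer across an FFI boundary: one well-defined object stays put and only its address crosses. A generic cgo sketch of that pattern, using a hypothetical struct and helper that merely mirror the pointer-taking shape of the patched llama_init_from_file:

package main

/*
// Hypothetical stand-ins for llama_context_params and llama_init_from_file;
// they only mirror the pointer-taking signature introduced by the patch.
typedef struct {
	int n_ctx;
	int n_gpu_layers;
} demo_params;

static int demo_init(const demo_params * p) {
	// The callee dereferences a single object instead of relying on a
	// by-value copy whose layout both sides must agree on exactly.
	return p->n_ctx + p->n_gpu_layers;
}
*/
import "C"

import "fmt"

func main() {
	p := C.demo_params{n_ctx: 512, n_gpu_layers: 32}
	// Pass a pointer, as the patched llama_init_from_file now expects.
	fmt.Println(int(C.demo_init(&p)))
}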
14 changes: 10 additions & 4 deletions Makefile
@@ -176,7 +176,7 @@ $(info )
# Use this if you want to set the default behavior


llama.cpp/ggml.o:
llama.cpp/ggml.o: prepare
mkdir -p build
cd build && cmake ../llama.cpp $(CMAKE_ARGS) && VERBOSE=1 cmake --build . --config Release && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o

@@ -193,16 +193,22 @@ llama.cpp/k_quants.o: llama.cpp/ggml.o
cd build && cp -rf CMakeFiles/ggml.dir/k_quants.c.o ../llama.cpp/k_quants.o

llama.cpp/llama.o:
cd build && make llama.o && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o
cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o

llama.cpp/common.o:
cd build && make common && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

## https://github.com/ggerganov/llama.cpp/pull/1902
prepare:
cd llama.cpp && patch -p1 < ../1902.patch
touch $@

libbinding.a: binding.o llama.cpp/k_quants.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o llama.cpp/k_quants.o $(EXTRA_TARGETS) llama.cpp/common.o llama.cpp/llama.o binding.o

clean:
rm -rf *.o
rm -rf *.a
12 changes: 8 additions & 4 deletions binding.cpp
@@ -1,5 +1,6 @@
#include "common.h"
#include "llama.h"

#include "binding.h"

#include <cassert>
@@ -125,7 +126,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

std::mt19937 rng(params.seed);

llama_init_backend();


std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -590,7 +591,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
}


void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
// load the model
auto lparams = llama_context_default_params();

@@ -601,6 +602,8 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
lparams.use_mlock = mlock;
lparams.n_gpu_layers = n_gpu_layers;
lparams.use_mmap = mmap;
lparams.low_vram = low_vram;
lparams.vocab_only = vocab_only;

if (maingpu[0] != '\0') {
lparams.main_gpu = std::stoi(maingpu);
@@ -625,13 +628,14 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool

lparams.n_batch = n_batch;

llama_init_backend();
void* res = nullptr;
try {
res = llama_init_from_file(fname, lparams);
res = llama_init_from_file(fname, &lparams);
} catch(std::runtime_error& e) {
fprintf(stderr, "failed %s",e.what());
return res;
}

return res;
}
}
2 changes: 1 addition & 1 deletion binding.h
@@ -14,7 +14,7 @@ int eval(void* params_ptr, void *ctx, char*text);

void save_state(void *ctx, char *dst, char*modes);

void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);
void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.go
@@ -23,7 +23,7 @@ type LLama struct {
func New(model string, opts ...ModelOption) (*LLama, error) {
mo := NewModelOptions(opts...)
modelPath := C.CString(model)
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
if result == nil {
return nil, fmt.Errorf("failed loading model")
}
13 changes: 12 additions & 1 deletion options.go
@@ -7,6 +7,8 @@ type ModelOptions struct {
F16Memory bool
MLock bool
MMap bool
VocabOnly bool
LowVRAM bool
Embeddings bool
NGPULayers int
MainGPU string
@@ -50,6 +52,7 @@ var DefaultModelOptions ModelOptions = ModelOptions{
MLock: false,
Embeddings: false,
MMap: true,
LowVRAM: false,
}

var DefaultOptions PredictOptions = PredictOptions{
@@ -58,7 +61,7 @@ var DefaultOptions PredictOptions = PredictOptions{
Tokens: 128,
Penalty: 1.1,
Repeat: 64,
Batch: 8,
Batch: 512,
NKeep: 64,
TopK: 40,
TopP: 0.95,
@@ -128,6 +131,14 @@ func SetPredictionMainGPU(maingpu string) PredictOption {
}
}

var VocabOnly ModelOption = func(p *ModelOptions) {
p.VocabOnly = true
}

var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
p.LowVRAM = true
}

var EnableEmbeddings ModelOption = func(p *ModelOptions) {
p.Embeddings = true
}
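The new options follow the package's functional-options pattern, so callers can also compose their own tweaks without a dedicated setter. A small sketch, using only exported names visible in this diff (the model path is a placeholder):

package main

import llama "github.com/go-skynet/go-llama.cpp"

// customGPU bundles the GPU-related fields shown above into a caller-defined
// option, in the same style as VocabOnly and EnabelLowVRAM.
var customGPU llama.ModelOption = func(p *llama.ModelOptions) {
	p.NGPULayers = 20
	p.LowVRAM = true
	p.MainGPU = "0"
}

func main() {
	_, _ = llama.New("/path/to/ggml-model-q4_0.bin", customGPU)
}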