Add support for full CUDA GPU offloading #105

Merged: 7 commits, Jun 16, 2023
Changes from all commits
14 changes: 2 additions & 12 deletions .github/workflows/test.yaml
@@ -27,14 +27,9 @@ jobs:
         with:
           submodules: true

-      - name: Dependencies
-        run: |
-          brew update
-          brew install sdl2
-
       - name: Test
         run: |
-          make test
+          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test

   macOS-metal-latest:
     runs-on: macOS-latest
@@ -45,12 +40,7 @@ jobs:
         with:
           submodules: true

-      - name: Dependencies
-        run: |
-          brew update
-          brew install sdl2
-
       - name: Test
         run: |
-          make BUILD_TYPE=metal test
+          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make BUILD_TYPE=metal test
           CGO_LDFLAGS="-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go build -o testbuild ./examples
120 changes: 120 additions & 0 deletions 1902.patch
@@ -0,0 +1,120 @@
From 200892a3a54323eb65ca9c8d8afb6043ca2d8944 Mon Sep 17 00:00:00 2001
From: mudler <mudler@mocaccino.org>
Date: Fri, 16 Jun 2023 23:43:36 +0200
Subject: [PATCH] Pass pointer to params in llama_init_from_file

Especially with golang bindings, calling by value has the side-effect of
values not being copied correctly. This has been observed with the
bindings in https://github.com/go-skynet/go-llama.cpp/pull/105.
---
examples/common.cpp | 2 +-
examples/quantize-stats/quantize-stats.cpp | 2 +-
examples/save-load-state/save-load-state.cpp | 4 ++--
examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +-
llama.cpp | 3 ++-
llama.h | 2 +-
tests/test-tokenizer-0.cpp | 2 +-
7 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 055383beff9..7cf48e82158 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -555,7 +555,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;

- llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+ llama_context * lctx = llama_init_from_file(params.model.c_str(), &lparams);

if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee284..a7c1e873a92 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
lparams.f16_kv = false;
lparams.use_mlock = false;

- ctx = llama_init_from_file(params.model.c_str(), lparams);
+ ctx = llama_init_from_file(params.model.c_str(), &lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index da4d37ad03d..07ee6750d4c 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

// init
- auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+ auto ctx = llama_init_from_file(params.model.c_str(), &lparams);
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);

@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);

// load new model
- auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+ auto ctx2 = llama_init_from_file(params.model.c_str(), &lparams);

// Load state (rng, logits, embedding and kv_cache) from file
{
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7ec85951adc..1c7a06c21be 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,7 @@ int main(int argc, char ** argv) {
struct llama_context_params llama_params = llama_context_default_params();
llama_params.vocab_only = true;

- struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+ struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, &llama_params);

struct llama_vocab vocab;
{
diff --git a/llama.cpp b/llama.cpp
index 81f047ed298..0629e873886 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2618,8 +2618,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

struct llama_context * llama_init_from_file(
const char * path_model,
- struct llama_context_params params) {
+ const struct llama_context_params * params_ptr) {
ggml_time_init();
+ struct llama_context_params params = *params_ptr;

llama_context * ctx = new llama_context;

diff --git a/llama.h b/llama.h
index 1241ba6c0ec..faf2675f125 100644
--- a/llama.h
+++ b/llama.h
@@ -142,7 +142,7 @@ extern "C" {
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
- struct llama_context_params params);
+ const struct llama_context_params * params);

// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index ab1538a0cf3..b405df8e687 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {

lparams.vocab_only = true;

- ctx = llama_init_from_file(fname.c_str(), lparams);
+ ctx = llama_init_from_file(fname.c_str(), &lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
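The commit message above explains why this patch makes llama_init_from_file take a pointer: when the Go bindings pass the llama_context_params struct across the cgo boundary by value, its fields are not copied reliably. A minimal, hypothetical cgo sketch of that pointer-passing pattern follows; demo_params and demo_init are invented for illustration and are not part of llama.cpp or this PR.

```go
package main

/*
typedef struct {
    int n_ctx;
    int n_gpu_layers;
} demo_params;

static int demo_init(const demo_params *p) {
    return p->n_ctx + p->n_gpu_layers;
}
*/
import "C"

import "fmt"

func main() {
	// Fill the C struct on the Go side...
	p := C.demo_params{n_ctx: 512, n_gpu_layers: 32}

	// ...and hand C a pointer to it, mirroring the
	// llama_init_from_file(path, &lparams) change in the patch above.
	fmt.Println(int(C.demo_init(&p)))
}
```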
14 changes: 10 additions & 4 deletions Makefile
@@ -176,7 +176,7 @@ $(info )
# Use this if you want to set the default behavior


-llama.cpp/ggml.o:
+llama.cpp/ggml.o: prepare
mkdir -p build
cd build && cmake ../llama.cpp $(CMAKE_ARGS) && VERBOSE=1 cmake --build . --config Release && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o

@@ -193,16 +193,22 @@ llama.cpp/k_quants.o: llama.cpp/ggml.o
cd build && cp -rf CMakeFiles/ggml.dir/k_quants.c.o ../llama.cpp/k_quants.o

llama.cpp/llama.o:
-cd build && make llama.o && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o
+cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o

llama.cpp/common.o:
-cd build && make common && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
+cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

-binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
+binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

+## https://github.com/ggerganov/llama.cpp/pull/1902
+prepare:
+cd llama.cpp && patch -p1 < ../1902.patch
+touch $@

libbinding.a: binding.o llama.cpp/k_quants.o $(EXTRA_TARGETS)
ar src libbinding.a llama.cpp/ggml.o llama.cpp/k_quants.o $(EXTRA_TARGETS) llama.cpp/common.o llama.cpp/llama.o binding.o

clean:
rm -rf *.o
rm -rf *.a
12 changes: 8 additions & 4 deletions binding.cpp
@@ -1,5 +1,6 @@
#include "common.h"
#include "llama.h"

#include "binding.h"

#include <cassert>
@@ -125,7 +126,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

std::mt19937 rng(params.seed);

-llama_init_backend();


std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -590,7 +591,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
}


-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
// load the model
auto lparams = llama_context_default_params();

@@ -601,6 +602,8 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
lparams.use_mlock = mlock;
lparams.n_gpu_layers = n_gpu_layers;
lparams.use_mmap = mmap;
+lparams.low_vram = low_vram;
+lparams.vocab_only = vocab_only;

if (maingpu[0] != '\0') {
lparams.main_gpu = std::stoi(maingpu);
@@ -625,13 +628,14 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool

lparams.n_batch = n_batch;

+llama_init_backend();
void* res = nullptr;
try {
-res = llama_init_from_file(fname, lparams);
+res = llama_init_from_file(fname, &lparams);
} catch(std::runtime_error& e) {
fprintf(stderr, "failed %s",e.what());
return res;
}

return res;
}
2 changes: 1 addition & 1 deletion binding.h
@@ -14,7 +14,7 @@ int eval(void* params_ptr, void *ctx, char*text);

void save_state(void *ctx, char *dst, char*modes);

-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.go
@@ -23,7 +23,7 @@ type LLama struct {
func New(model string, opts ...ModelOption) (*LLama, error) {
mo := NewModelOptions(opts...)
modelPath := C.CString(model)
-result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
+result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
if result == nil {
return nil, fmt.Errorf("failed loading model")
}
13 changes: 12 additions & 1 deletion options.go
@@ -7,6 +7,8 @@ type ModelOptions struct {
F16Memory bool
MLock bool
MMap bool
+VocabOnly bool
+LowVRAM bool
Embeddings bool
NGPULayers int
MainGPU string
@@ -50,6 +52,7 @@ var DefaultModelOptions ModelOptions = ModelOptions{
MLock: false,
Embeddings: false,
MMap: true,
+LowVRAM: false,
}

var DefaultOptions PredictOptions = PredictOptions{
@@ -58,7 +61,7 @@ var DefaultOptions PredictOptions = PredictOptions{
Tokens: 128,
Penalty: 1.1,
Repeat: 64,
-Batch: 8,
+Batch: 512,
NKeep: 64,
TopK: 40,
TopP: 0.95,
@@ -128,6 +131,14 @@ func SetPredictionMainGPU(maingpu string) PredictOption {
}
}

+var VocabOnly ModelOption = func(p *ModelOptions) {
+p.VocabOnly = true
+}
+
+var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
+p.LowVRAM = true
+}

var EnableEmbeddings ModelOption = func(p *ModelOptions) {
p.Embeddings = true
}
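For context, a small, hypothetical usage sketch of the model options added in this PR. The import path and model file name are assumptions, and NGPULayers is set with an inline ModelOption literal because the diff above only shows the struct field, not a dedicated setter.

```go
package main

import (
	"fmt"
	"os"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	l, err := llama.New(
		"./models/7B/ggml-model-q4_0.bin",
		llama.EnabelLowVRAM, // low-VRAM option added in this PR (name as merged)
		func(o *llama.ModelOptions) {
			// Offload layers to the GPU via the NGPULayers field added above.
			o.NGPULayers = 40
		},
	)
	if err != nil {
		fmt.Fprintln(os.Stderr, "load failed:", err)
		os.Exit(1)
	}
	_ = l
}
```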