Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ggml : remove bit shuffling #1305

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -44,5 +44,6 @@ zig-cache/

ppl-*.txt
qnt-*.txt
perf-*.txt

examples/jeopardy/results.txt
24 changes: 12 additions & 12 deletions README.md
Expand Up @@ -330,18 +330,18 @@ As the models are currently fully loaded into memory, you will need adequate dis

Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G |
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 |
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 |
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G |
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 |
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 |
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 |
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 |
| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 |
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 |
| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 |
| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |

### Perplexity (measuring model quality)

Expand Down
16 changes: 4 additions & 12 deletions SHA256SUMS
@@ -1,24 +1,19 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin
cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin
25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin
ae89af479ab4d31c4e555ad8cc1dc9bf1f68d617186158cc381cd5a0fccd10bd models/7B/ggml-model-q4_0.bin
862072e2036a1bdb1a01ec2e159381f332a9e2357b886031c075fb7efa86db9b models/7B/ggml-model-q4_1.bin
0bef7cefa880a67a0b6d2a7e4559ded235823535ad616808dd8b5e47ff0a202f models/7B/ggml-model-q5_0.bin
97b9c38b2b8aed0c0aa90e0a975570ce3455c47d62128b382c55acbf6e2035f6 models/7B/ggml-model-q5_1.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin
d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin
75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin
7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin
aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
Expand All @@ -29,8 +24,5 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin
4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin
1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
11 changes: 5 additions & 6 deletions examples/quantize/quantize.cpp
Expand Up @@ -7,12 +7,11 @@
#include <string>

static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
{"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
};

bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
Expand Down
121 changes: 32 additions & 89 deletions ggml-cuda.cu
Expand Up @@ -49,13 +49,6 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK4_2 16
typedef struct {
half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");

#define QK5_0 32
typedef struct {
half d; // delta
Expand All @@ -81,131 +74,88 @@ typedef struct {
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");

static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
static const int qk = QK4_0;

const block_q4_0 * x = (const block_q4_0 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;

const uint8_t * pp = x[i].qs;

for (int l = 0; l < QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
for (int j = 0; j < qk/2; ++j) {
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;

const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;

const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;

y[i*QK4_0 + l + 0] = v0;
y[i*QK4_0 + l + 1] = v1;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}
}

static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
static const int qk = QK4_1;

const block_q4_1 * x = (const block_q4_1 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;
const float m = x[i].m;

const uint8_t * pp = x[i].qs;

for (int l = 0; l < QK4_1; l += 2) {
const uint8_t vi = pp[l/2];

const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;

const float v0 = vi0*d + m;
const float v1 = vi1*d + m;

y[i*QK4_1 + l + 0] = v0;
y[i*QK4_1 + l + 1] = v1;
}
}

static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
const block_q4_2 * x = (const block_q4_2 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;

const uint8_t * pp = x[i].qs;

for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
for (int j = 0; j < qk/2; ++j) {
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);

const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;

const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;

y[i*QK4_2 + l + 0] = v0;
y[i*QK4_2 + l + 1] = v1;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}
}

static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
static const int qk = QK5_0;

const block_q5_0 * x = (const block_q5_0 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;

const uint8_t * pp = x[i].qs;

uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));

for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vi = pp[l/2];
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;

const int8_t vi0 = ((vi & 0xf) | vh0);
const int8_t vi1 = ((vi >> 4) | vh1);

const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;

y[i*QK5_0 + l + 0] = v0;
y[i*QK5_0 + l + 1] = v1;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}
}

static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
static const int qk = QK5_1;

const block_q5_1 * x = (const block_q5_1 *) vx;

const int i = blockIdx.x;

const float d = x[i].d;
const float m = x[i].m;

const uint8_t * pp = x[i].qs;

uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));

for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;

const int8_t vi0 = (vi & 0xf) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;

const float v0 = vi0*d + m;
const float v1 = vi1*d + m;

y[i*QK5_1 + l + 0] = v0;
y[i*QK5_1 + l + 1] = v1;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}
}

Expand Down Expand Up @@ -235,11 +185,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}

static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}

static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
Expand Down Expand Up @@ -274,8 +219,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_row_q4_1_cuda;
case GGML_TYPE_Q4_2:
return dequantize_row_q4_2_cuda;
case GGML_TYPE_Q5_0:
return dequantize_row_q5_0_cuda;
case GGML_TYPE_Q5_1:
Expand Down
30 changes: 1 addition & 29 deletions ggml-opencl.c
Expand Up @@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f
result[index + 1] = (vi >> 4) * d + m;
}

struct block_q4_2
{
ushort d;
uchar qs[8];
};

__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);

const float d = vload_half(0, (__global half*) &blocks[i].d);

const uchar vi = blocks[i].qs[l];

const uint index = i*16 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}


struct block_q5_0
{
float d;
Expand Down Expand Up @@ -167,7 +147,7 @@ static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_program program;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;

Expand Down Expand Up @@ -238,8 +218,6 @@ void ggml_cl_init(void) {
CL_CHECK(err, "clCreateKernel");
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
Expand Down Expand Up @@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper(
local = 16;
size_qb = global * (sizeof(float) * 2 + local) / 32;
break;
case GGML_TYPE_Q4_2:
dequant = true;
kernel = kernel_q4_2;
local = 8;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
break;
case GGML_TYPE_Q5_0:
dequant = true;
kernel = kernel_q5_0;
Expand Down