Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1.5 bit: we can do even better #5999

Merged
merged 6 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
GGML_TABLE_END()

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#if defined(GGML_COMMON_IMPL_C)
GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
Expand Down
36 changes: 16 additions & 20 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1722,22 +1722,15 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
int grid32[2]; const int8_t * q = (const int8_t *)grid32;
grid32[0] = *((const int *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8))));
grid32[1] = __vsub4((grid32[0] >> 4) & 0x0f0f0f0f, 0x01010101);
grid32[0] = __vsub4(grid32[0] & 0x0f0f0f0f, 0x01010101);
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
grid32[0] &= 0x0f0f0f0f;
for (int j = 0; j < 8; ++j) {
y[j] = d * q[j];
}
#else
const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)));
for (int j = 0; j < 4; ++j) {
y[j+0] = d * ((grid[j] & 0xf) - 1);
y[j+4] = d * ((grid[j] >> 4) - 1);
y[j] = d * (q[j] + delta);
}
#endif
#else
assert(false);
#endif
Expand Down Expand Up @@ -4560,22 +4553,25 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
const int * q8 = (const int *)bq8_1[ib32].qs;
for (int l = 0; l < 4; ++l) {
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
int grid0 = __vsub4(grid[0] & 0x0f0f0f0f, 0x01010101);
int grid1 = __vsub4((grid[0] >> 4) & 0x0f0f0f0f, 0x01010101);
int grid0 = grid[0] & 0x0f0f0f0f;
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
}
#else
const int8_t * q8 = bq8_1[ib32].qs;
const int8_t * q8 = bq8_1[ib32].qs;
for (int l = 0; l < 4; ++l) {
const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
for (int j = 0; j < 4; ++j) {
sumi += q8[j] * ((grid[j] & 0xf) - 1) + q8[j+4] * ((grid[j] >> 4) - 1);
sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
}
q8 += 8;
}
#endif
const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
return d * sumi * (2*(bq1->qh[ib32] >> 12) + 1);
const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
const float d = d1q * __low2float (bq8_1[ib32].ds);
const float m = d1q * __high2float(bq8_1[ib32].ds);
return d * sumi + m * delta;
#else
assert(false);
return 0.f;
Expand Down
18 changes: 10 additions & 8 deletions ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -4377,7 +4377,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
+ yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+ yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
}
sumf[row] += (float)dh[0] * (sum - sumy) * (2*(qh[0] >> 12) + 1);
sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1);

dh += nb*sizeof(block_iq1_s)/2;
qs += nb*sizeof(block_iq1_s);
Expand Down Expand Up @@ -5076,14 +5076,16 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
const float d = xb->d;
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
device const uint16_t * qh = xb->qh;
const float dl = d * (2*(qh[ib32] >> 12) + 1);
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | (((qh[ib32] >> (6*il+0)) & 7) << 8)));
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | (((qh[ib32] >> (6*il+3)) & 7) << 8)));
const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
const uint16_t h = qh[ib32] >> 6*il;
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
for (int i = 0; i < 4; ++i) {
reg[0][i] = dl * (grid1[i] & 0xf) - dl;
reg[1][i] = dl * (grid1[i] >> 4) - dl;
reg[2][i] = dl * (grid2[i] & 0xf) - dl;
reg[3][i] = dl * (grid2[i] >> 4) - dl;
reg[0][i] = dl * (grid1[i] & 0xf) + ml;
reg[1][i] = dl * (grid1[i] >> 4) + ml;
reg[2][i] = dl * (grid2[i] & 0xf) + ml;
reg[3][i] = dl * (grid2[i] >> 4) + ml;
}
}

Expand Down
83 changes: 56 additions & 27 deletions ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -3456,11 +3456,12 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
const uint16_t * qh = x[i].qh;

for (int ib = 0; ib < QK_K/32; ++ib) {
const float dl = d * (2*(qh[ib] >> 12) + 1);
const float dl = d * (2*((qh[ib] >> 12) & 7) + 1);
const float delta = qh[ib] & 0x8000 ? -IQ1S_DELTA : IQ1S_DELTA;
for (int l = 0; l < 4; ++l) {
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
for (int j = 0; j < 8; ++j) {
y[j] = dl * grid[j];
y[j] = dl * (grid[j] + delta);
}
y += 8;
}
Expand Down Expand Up @@ -9582,7 +9583,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
const uint8_t * qs = x[i].qs;
const uint16_t * qh = x[i].qh;

int sumi1 = 0, sumi2 = 0;
int sumi1 = 0, sumi2 = 0, sumi3 = 0;

for (int ib = 0; ib < QK_K/32; ib += 2) {

Expand All @@ -9601,26 +9602,32 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]);
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]);

sumi1 += vaddvq_s32(p1) * (2*(qh[ib+0] >> 12) + 1);
sumi2 += vaddvq_s32(p2) * (2*(qh[ib+1] >> 12) + 1);
const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
sumi1 += vaddvq_s32(p1) * ls1;
sumi2 += vaddvq_s32(p2) * ls2;
sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1)
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1);

}

sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2);
sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
}

*s = sumf;

#elif defined __AVX2__

__m256 accum = _mm256_setzero_ps();
float accum1 = 0;
for (int i = 0; i < nb; ++i) {

const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint16_t * qh = x[i].qh;

__m256i sumi = _mm256_setzero_si256();
int sumi1 = 0;
for (int ib = 0; ib < QK_K/32; ib += 2) {
const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
Expand All @@ -9632,17 +9639,23 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void

const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*(qh[ib+0] >> 12) + 1));
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*(qh[ib+1] >> 12) + 1));
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));

sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
}

accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
accum1 += d * sumi1;

}

*s = hsum_float_8(accum);
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

#else

Expand All @@ -9653,9 +9666,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
const uint8_t * qs = x[i].qs;
const uint16_t * qh = x[i].qh;

int sumi = 0;
int sumi = 0, sumi1 = 0;
for (int ib = 0; ib < QK_K/32; ++ib) {
const int ls = 2*(qh[ib] >> 12) + 1;
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
const int delta = qh[ib] & 0x8000 ? -1 : 1;
int lsum = 0;
for (int l = 0; l < 4; ++l) {
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
Expand All @@ -9664,11 +9678,12 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
}
q8 += 8;
}
sumi += ls * lsum;
sumi += ls * lsum;
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
qs += 4;
}

sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}

*s = sumf;
Expand Down Expand Up @@ -11438,7 +11453,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
}

static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L, int ngrid) {
const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
int num_neighbors = neighbours[0];
GGML_ASSERT(num_neighbors > 0);
float best_score = FLT_MAX;
Expand All @@ -11447,7 +11462,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
float d2 = 0;
for (int i = 0; i < 8; ++i) {
float q = (pg[i] - 3)/2;
float q = xg[(pg[i] - 1)/2];
float w = weight[i];
float diff = scale*q - xval[i];
d2 += w*diff*diff;
Expand All @@ -11463,7 +11478,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
float d2 = 0;
for (int j = 0; j < 8; ++j) {
float w = weight[j];
float q = (grid_i[j] - 3)/2;
float q = xg[(grid_i[j] - 1)/2];
float diff = scale*q - xval[i];
d2 += w*diff*diff;
}
Expand All @@ -11480,7 +11495,7 @@ static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
float sumqx = 0, sumq2 = 0;
for (int i = 0; i < 8; ++i) {
float q = (pg[i] - 3)/2;
float q = xg[(pg[i] - 1)/2];
float w = weight[i];
sumqx += w*q*xval[i];
sumq2 += w*q*q;
Expand Down Expand Up @@ -11519,6 +11534,9 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy

block_iq1_s * y = vy;

const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};

float scales[QK_K/IQ1S_BLOCK_SIZE];
float weight[IQ1S_BLOCK_SIZE];
int8_t L[IQ1S_BLOCK_SIZE];
Expand All @@ -11527,6 +11545,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
float pairs[2*IQ1S_BLOCK_SIZE];
int * idx = (int *)(pairs + 1);
uint16_t index[IQ1S_BLOCK_SIZE/8];
int8_t shifts[QK_K/IQ1S_BLOCK_SIZE];

for (int ibl = 0; ibl < nbl; ++ibl) {

Expand Down Expand Up @@ -11572,33 +11591,41 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
}
}
float best_score = 0, scale = max;
int besti1 = 0, besti2 = 0;
int besti1 = -1, besti2 = -1, best_shift = 0;
for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) {
for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) {
float sumqx = -(sumx[i1] - sumx[0]) + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2]);
float sumq2 = (sumw[i1] - sumw[0]) + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2]);
float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2];
float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2];
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
scale = sumqx/sumq2; best_score = scale*sumqx;
besti1 = i1; besti2 = i2; best_shift = 1;
}
sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2];
sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2];
if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
scale = sumqx/sumq2; best_score = scale*sumqx;
besti1 = i1; besti2 = i2;
besti1 = i1; besti2 = i2; best_shift = -1;
}
}
}
GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0);
for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0;
for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2;
if (scale < 0) {
for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j];
scale = -scale;
scale = -scale; best_shift = -best_shift;
}
bool all_on_grid = true;
const float * xx = best_shift == 1 ? x_p : x_m;
for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) {
uint16_t u = 0;
for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j);
int grid_index = kmap_q2xs[u];
if (grid_index < 0) {
all_on_grid = false;
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, L + 8*k, NGRID_IQ1S);
grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S);
GGML_ASSERT(grid_index >= 0);
}
index[k] = grid_index;
Expand All @@ -11609,7 +11636,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]);
for (int j = 0; j < 8; ++j) {
float w = weight[8*k + j];
float q = (pg[j] - 3)/2;
float q = xx[(pg[j] - 1)/2];
sumqx += w*q*xb[8*k+j];
sumq2 += w*q*q;
}
Expand All @@ -11624,6 +11651,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
y[ibl].qh[ib] = h;
GGML_ASSERT(scale >= 0);
scales[ib] = scale;
shifts[ib] = best_shift;
max_scale = MAX(max_scale, scale);
}

Expand All @@ -11632,12 +11660,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
continue;
}

float d = max_scale/31;
float d = max_scale/15;
y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed.
float id = 1/d;
for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) {
int l = nearest_int(0.5f*(id*scales[ib]-1));
l = MAX(0, MIN(15, l));
l = MAX(0, MIN(7, l));
if (shifts[ib] == -1) l |= 8;
y[ibl].qh[ib] |= (l << 12);
}
}
Expand Down