Skip to content

Commit 6048993

Browse files
aicss-genaictao456malsbat
authored
sycl: Q5_K reorder MMVQ/dequant + Q8_0 reorder MMVQ path (#22152)
* sycl: Q5_K reorder MMVQ/dequant + Q8_0 reorder MMVQ path Signed-off-by: Chun Tao <chun.tao@intel.com> * Remove duplicate definitions --------- Signed-off-by: Chun Tao <chun.tao@intel.com> Co-authored-by: Chun Tao <chun.tao@intel.com> Co-authored-by: Todd Malsbary <todd.malsbary@intel.com>
1 parent 4a4f819 commit 6048993

6 files changed

Lines changed: 265 additions & 26 deletions

File tree

ggml/src/ggml-sycl/convert.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
252252
#endif
253253
}
254254

255+
template <typename dst_t>
256+
static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
257+
const int64_t nb = k / QK_K;
258+
259+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
260+
261+
stream->submit([&](sycl::handler & cgh) {
262+
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(K_SCALE_SIZE), cgh);
263+
264+
cgh.parallel_for(
265+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
266+
[=](sycl::nd_item<3> item_ct1) {
267+
dequantize_block_q5_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
268+
});
269+
});
270+
}
271+
255272
template <typename dst_t>
256273
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
257274
dpct::queue_ptr stream) {
@@ -643,7 +660,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
643660
return dequantize_row_q4_K_sycl;
644661
}
645662
case GGML_TYPE_Q5_K:
646-
return dequantize_row_q5_K_sycl;
663+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
664+
return dequantize_row_q5_K_sycl_reorder;
665+
} else {
666+
return dequantize_row_q5_K_sycl;
667+
}
647668
case GGML_TYPE_Q6_K:
648669
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
649670
return dequantize_row_q6_K_sycl_reorder;
@@ -718,7 +739,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
718739
return dequantize_row_q4_K_sycl;
719740
}
720741
case GGML_TYPE_Q5_K:
721-
return dequantize_row_q5_K_sycl;
742+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
743+
return dequantize_row_q5_K_sycl_reorder;
744+
} else {
745+
return dequantize_row_q5_K_sycl;
746+
}
722747
case GGML_TYPE_Q6_K:
723748
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
724749
return dequantize_row_q6_K_sycl_reorder;

ggml/src/ggml-sycl/dequantize.hpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
537537
#endif
538538
}
539539

540+
template <typename dst_t>
541+
static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
542+
uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
543+
const int64_t ib = item_ct1.get_group(2);
544+
545+
#if QK_K == 256
546+
// assume 64 threads
547+
const int64_t tid = item_ct1.get_local_id(2);
548+
const int64_t il = tid / 16; // 0...3
549+
const int64_t ir = tid % 16; // 0...15
550+
const int64_t is = 2 * il;
551+
552+
dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;
553+
554+
const uint8_t * base = static_cast<const uint8_t *>(vx);
555+
556+
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
557+
const size_t qs_offset = ib * (QK_K / 2);
558+
const size_t qh_offset = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
559+
const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
560+
const size_t dm_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
561+
562+
const uint8_t * qs_ptr = base + qs_offset;
563+
const uint8_t * qh_ptr = base + qh_offset;
564+
const uint8_t * scales_ptr = base + scales_offset;
565+
const ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
566+
567+
const float dall = dm_values.x();
568+
const float dmin = dm_values.y();
569+
570+
const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
571+
const uint8_t * qh = qh_ptr + 2 * ir;
572+
573+
if (tid < K_SCALE_SIZE) {
574+
scales_local[tid] = scales_ptr[tid];
575+
}
576+
577+
item_ct1.barrier(sycl::access::fence_space::local_space);
578+
579+
uint8_t sc, m;
580+
get_scale_min_k4(is + 0, scales_local, sc, m);
581+
const float d1 = dall * sc; const float m1 = dmin * m;
582+
get_scale_min_k4(is + 1, scales_local, sc, m);
583+
const float d2 = dall * sc; const float m2 = dmin * m;
584+
585+
uint8_t hm = 1 << (2 * il);
586+
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
587+
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
588+
hm <<= 1;
589+
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
590+
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
591+
#else
592+
GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
593+
GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
594+
#endif
595+
}
596+
540597
template<typename dst_t>
541598
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
542599
const sycl::nd_item<3> &item_ct1) {

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3303,6 +3303,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
33033303
case GGML_TYPE_Q8_0:
33043304
return true;
33053305
case GGML_TYPE_Q4_K:
3306+
case GGML_TYPE_Q5_K:
33063307
case GGML_TYPE_Q6_K:
33073308
return !g_ggml_sycl_prioritize_dmmv;
33083309
default:
@@ -3325,6 +3326,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
33253326
case GGML_TYPE_Q4_0:
33263327
case GGML_TYPE_Q8_0:
33273328
case GGML_TYPE_Q4_K:
3329+
case GGML_TYPE_Q5_K:
33283330
case GGML_TYPE_Q6_K:
33293331
return true;
33303332
default:
@@ -3541,6 +3543,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
35413543
return true;
35423544
}
35433545

3546+
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3547+
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
3548+
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
3549+
3550+
const int nblocks = size / sizeof(block_q5_K);
3551+
3552+
sycl_reorder_temp_buffer tmp(stream, size);
3553+
if (!tmp) {
3554+
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
3555+
return false;
3556+
}
3557+
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
3558+
3559+
sycl::event copy_event;
3560+
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
3561+
if (!g_ggml_sycl_use_async_mem_op) {
3562+
copy_event.wait();
3563+
}
3564+
3565+
auto * qs_ptr = data_device;
3566+
auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
3567+
auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
3568+
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
3569+
3570+
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
3571+
const block_q5_K * x = (const block_q5_K *) tmp_buf;
3572+
const int ib = i;
3573+
3574+
for (int j = 0; j < QK_K / 2; ++j) {
3575+
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
3576+
}
3577+
3578+
for (int j = 0; j < QK_K / 8; ++j) {
3579+
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
3580+
}
3581+
3582+
for (int j = 0; j < K_SCALE_SIZE; ++j) {
3583+
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
3584+
}
3585+
3586+
dm_ptr[ib] = x[ib].dm;
3587+
});
3588+
if (!g_ggml_sycl_use_async_mem_op) {
3589+
reorder_event.wait_and_throw();
3590+
}
3591+
return true;
3592+
}
3593+
35443594
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
35453595
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
35463596
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
@@ -3607,6 +3657,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
36073657
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
36083658
case GGML_TYPE_Q4_K:
36093659
return reorder_qw_q4_k(data_device, size, 0, stream);
3660+
case GGML_TYPE_Q5_K:
3661+
return reorder_qw_q5_k(data_device, size, 0, stream);
36103662
case GGML_TYPE_Q6_K:
36113663
return reorder_qw_q6_k(data_device, size, 0, stream);
36123664
default:

ggml/src/ggml-sycl/mmvq.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,26 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
839839
}
840840
}
841841

842+
static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
843+
const int nrows, dpct::queue_ptr stream) {
844+
GGML_ASSERT(ncols % QK_K == 0);
845+
846+
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
847+
constexpr size_t num_subgroups = 16;
848+
GGML_ASSERT(block_num_y % num_subgroups == 0);
849+
850+
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
851+
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
852+
853+
stream->submit([&](sycl::handler & cgh) {
854+
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
855+
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
856+
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
857+
nrows, nd_item);
858+
});
859+
});
860+
}
861+
842862
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
843863
const int nrows, dpct::queue_ptr stream) {
844864
GGML_ASSERT(ncols % QK_K == 0);
@@ -1125,6 +1145,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
11251145
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
11261146
reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
11271147
} else {
1148+
GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n");
11281149
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
11291150
}
11301151
break;
@@ -1145,7 +1166,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
11451166
}
11461167
break;
11471168
case GGML_TYPE_Q5_K:
1148-
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1169+
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1170+
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1171+
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n");
1172+
reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1173+
} else {
1174+
GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n");
1175+
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1176+
}
11491177
break;
11501178
case GGML_TYPE_Q6_K:
11511179
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&

ggml/src/ggml-sycl/quants.hpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
7979
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
8080
};
8181

82+
template <> struct block_q_t<GGML_TYPE_Q5_K> {
83+
struct traits {
84+
static constexpr uint32_t qk = QK_K;
85+
static constexpr uint32_t qi = QI5_K;
86+
static constexpr uint32_t qr = QR5_K;
87+
static constexpr uint32_t vdr_mmvq = 2;
88+
};
89+
90+
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales] [dm]
91+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
92+
auto qs_offset = block_index * (QK_K / 2);
93+
auto qh_offset = n_blocks * (QK_K / 2) + block_index * (QK_K / 8);
94+
return { qs_offset, qh_offset };
95+
}
96+
97+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
98+
auto nblocks = (nrows * (ncols / QK_K));
99+
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 8);
100+
return { total_qs_bytes + block_index * K_SCALE_SIZE,
101+
total_qs_bytes + nblocks * K_SCALE_SIZE + block_index * sizeof(ggml_half2) };
102+
}
103+
104+
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
105+
};
106+
82107
template <> struct block_q_t<GGML_TYPE_Q6_K> {
83108
struct traits {
84109
static constexpr uint32_t qk = QK_K;

ggml/src/ggml-sycl/vecdotq.hpp

Lines changed: 75 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -357,38 +357,31 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
357357
using q8_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
358358
using q8_0_traits = typename q8_0_block::traits;
359359

360-
__dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
361-
int sumi = 0;
362-
363-
#pragma unroll
364-
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
365-
// Q8_0 values are signed int8, no nibble extraction needed
366-
// Direct dp4a: each int packs 4 int8 values
367-
sumi = dpct::dp4a(v[i], u[i], sumi);
368-
}
369-
370-
const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
371-
372-
// Q8_0 has no bias term (values are signed), so just scale
373-
return d8_0 * sumi * ds8f.x();
374-
}
375-
376360
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
377361
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
378362
const sycl::half2 * q8_1_ds, const int & iqs) {
379-
const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
380-
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
381-
int v[q8_0_traits::vdr_mmvq];
382-
int u[q8_0_traits::vdr_mmvq];
363+
const uint8_t * base = static_cast<const uint8_t *>(vbq);
364+
const int8_t * qs = reinterpret_cast<const int8_t *>(base + ibx_offset.first);
365+
const ggml_half d = *reinterpret_cast<const ggml_half *>(base + d_offset.first);
366+
367+
int v[q8_0_traits::vdr_mmvq];
368+
int u[q8_0_traits::vdr_mmvq];
383369

384370
#pragma unroll
385371
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
386-
v[i] = get_int_from_int8(bq8_0, iqs + i);
372+
v[i] = get_int_from_int8(qs, iqs + i);
387373
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
388374
}
389375

390-
return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
391-
};
376+
int sumi = 0;
377+
#pragma unroll
378+
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
379+
sumi = dpct::dp4a(v[i], u[i], sumi);
380+
}
381+
382+
const sycl::half2 ds_values = *q8_1_ds;
383+
return static_cast<float>(d) * static_cast<float>(ds_values[0]) * sumi;
384+
}
392385
};
393386

394387
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
@@ -481,6 +474,65 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
481474
}
482475
};
483476

477+
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K> {
478+
static constexpr ggml_type gtype = GGML_TYPE_Q5_K;
479+
480+
using q5_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q5_K>;
481+
using q5_k_traits = typename q5_k_block::traits;
482+
483+
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
484+
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
485+
const sycl::half2 * q8_1_ds, const int & iqs) {
486+
const uint8_t * base = static_cast<const uint8_t *>(vbq);
487+
const uint8_t * qs = base + ibx_offset.first; // low 4 bits
488+
const uint8_t * qh_base = base + ibx_offset.second; // high bit
489+
const uint8_t * scs = base + d_offset.first;
490+
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
491+
492+
const int bq8_offset = QR5_K * ((iqs / 2) / (QI8_1 / 2));
493+
const int * ql_ptr = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
494+
const int * qh_ptr = (const int *) (qh_base + 4 * ((iqs / 2) % 4));
495+
const uint16_t * scales = (const uint16_t *) scs;
496+
497+
int vl[2];
498+
int vh[2];
499+
int u[2 * QR5_K];
500+
float d8[QR5_K];
501+
502+
vl[0] = ql_ptr[0];
503+
vl[1] = ql_ptr[4];
504+
505+
vh[0] = qh_ptr[0] >> bq8_offset;
506+
vh[1] = qh_ptr[4] >> bq8_offset;
507+
508+
uint16_t aux[2];
509+
const int j = (QR5_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
510+
if (j < 2) {
511+
aux[0] = scales[j + 0] & 0x3f3f;
512+
aux[1] = scales[j + 2] & 0x3f3f;
513+
} else {
514+
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
515+
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
516+
}
517+
518+
const uint8_t * sc = (const uint8_t *) aux;
519+
const uint8_t * m = sc + 2;
520+
521+
for (int i = 0; i < QR5_K; ++i) {
522+
const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
523+
sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
524+
525+
d8[i] = ds_values[0];
526+
527+
const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
528+
u[2 * i + 0] = q8[0];
529+
u[2 * i + 1] = q8[4];
530+
}
531+
532+
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, *dms, d8);
533+
}
534+
};
535+
484536
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
485537
static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
486538

0 commit comments

Comments
 (0)