Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,7 @@ struct vk_device_struct {
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32;
vk_pipeline pipeline_set_rows_i32[GGML_TYPE_COUNT];
vk_pipeline pipeline_set_rows_i64[GGML_TYPE_COUNT];
vk_pipeline pipeline_norm_f32;
Expand Down Expand Up @@ -3697,6 +3698,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_32, "cpy_transpose_32", cpy_transpose_32_len, cpy_transpose_32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1);

if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
Expand Down Expand Up @@ -6247,6 +6251,17 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
// Choose "contiguous copy" shader if src/dst are contiguous
bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));

// Use optimized "transpose" shader if src dim1 is the innermost dimension.
bool transpose = dst && src->nb[1] == ggml_type_size(to) && ggml_are_same_shape(dst, src);

if (transpose && src->type == to) {
if (ggml_type_size(to) == 4) {
return ctx->device->pipeline_cpy_transpose_32;
} else if (ggml_type_size(to) == 2) {
return ctx->device->pipeline_cpy_transpose_16;
}
}

if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
if (contig) {
return ctx->device->pipeline_contig_cpy_f32_f32;
Expand Down Expand Up @@ -8858,6 +8873,17 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} else {
elements = { ne, 1, 1 };
}

if (pipeline == ctx->device->pipeline_cpy_transpose_32 ||
pipeline == ctx->device->pipeline_cpy_transpose_16) {
// 32x32 tiles
elements[0] = (uint32_t)CEIL_DIV(dst->ne[0], 32);
elements[1] = (uint32_t)CEIL_DIV(dst->ne[1], 32);
elements[2] = (uint32_t)(dst->ne[2]*dst->ne[3]);
elements[0] = std::min(elements[0], ctx->device->properties.limits.maxComputeWorkGroupCount[0]);
elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
}
} break;
case GGML_OP_ADD_ID:
{
Expand Down
67 changes: 67 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#version 450

#include "types.glsl"
#include "generic_unary_head.glsl"

// workgroup does 32x32 tile, but uses 32x8 threads
// (each invocation therefore handles 4 elements of the tile, 8 rows apart)
#define TILE_DIM 32
layout(local_size_x = 32, local_size_y = 8, local_size_z = 1) in;

// Staging buffer for one tile. Elements are held as uint regardless of A_TYPE/D_TYPE.
// NOTE(review): the extra "+ 1" column is presumably padding to avoid shared-memory
// bank conflicts on the transposed read — confirm.
shared uint sh[TILE_DIM][TILE_DIM + 1];

// Copies a single TILE_DIM x TILE_DIM tile from data_a to data_d, staging it
// through the shared array `sh`.
//   wg_id.x, wg_id.y : tile index along dim 0 and dim 1
//   wg_id.z          : flattened index over dims 2 and 3 (i2 + i3 * ne12)
// Contains a barrier(), so every invocation of the workgroup must call it.
void iter(uvec3 wg_id) {
    const uint lane_x = gl_LocalInvocationID.x;
    const uint lane_y = gl_LocalInvocationID.y;

    // First element covered by this tile along dims 0 and 1.
    const uint origin0 = wg_id.x * TILE_DIM;
    const uint origin1 = wg_id.y * TILE_DIM;

    // Unflatten z; the source uses the same dim 2/3 coordinates as the destination.
    const uint i3 = wg_id.z / p.ne12;
    const uint i2 = wg_id.z % p.ne12;
    const uint i03 = i3;
    const uint i02 = i2;

    // The tile is TILE_DIM x TILE_DIM, but the low bits of the coordinates are
    // swapped between the two phases so that memory accesses are contiguous:
    // while loading, lane_x walks along src dim 1 (i01); while storing,
    // lane_x walks along dst dim 0 (i0).

    // Phase 1: global -> shared. 4 rounds of 8 rows cover the 32 rows of the tile.
    [[unroll]] for (uint k = 0; k < 4; ++k) {
        const uint row = lane_y + 8 * k;
        const uint i00 = origin0 + row;
        const uint i01 = origin1 + lane_x;
        const bool src_in_bounds = i00 < p.ne00 && i01 < p.ne01 && i02 < p.ne02 && i03 < p.ne03;
        if (src_in_bounds) {
            const uint src_idx = i00 * p.nb00 + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
            sh[row][lane_x] = uint(data_a[get_aoffset() + src_idx]);
        }
    }

    // All loads into `sh` must be complete before any invocation reads it back.
    barrier();

    // Phase 2: shared -> global, reading `sh` with its two indices exchanged.
    [[unroll]] for (uint k = 0; k < 4; ++k) {
        const uint row = lane_y + 8 * k;
        const uint i0 = origin0 + lane_x;
        const uint i1 = origin1 + row;
        const bool dst_in_bounds = i0 < p.ne10 && i1 < p.ne11 && i2 < p.ne12 && i3 < p.ne13;
        if (dst_in_bounds) {
            const uint dst_idx = i0 * p.nb10 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
            data_d[get_doffset() + dst_idx] = D_TYPE(sh[lane_x][row]);
        }
    }
}

// Integer division rounding up: number of b-sized chunks needed to cover a.
// Relies on (a) + (b) - 1 not overflowing the operand type.
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

// Entry point. Each workgroup processes one or more tiles: it starts at the tile
// addressed by gl_WorkGroupID and strides by gl_NumWorkGroups in every dimension,
// so all tiles are still covered when the dispatch is smaller than the tile grid.
//   x : tiles along dim 0 (CEIL_DIV(ne10, TILE_DIM) of them)
//   y : tiles along dim 1 (CEIL_DIV(ne11, TILE_DIM) of them)
//   z : flattened dims 2 and 3 (ne12 * ne13 slices)
void main() {
    // iter() ends by reading the shared tile and begins by overwriting it, so from
    // the second tile onwards a barrier is required before calling iter() again.
    // Loop bounds depend only on workgroup-wide values, so every invocation of the
    // workgroup reaches the same barriers.
    bool need_barrier = false;
    for (uint z = gl_WorkGroupID.z; z < p.ne12 * p.ne13; z += gl_NumWorkGroups.z) {
        for (uint y = gl_WorkGroupID.y; y < CEIL_DIV(p.ne11, TILE_DIM); y += gl_NumWorkGroups.y) {
            for (uint x = gl_WorkGroupID.x; x < CEIL_DIV(p.ne10, TILE_DIM); x += gl_NumWorkGroups.x) {
                if (need_barrier) {
                    barrier();
                }
                need_barrier = true;
                iter(uvec3(x, y, z));
            }
        }
    }
}
3 changes: 3 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,9 @@ void process_shaders() {
string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});

string_to_spv("cpy_transpose_16", "copy_transpose.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
string_to_spv("cpy_transpose_32", "copy_transpose.comp", {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}});

for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
Expand Down
Loading