ggml : metal impl + cleanup + sycl dev warnings
ggerganov committed May 12, 2024
1 parent c8a3335 commit cc8e262
Showing 6 changed files with 77 additions and 74 deletions.
46 changes: 20 additions & 26 deletions src/ggml-cuda/upscale.cu
@@ -1,41 +1,36 @@
#include "upscale.cuh"

static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int ne10, const int ne11, const int ne12, const int ne13, const float ne0_scale_factor,
const float ne1_scale_factor, const float ne2_scale_factor, const float ne3_scale_factor) {


const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index>= ne10 * ne11 * ne12 * ne13)
{
if (index >= ne10 * ne11 * ne12 * ne13) {
return;
}

int i10 = index % ne10;
int i11 = (index / ne10) % ne11;
int i12 = (index / (ne10* ne11)) % ne12;
int i13 = (index / (ne10* ne11 * ne12)) % ne13;
int i11 = (index / ne10) % ne11;
int i12 = (index / (ne10 * ne11)) % ne12;
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;

int i00 = i10 / ne0_scale_factor;
int i01 = i11 / ne1_scale_factor;
int i02 = i12 / ne2_scale_factor;
int i03 = i13 / ne3_scale_factor;
int i00 = i10 / sf0;
int i01 = i11 / sf1;
int i02 = i12 / sf2;
int i03 = i13 / sf3;

int src_index = i00 + (i01 * ne00) + (i02 * ne00 * ne01) + (i02 * ne00 * ne01 * ne02);


dst[index] = x[src_index];
}


static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
const int ne10, const int ne11, const int ne12, const int ne13, float ne0_scale_factor, float ne1_scale_factor,
float ne2_scale_factor, float ne3_scale_factor, cudaStream_t stream) {
const int ne10, const int ne11, const int ne12, const int ne13,
const float sf0, const float sf1, const float sf2, const float sf3,
cudaStream_t stream) {
int dst_size = ne10 * ne11 * ne12* ne13;
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;

upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, ne00,ne01,ne02,ne03, ne10,ne11, ne12,ne13,
ne0_scale_factor, ne1_scale_factor, ne2_scale_factor, ne3_scale_factor);
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
}

void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -45,13 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
cudaStream_t stream = ctx.stream();

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
//GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
GGML_ASSERT( dst->type == GGML_TYPE_F32);

const float ne0_scale_factor = (float)dst->ne[0]/src0->ne[0];
const float ne1_scale_factor = (float)dst->ne[1]/src0->ne[1];
const float ne2_scale_factor = (float)dst->ne[2]/src0->ne[2];
const float ne3_scale_factor = (float)dst->ne[3]/src0->ne[3];
const float sf0 = (float)dst->ne[0]/src0->ne[0];
const float sf1 = (float)dst->ne[1]/src0->ne[1];
const float sf2 = (float)dst->ne[2]/src0->ne[2];
const float sf3 = (float)dst->ne[3]/src0->ne[3];

upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0],dst->ne[1],dst->ne[2],dst->ne[3], ne0_scale_factor,ne1_scale_factor,ne2_scale_factor,ne3_scale_factor, stream);
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0],dst->ne[1],dst->ne[2],dst->ne[3], sf0, sf1, sf2, sf3, stream);
}
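
For reference, the index mapping implemented by the generalized kernels is plain nearest-neighbor: each destination coordinate is divided by the per-dimension scale factor sf = ne_dst/ne_src and truncated. Below is a minimal CPU sketch of the same mapping (illustrative only, not part of the commit; it assumes contiguous f32 buffers, whereas the actual ggml code uses the nb strides):

static void upscale_f32_ref(const float * src, float * dst,
                            const int ne00, const int ne01, const int ne02, const int ne03,   // source dims
                            const int ne0,  const int ne1,  const int ne2,  const int ne3) {  // destination dims
    const float sf0 = (float)ne0/ne00;
    const float sf1 = (float)ne1/ne01;
    const float sf2 = (float)ne2/ne02;
    const float sf3 = (float)ne3/ne03;

    for (int i3 = 0; i3 < ne3; i3++) {
        for (int i2 = 0; i2 < ne2; i2++) {
            for (int i1 = 0; i1 < ne1; i1++) {
                for (int i0 = 0; i0 < ne0; i0++) {
                    // map each destination index back to its source index
                    const int i00 = (int)(i0/sf0);
                    const int i01 = (int)(i1/sf1);
                    const int i02 = (int)(i2/sf2);
                    const int i03 = (int)(i3/sf3);

                    dst[((i3*ne2 + i2)*ne1 + i1)*ne0 + i0] =
                        src[((i03*ne02 + i02)*ne01 + i01)*ne00 + i00];
                }
            }
        }
    }
}
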
10 changes: 8 additions & 2 deletions src/ggml-metal.m
@@ -2314,7 +2314,10 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);

const int sf = dst->op_params[0];
const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];

const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;

@@ -2337,7 +2340,10 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];

const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);

13 changes: 8 additions & 5 deletions src/ggml-metal.metal
@@ -1863,7 +1863,10 @@ kernel void kernel_upscale_f32(
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & sf,
constant float & sf0,
constant float & sf1,
constant float & sf2,
constant float & sf3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
@@ -1872,15 +1875,15 @@ kernel void kernel_upscale_f32(
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;

const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1/sf;
const int64_t i03 = i3/sf3;
const int64_t i02 = i2/sf2;
const int64_t i01 = i1/sf1;

device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);

for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = src0_ptr[i0/sf];
dst_ptr[i0] = src0_ptr[(int)(i0/sf0)];
}
}

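Because the scale factors are now floats rather than a single integer, the kernels recover source indices by truncating division. A tiny standalone sketch (not from the commit) of the mapping for a 7 -> 11 upscale along one dimension:

#include <stdio.h>

int main(void) {
    const int ne_src = 7, ne_dst = 11;
    const float sf = (float) ne_dst / ne_src;   // ~1.571, same formula the backends use

    for (int i_dst = 0; i_dst < ne_dst; i_dst++) {
        const int i_src = (int)(i_dst / sf);    // truncation keeps i_src in [0, ne_src)
        printf("%2d -> %d\n", i_dst, i_src);    // prints 0,0,1,1,2,3,3,4,5,5,6
    }
    return 0;
}
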
4 changes: 4 additions & 0 deletions src/ggml-sycl.cpp
@@ -14013,6 +14013,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

#pragma message("TODO: generalize upscale operator")
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
GGML_ASSERT(false && "TODO: generalize upscale operator");

const int scale_factor = dst->op_params[0];

upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
62 changes: 31 additions & 31 deletions src/ggml.c
@@ -6055,7 +6055,6 @@ struct ggml_tensor * ggml_pool_2d(
return result;
}


// ggml_upscale

static struct ggml_tensor * ggml_upscale_impl(
@@ -6072,7 +6071,6 @@ static struct ggml_tensor * ggml_upscale_impl(
is_node = true;
}


GGML_ASSERT(a->ne[0] <= ne0);
GGML_ASSERT(a->ne[1] <= ne1);
GGML_ASSERT(a->ne[2] <= ne2);
@@ -6093,6 +6091,25 @@ static struct ggml_tensor * ggml_upscale_impl(
return result;
}

struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
}

struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3) {
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}

// ggml_pad

struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -6117,22 +6134,7 @@ struct ggml_tensor * ggml_pad(
return result;
}

struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, 1, 1);
}

struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3) {
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}
// ggml_arange

struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
@@ -6154,6 +6156,8 @@ struct ggml_tensor * ggml_arange(
return result;
}

// ggml_timestep_embedding

struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
@@ -13887,6 +13891,7 @@ static void ggml_compute_forward_pool_2d(
}
}

// ggml_compute_forward_upscale

static void ggml_compute_forward_upscale_f32(
const struct ggml_compute_params * params,
@@ -13905,23 +13910,21 @@ static void ggml_compute_forward_upscale_f32(

GGML_TENSOR_UNARY_OP_LOCALS


const float ne0_scale_factor = (float)ne0/src0->ne[0];
const float ne1_scale_factor = (float)ne1/src0->ne[1];
const float ne2_scale_factor = (float)ne2/src0->ne[2];
const float ne3_scale_factor = (float)ne3/src0->ne[3];

const float sf0 = (float)ne0/src0->ne[0];
const float sf1 = (float)ne1/src0->ne[1];
const float sf2 = (float)ne2/src0->ne[2];
const float sf3 = (float)ne3/src0->ne[3];

// TODO: optimize

for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / ne3_scale_factor;
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2 / ne2_scale_factor;
const int64_t i02 = i2 / sf2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const int64_t i01 = i1 / ne1_scale_factor;
const int64_t i01 = i1 / sf1;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const int64_t i00 = i0 / ne0_scale_factor;
const int64_t i00 = i0 / sf0;

const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
@@ -13933,9 +13936,6 @@ static void ggml_compute_forward_upscale_f32(
}
}

// ggml_compute_forward_upscale


static void ggml_compute_forward_upscale(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
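
A usage sketch for the two entry points (illustrative, not from the commit; assumes an initialized ggml_context * ctx): ggml_upscale() scales ne0 and ne1 by an integer factor and keeps ne2/ne3, while ggml_upscale_ext() accepts an independent target size per dimension.

#include "ggml.h"

// ctx is assumed to be initialized elsewhere, e.g.:
//   struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
//   struct ggml_context * ctx = ggml_init(params);

// F32 tensor of shape [2, 5, 7, 11] (ne0..ne3)
struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 5, 7, 11);

// legacy call: scales ne0 and ne1 only -> result shape [4, 10, 7, 11]
struct ggml_tensor * up = ggml_upscale(ctx, a, 2);

// generalized call: independent target size per dimension -> [5, 7, 11, 13],
// the same shapes exercised by test_upscale_ext below
struct ggml_tensor * up_ext = ggml_upscale_ext(ctx, a, 5, 7, 11, 13);
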
16 changes: 6 additions & 10 deletions tests/test-backend-ops.cpp
@@ -1339,32 +1339,28 @@ struct test_upscale : public test_case {
}
};



// GGML_OP_UPSCALE (ext)
struct test_upscale_ext : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> target_ne;

const std::array<int64_t, 4> ne_tgt;

std::string vars() override {
return VARS_TO_STR3(type, ne, target_ne);
return VARS_TO_STR3(type, ne, ne_tgt);
}

test_upscale_ext(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {2, 5, 7, 11},
std::array<int64_t, 4> target_ne = {5, 7, 11, 13})
: type(type), ne(ne), target_ne(target_ne) {}
std::array<int64_t, 4> ne = {2, 5, 7, 11},
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
: type(type), ne(ne), ne_tgt(ne_tgt) {}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_upscale_ext(ctx, a, target_ne[0], target_ne[1],target_ne[2], target_ne[3]);
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
return out;
}
};


// GGML_OP_GROUP_NORM
struct test_group_norm : public test_case {
const ggml_type type;
