Skip to content

Commit

Permalink
Commenting async copy for now in read/write buf functions
Browse files Browse the repository at this point in the history
  • Loading branch information
luraess committed Jul 22, 2023
1 parent 7f151af commit c116726
Showing 1 changed file with 10 additions and 8 deletions.
18 changes: 10 additions & 8 deletions src/update_halo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -492,15 +492,16 @@ let
# Fill the send buffer selected by (n, dim, i) from the ROCArray A, using stream rocstreams[n,i].
# Nothing is done when ol(dim,A) < 2.
# When dim == 1 or amdgpuaware_MPI(dim) is true, the write_d2x! kernel is launched with gpusendbuf(n,dim,i,A)
# as destination; otherwise write_d2h_async! is called with sendbuf_flat(n,dim,i,A) as destination.
# NOTE(review): this listing comes from a commit view in which removed and added lines appear together; the
# active `if`/`else`/`end` below and the commented-out `# if`/`# else`/`# end` are two versions of the same
# branch - confirm which one is the intended state before changing the code.
function iwrite_sendbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2...
# DEBUG: the following section needs perf testing
# NOTE(review): the remark on the next line mentions the CUDA 3-D memcopy although A is a ROCArray - confirm it applies here.
if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
# DEBUG 2: commenting write_d2h_async! for now
# if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
# Ranges of A to send; entries 1 to 3 are passed to the kernel below.
ranges = sendranges(n, dim, A);
# 32 work-items per group, placed along the 2nd dimension when dim == 1 and along the 1st dimension otherwise.
nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
# Extent of each range, computed as last - first + 1.
halosize = [r[end] - r[1] + 1 for r in ranges];
# Per-dimension ratio halosize / nthreads, rounded up, so that the launch covers the whole extent.
nblocks = Tuple(ceil.(Int, halosize./nthreads));
# Launch the copy kernel on the stream belonging to (n, i).
@roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] write_d2x!(gpusendbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
else
# Otherwise copy the send ranges of A into the flat send buffer on the same stream.
write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
end
# else
# write_d2h_async!(sendbuf_flat(n,dim,i,A), A, sendranges(n,dim,A), rocstreams[n,i]);
# end
end
end
end
Expand All @@ -521,15 +522,16 @@ let
# Copy the receive buffer selected by (n, dim, i) into the ROCArray A, using stream rocstreams[n,i].
# Nothing is done when ol(dim,A) < 2.
# When dim == 1 or amdgpuaware_MPI(dim) is true, the read_x2d! kernel is launched with gpurecvbuf(n,dim,i,A)
# as buffer argument; otherwise read_h2d_async! is called with recvbuf_flat(n,dim,i,A) as buffer argument.
# NOTE(review): this listing comes from a commit view in which removed and added lines appear together; the
# active `if`/`else`/`end` below and the commented-out `# if`/`# else`/`# end` are two versions of the same
# branch - confirm which one is the intended state before changing the code.
function iread_recvbufs!(n::Integer, dim::Integer, A::ROCArray{T}, i::Integer) where T <: GGNumber
if ol(dim,A) >= 2 # There is only a halo and thus a halo update if the overlap is at least 2...
# DEBUG: the following section needs perf testing
# NOTE(review): the remark on the next line mentions the CUDA 3-D memcopy although A is a ROCArray - confirm it applies here.
if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
# DEBUG 2: commenting read_h2d_async! for now
# if dim == 1 || amdgpuaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
# Ranges of A to receive into; entries 1 to 3 are passed to the kernel below.
ranges = recvranges(n, dim, A);
# 32 work-items per group, placed along the 2nd dimension when dim == 1 and along the 1st dimension otherwise.
nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
# Extent of each range, computed as last - first + 1.
halosize = [r[end] - r[1] + 1 for r in ranges];
# Per-dimension ratio halosize / nthreads, rounded up, so that the launch covers the whole extent.
nblocks = Tuple(ceil.(Int, halosize./nthreads));
# Launch the copy kernel on the stream belonging to (n, i).
@roc gridsize=nblocks groupsize=nthreads stream=rocstreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,A), A, ranges[1], ranges[2], ranges[3], dim);
else
# Otherwise copy the flat receive buffer into the receive ranges of A on the same stream.
read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
end
# else
# read_h2d_async!(recvbuf_flat(n,dim,i,A), A, recvranges(n,dim,A), rocstreams[n,i]);
# end
end
end

Expand Down

0 comments on commit c116726

Please sign in to comment.