Commit

first commit
rodburns committed Oct 9, 2018
0 parents commit 8105be5
Showing 6 changed files with 517 additions and 0 deletions.
58 changes: 58 additions & 0 deletions cuda-vec-add.cu
@@ -0,0 +1,58 @@
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <stdio.h>
#include <vector>

// CUDA device kernel
__global__ void vector_add(const float *A, const float *B, float *C,
                           size_t array_size) {
  // calculating the global thread id
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  // total number of threads in the grid
  size_t total_threads = gridDim.x * blockDim.x;
  // grid-stride loop so any array size is covered
  for (size_t i = id; i < array_size; i += total_threads) {
    C[i] = A[i] + B[i];
  }
}

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);
  std::vector<float> C(array_size);

  // allocating device memory
  float *A_dev;
  float *B_dev;
  float *C_dev;
  cudaMalloc((void **)&A_dev, array_size * sizeof(float));
  cudaMalloc((void **)&B_dev, array_size * sizeof(float));
  cudaMalloc((void **)&C_dev, array_size * sizeof(float));

  // explicitly copying data from host to device
  cudaMemcpy(A_dev, A.data(), array_size * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(B_dev, B.data(), array_size * sizeof(float),
             cudaMemcpyHostToDevice);

  // querying device properties to choose the launch parameters
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  const size_t max_thread_per_block = prop.maxThreadsPerBlock;
  const size_t num_thread_per_block =
      std::min(max_thread_per_block, array_size);
  const size_t num_block_per_grid =
      (size_t)std::ceil(((float)array_size) / num_thread_per_block);
  // constructing the block size (threads per block)
  dim3 block_size(num_thread_per_block, 1, 1);
  // constructing the number of blocks (grid size)
  dim3 num_blocks(num_block_per_grid, 1, 1);
  // launching and executing the cuda kernel
  vector_add<<<num_blocks, block_size>>>(A_dev, B_dev, C_dev, array_size);
  // returning the result to the host vector
  cudaMemcpy(C.data(), C_dev, array_size * sizeof(float),
             cudaMemcpyDeviceToHost);
  // releasing the cuda memory objects
  cudaFree(A_dev);
  cudaFree(B_dev);
  cudaFree(C_dev);
  return EXIT_SUCCESS;
}
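For comparison, the same vector addition can be expressed in SYCL. The sketch below is illustrative only: it assumes a SYCL 1.2.1 implementation, and the buffer and kernel names are made up for this example. Buffers stand in for the explicit cudaMalloc/cudaMemcpy calls and copy the result back to the host when they go out of scope.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);
  std::vector<float> C(array_size);

  {
    // buffers manage the host data for the duration of this scope
    cl::sycl::buffer<float, 1> A_buf(A.data(), cl::sycl::range<1>(array_size));
    cl::sycl::buffer<float, 1> B_buf(B.data(), cl::sycl::range<1>(array_size));
    cl::sycl::buffer<float, 1> C_buf(C.data(), cl::sycl::range<1>(array_size));

    // queue targeting whichever device the default selector picks
    auto my_queue = cl::sycl::queue(cl::sycl::default_selector{});

    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto a = A_buf.get_access<cl::sycl::access::mode::read>(cgh);
      auto b = B_buf.get_access<cl::sycl::access::mode::read>(cgh);
      auto c = C_buf.get_access<cl::sycl::access::mode::write>(cgh);
      // sycl kernel: one work-item per element
      cgh.parallel_for<class vector_add_sycl>(
          cl::sycl::range<1>(array_size),
          [=](cl::sycl::id<1> i) { c[i] = a[i] + b[i]; });
    });
  } // buffer destruction copies the result back into C

  return 0;
}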
15 changes: 15 additions & 0 deletions pseudo-cuda-mgpu.cpp
@@ -0,0 +1,15 @@
....
int num_gpu;
// finding the available number of NVIDIA GPU devices
cudaGetDeviceCount(&num_gpu);

// looping over the devices and dispatching a kernel to each one
for (int i = 0; i < num_gpu; i++) {
  // selecting the current device
  cudaSetDevice(i);
  // executing my_kernel on the selected device
  my_kernel<<<num_blocks, block_size>>>(...);
  // transferring data between the host and the selected device
  cudaMemcpy(...);
}
....
20 changes: 20 additions & 0 deletions pseudo-sycl-mgpu.cpp
@@ -0,0 +1,20 @@
...;

// getting the list of all supported sycl platforms
auto platform_list = cl::sycl::platform::get_platforms();
// looping over platforms
for (const auto &platform : platform_list) {
  // getting the list of devices from the current platform
  auto device_list = platform.get_devices();
  // looping over devices
  for (const auto &device : device_list) {
    // constructing a queue for the current device
    auto queue = cl::sycl::queue(device);
    // submitting a kernel to the sycl queue
    queue.submit([&](cl::sycl::handler &cgh) {
      ....
      // sycl kernel
      cgh.parallel_for(....);
    });
  }
}
...;
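A concrete version of this loop, reusing the vector-add pattern from above, might look like the following sketch (again assuming SYCL 1.2.1; mgpu_vector_add is an illustrative kernel name). Unlike the CUDA loop in pseudo-cuda-mgpu.cpp, there is no explicit cudaSetDevice or cudaMemcpy: constructing a queue per device selects the device, and the buffer accessors describe the data movement.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);

  // looping over every platform and every device it exposes
  for (const auto &platform : cl::sycl::platform::get_platforms()) {
    for (const auto &device : platform.get_devices()) {
      std::vector<float> C(array_size);
      {
        cl::sycl::buffer<float, 1> A_buf(A.data(), cl::sycl::range<1>(array_size));
        cl::sycl::buffer<float, 1> B_buf(B.data(), cl::sycl::range<1>(array_size));
        cl::sycl::buffer<float, 1> C_buf(C.data(), cl::sycl::range<1>(array_size));

        // one queue per device; the same kernel is dispatched to each
        auto queue = cl::sycl::queue(device);
        queue.submit([&](cl::sycl::handler &cgh) {
          auto a = A_buf.get_access<cl::sycl::access::mode::read>(cgh);
          auto b = B_buf.get_access<cl::sycl::access::mode::read>(cgh);
          auto c = C_buf.get_access<cl::sycl::access::mode::write>(cgh);
          cgh.parallel_for<class mgpu_vector_add>(
              cl::sycl::range<1>(array_size),
              [=](cl::sycl::id<1> i) { c[i] = a[i] + b[i]; });
        });
      } // buffer destruction copies this device's result back into C
    }
  }
  return 0;
}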
25 changes: 25 additions & 0 deletions pseudo-sycl-sgpu.cpp
@@ -0,0 +1,25 @@
...;

// constructing the queue for a specific device
auto my_queue = cl::sycl::queue(device_selector);

// submitting a kernel to the sycl queue
my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 1
  cgh.parallel_for(....);
});

my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 2
  cgh.parallel_for(....);
});

my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 3
  cgh.parallel_for(....);
});

...;
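Filled in with three hypothetical kernels (scale_kernel, offset_kernel and square_kernel are illustrative names, not part of this code), a runnable sketch of this single-queue pattern could look as follows. Because every submission requests read_write access to the same buffer, the SYCL runtime executes the three kernels in submission order.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t n = 256;
  std::vector<float> data(n, 1.0f);
  {
    cl::sycl::buffer<float, 1> buf(data.data(), cl::sycl::range<1>(n));
    auto my_queue = cl::sycl::queue(cl::sycl::default_selector{});

    // kernel 1: double every element
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class scale_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] *= 2.0f; });
    });

    // kernel 2: add an offset; ordered after kernel 1 by the accessor dependency
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class offset_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] += 1.0f; });
    });

    // kernel 3: square the result
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class square_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] *= acc[i]; });
    });
  } // leaving the scope waits for the kernels and copies the data back
  return 0;
}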
