Commit

first commit
rodburns committed Oct 9, 2018
0 parents commit 8105be5
Showing 6 changed files with 517 additions and 0 deletions.
58 changes: 58 additions & 0 deletions cuda-vec-add.cu
@@ -0,0 +1,58 @@
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <stdio.h>
#include <vector>

// CUDA device kernel
__global__ void vector_add(const float *A, const float *B, float *C,
                           size_t array_size) {
  // calculating the global thread id
  size_t id = blockIdx.x * blockDim.x + threadIdx.x;
  // total number of threads in the grid
  size_t total_threads = gridDim.x * blockDim.x;
  // grid-stride loop so any array size is covered
  for (size_t i = id; i < array_size; i += total_threads) {
    C[i] = A[i] + B[i];
  }
}

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);
  std::vector<float> C(array_size);

  // allocating device memory
  float *A_dev;
  float *B_dev;
  float *C_dev;
  cudaMalloc((void **)&A_dev, array_size * sizeof(float));
  cudaMalloc((void **)&B_dev, array_size * sizeof(float));
  cudaMalloc((void **)&C_dev, array_size * sizeof(float));

  // explicitly copying data from host to device
  cudaMemcpy(A_dev, A.data(), array_size * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(B_dev, B.data(), array_size * sizeof(float),
             cudaMemcpyHostToDevice);

  // querying device properties to choose the launch parameters
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  const size_t max_thread_per_block = prop.maxThreadsPerBlock;
  const size_t num_thread_per_block =
      std::min(max_thread_per_block, array_size);
  const size_t num_block_per_grid =
      (size_t)std::ceil(((float)array_size) / num_thread_per_block);
  // constructing the block size (threads per block)
  dim3 block_size(num_thread_per_block, 1, 1);
  // constructing the number of blocks (grid size)
  dim3 num_blocks(num_block_per_grid, 1, 1);
  // launching and executing the cuda kernel
  vector_add<<<num_blocks, block_size>>>(A_dev, B_dev, C_dev, array_size);
  // returning the result to the host vector
  cudaMemcpy(C.data(), C_dev, array_size * sizeof(float),
             cudaMemcpyDeviceToHost);
  // releasing the cuda memory objects
  cudaFree(A_dev);
  cudaFree(B_dev);
  cudaFree(C_dev);
  return EXIT_SUCCESS;
}
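For comparison, the same vector addition can be expressed in SYCL. The sketch below is illustrative only: it assumes a SYCL 1.2.1 implementation, and the buffer and kernel names are made up for this example. Buffers stand in for the explicit cudaMalloc/cudaMemcpy calls and copy the result back to the host when they go out of scope.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);
  std::vector<float> C(array_size);

  {
    // buffers manage the host data for the duration of this scope
    cl::sycl::buffer<float, 1> A_buf(A.data(), cl::sycl::range<1>(array_size));
    cl::sycl::buffer<float, 1> B_buf(B.data(), cl::sycl::range<1>(array_size));
    cl::sycl::buffer<float, 1> C_buf(C.data(), cl::sycl::range<1>(array_size));

    // queue targeting whichever device the default selector picks
    auto my_queue = cl::sycl::queue(cl::sycl::default_selector{});

    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto a = A_buf.get_access<cl::sycl::access::mode::read>(cgh);
      auto b = B_buf.get_access<cl::sycl::access::mode::read>(cgh);
      auto c = C_buf.get_access<cl::sycl::access::mode::write>(cgh);
      // sycl kernel: one work-item per element
      cgh.parallel_for<class vector_add_sycl>(
          cl::sycl::range<1>(array_size),
          [=](cl::sycl::id<1> i) { c[i] = a[i] + b[i]; });
    });
  } // buffer destruction copies the result back into C

  return 0;
}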
15 changes: 15 additions & 0 deletions pseudo-cuda-mgpu.cpp
@@ -0,0 +1,15 @@
....
int num_gpu;
// finding the available number of NVIDIA GPU devices
cudaGetDeviceCount(&num_gpu);

// looping over the devices and dispatching a kernel to each one
for (int i = 0; i < num_gpu; i++) {
  // selecting the current device
  cudaSetDevice(i);
  // executing my_kernel on the selected device
  my_kernel<<<num_blocks, block_size>>>(...);
  // transferring data between the host and the selected device
  cudaMemcpy(...);
}
....
20 changes: 20 additions & 0 deletions pseudo-sycl-mgpu.cpp
@@ -0,0 +1,20 @@
...;

// getting the list of all supported sycl platforms
auto platform_list = cl::sycl::platform::get_platforms();
// looping over platforms
for (const auto &platform : platform_list) {
  // getting the list of devices from the current platform
  auto device_list = platform.get_devices();
  // looping over devices
  for (const auto &device : device_list) {
    // constructing a queue for the current device
    auto queue = cl::sycl::queue(device);
    // submitting a kernel to the sycl queue
    queue.submit([&](cl::sycl::handler &cgh) {
      ....
      // sycl kernel
      cgh.parallel_for(....);
    });
  }
}
...;
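A concrete version of this loop, reusing the vector-add pattern from above, might look like the following sketch (again assuming SYCL 1.2.1; mgpu_vector_add is an illustrative kernel name). Unlike the CUDA loop in pseudo-cuda-mgpu.cpp, there is no explicit cudaSetDevice or cudaMemcpy: constructing a queue per device selects the device, and the buffer accessors describe the data movement.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t array_size = 256;
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);

  // looping over every platform and every device it exposes
  for (const auto &platform : cl::sycl::platform::get_platforms()) {
    for (const auto &device : platform.get_devices()) {
      std::vector<float> C(array_size);
      {
        cl::sycl::buffer<float, 1> A_buf(A.data(), cl::sycl::range<1>(array_size));
        cl::sycl::buffer<float, 1> B_buf(B.data(), cl::sycl::range<1>(array_size));
        cl::sycl::buffer<float, 1> C_buf(C.data(), cl::sycl::range<1>(array_size));

        // one queue per device; the same kernel is dispatched to each
        auto queue = cl::sycl::queue(device);
        queue.submit([&](cl::sycl::handler &cgh) {
          auto a = A_buf.get_access<cl::sycl::access::mode::read>(cgh);
          auto b = B_buf.get_access<cl::sycl::access::mode::read>(cgh);
          auto c = C_buf.get_access<cl::sycl::access::mode::write>(cgh);
          cgh.parallel_for<class mgpu_vector_add>(
              cl::sycl::range<1>(array_size),
              [=](cl::sycl::id<1> i) { c[i] = a[i] + b[i]; });
        });
      } // buffer destruction copies this device's result back into C
    }
  }
  return 0;
}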
25 changes: 25 additions & 0 deletions pseudo-sycl-sgpu.cpp
@@ -0,0 +1,25 @@
...;

// constructing the queue for a specific device
auto my_queue = cl::sycl::queue(device_selector);

// submitting a kernel to the sycl queue
my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 1
  cgh.parallel_for(....);
});

my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 2
  cgh.parallel_for(....);
});

my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 3
  cgh.parallel_for(....);
});

...;
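Filled in with three hypothetical kernels (scale_kernel, offset_kernel and square_kernel are illustrative names, not part of this code), a runnable sketch of this single-queue pattern could look as follows. Because every submission requests read_write access to the same buffer, the SYCL runtime executes the three kernels in submission order.

#include <CL/sycl.hpp>
#include <vector>

int main() {
  const size_t n = 256;
  std::vector<float> data(n, 1.0f);
  {
    cl::sycl::buffer<float, 1> buf(data.data(), cl::sycl::range<1>(n));
    auto my_queue = cl::sycl::queue(cl::sycl::default_selector{});

    // kernel 1: double every element
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class scale_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] *= 2.0f; });
    });

    // kernel 2: add an offset; ordered after kernel 1 by the accessor dependency
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class offset_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] += 1.0f; });
    });

    // kernel 3: square the result
    my_queue.submit([&](cl::sycl::handler &cgh) {
      auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
      cgh.parallel_for<class square_kernel>(
          cl::sycl::range<1>(n), [=](cl::sycl::id<1> i) { acc[i] *= acc[i]; });
    });
  } // leaving the scope waits for the kernels and copies the data back
  return 0;
}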
