Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8105be5
Showing
6 changed files
with
517 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#include <stdio.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <vector>
|
||
// CUDA device kernel | ||
__global__ void vector_add(const float *A, const float *B, float *C, | ||
size_t array_size) { | ||
// local thread id | ||
size_t id = threadIdx.x; | ||
// calculating global id | ||
size_t total_threads = gridDim.x * blockDim.x; | ||
for (size_t i = id; i < array_size; i += total_threads) { | ||
C[i] = A[i] + B[i]; | ||
} | ||
} | ||
|
||
// Checks a CUDA runtime call result and aborts with a diagnostic on failure.
static void check_cuda(cudaError_t err, const char *what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

int main() {
  const size_t array_size = 256;
  const size_t bytes = array_size * sizeof(float);
  // host vectors: inputs filled with 1.0f, output initially zeroed
  std::vector<float> A(array_size, 1.0f);
  std::vector<float> B(array_size, 1.0f);
  std::vector<float> C(array_size);

  // allocating device memory (every runtime call is checked; an unchecked
  // sticky error makes every later call fail mysteriously)
  float *A_dev;
  float *B_dev;
  float *C_dev;
  check_cuda(cudaMalloc((void **)&A_dev, bytes), "cudaMalloc A_dev");
  check_cuda(cudaMalloc((void **)&B_dev, bytes), "cudaMalloc B_dev");
  check_cuda(cudaMalloc((void **)&C_dev, bytes), "cudaMalloc C_dev");

  // explicitly copying input data from host to device
  check_cuda(cudaMemcpy(A_dev, A.data(), bytes, cudaMemcpyHostToDevice),
             "cudaMemcpy A host->device");
  check_cuda(cudaMemcpy(B_dev, B.data(), bytes, cudaMemcpyHostToDevice),
             "cudaMemcpy B host->device");

  // querying device 0 properties to size the launch configuration
  cudaDeviceProp prop;
  check_cuda(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties");
  const size_t max_thread_per_block = prop.maxThreadsPerBlock;
  const size_t num_thread_per_block =
      std::min(max_thread_per_block, array_size);
  // integer ceiling division avoids the float round-trip of std::ceil
  const size_t num_block_per_grid =
      (array_size + num_thread_per_block - 1) / num_thread_per_block;
  // constructing block size
  dim3 block_size(num_thread_per_block, 1, 1);
  // constructing number of blocks (grid size)
  dim3 num_blocks(num_block_per_grid, 1, 1);
  // launching the cuda kernel; launch-config errors only surface via
  // cudaGetLastError, not via the launch itself
  vector_add<<<num_blocks, block_size>>>(A_dev, B_dev, C_dev, array_size);
  check_cuda(cudaGetLastError(), "vector_add launch");
  // returning the result to the host vector (blocking copy, so it also
  // synchronizes with the kernel and surfaces any execution error)
  check_cuda(cudaMemcpy(C.data(), C_dev, bytes, cudaMemcpyDeviceToHost),
             "cudaMemcpy C device->host");
  // releasing the cuda memory objects
  check_cuda(cudaFree(A_dev), "cudaFree A_dev");
  check_cuda(cudaFree(B_dev), "cudaFree B_dev");
  check_cuda(cudaFree(C_dev), "cudaFree C_dev");
  return EXIT_SUCCESS;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
....
int num_gpu;
// finding the available number of NVIDIA GPU devices
cudaGetDeviceCount(&num_gpu);

// looping over the devices and dispatching a kernel per device
// (the original looped over an undeclared `ngpus`; the counter queried
//  above is `num_gpu`)
for (int i = 0; i < num_gpu; i++) {
  // selecting the current device; subsequent runtime calls target it
  cudaSetDevice(i);
  // executing my_kernel on the selected device
  my_kernel<<<num_blocks, block_size>>>(...);
  // transferring data between the host and the selected device
  cudaMemcpy(...);
}
....
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
...; | ||
|
||
// getting the list of all supported sycl platforms | ||
auto platfrom_list = cl::sycl::platform::get_platforms(); | ||
// getting the list of devices from the platform | ||
auto device_list = platform.get_devices(); | ||
// looping over platforms | ||
for (const auto &platform : platfrom_list) { | ||
// looping over devices | ||
for (const auto &device : device_list) { | ||
auto queue = cl::sycl::queue(device); | ||
// submitting a kernel to a the sycl queue | ||
queue.submit([&](cl::sycl::handler &cgh) { | ||
.... | ||
// sycl kernel | ||
cgh.parallel_for(....); | ||
}); | ||
} | ||
} | ||
...; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
...;

// constructing the queue for a specific device
auto my_queue = cl::sycl::queue(device_selector);

// submitting a kernel to the sycl queue
my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 1
  cgh.parallel_for(....);
});

// submitting a second kernel to the same queue
my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 2
  cgh.parallel_for(....);
});

// submitting a third kernel to the same queue
my_queue.submit([&](cl::sycl::handler &cgh) {
  ....
  // sycl kernel 3
  cgh.parallel_for(....);
});

...;
Oops, something went wrong.