// (removed: GitHub page-scrape artifacts — UI text and a line-number gutter, not part of the source)
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution. Events are inserted into a stream
// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memory-copies
// between the host and device). CPU can query CUDA events to determine
// whether GPU has completed tasks.
//
#include "../../common.hpp"
using datum = int;
// Adds `inc_value` to every element of `g_data`, one element per thread.
//
// Precondition: launched on a 1D grid whose total thread count exactly equals
// the element count — there is no `idx < n` bounds guard, so the caller must
// size the grid to cover the data exactly (main() builds its launch config
// from the full array size, and 16M is a multiple of the 512-thread block).
__global__ void increment_kernel(datum *g_data, datum inc_value)
{
    auto idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] += inc_value;
}
// Checks that every element of `data` equals the reference value `x`.
//
// Returns true when all elements match; otherwise prints the first mismatch
// (index, actual, expected) and returns false.
bool correct_output(cuda::span<const int> data, const int x)
{
    for (size_t i = 0; i < data.size(); i++) {
        if (data[i] != x) {
            // %zu is the portable printf conversion for size_t; the previous
            // %lu is only correct where size_t happens to be unsigned long
            // (it is not on LLP64 platforms such as 64-bit Windows).
            printf("Error! data[%zu] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    }
    return true;
}
// Demonstrates CPU/GPU overlap: enqueue an async H2D copy, an increment
// kernel, and a D2H copy on one stream, bracketed by CUDA timing events,
// then do CPU busy-work while polling the end event for completion.
int main(int, char **)
{
std::cout << "asyncAPI Starting...\n";
// This will pick the best possible CUDA capable device
// int devID = findCudaDevice(argc, (const char **)argv);
auto device = cuda::device::current::get();
std::cout << "CUDA device [" << device.name() << "]\n";
// Problem size: 16M ints. Note 16M is an exact multiple of the 512-thread
// block below, so the unguarded kernel covers the array exactly.
const int n = 16 * 1024 * 1024;
// Reference value every element should hold after the increment
// (host buffer is zeroed, kernel adds `value` once per element).
int value = 26;
// allocate host memory
// NOTE(review): presumably pinned (page-locked) allocation, which async
// copies require — confirm against cuda-api-wrappers' host::make_unique_span.
auto a = cuda::memory::host::make_unique_span<datum>(n);
cuda::memory::host::zero(a);
// Device-side buffer of the same element count.
auto d_a = cuda::memory::make_unique_span<datum>(device, n);
// Grid dimensions derived from the overall size; block size 512.
auto launch_config = cuda::launch_config_builder()
.overall_size(n)
.block_size(512).build();
// create cuda event handles
// Both events record timings (needed for time_elapsed_between below).
auto start_event = cuda::event::create(
device,
cuda::event::sync_by_blocking,
cuda::event::do_record_timings,
cuda::event::not_interprocess);
auto end_event = cuda::event::create(
device,
cuda::event::sync_by_blocking,
cuda::event::do_record_timings,
cuda::event::not_interprocess);
auto stream = device.default_stream(); // device.create_stream(cuda::stream::async);
auto cpu_time_start = std::chrono::high_resolution_clock::now();
// All five enqueues below are asynchronous with respect to the host:
// copy in, increment on device, copy back, with events on either side.
stream.enqueue.event(start_event);
stream.enqueue.copy(d_a, a);
stream.enqueue.kernel_launch(increment_kernel, launch_config, d_a.data(), value);
stream.enqueue.copy(a, d_a);
stream.enqueue.event(end_event);
// Deliberately taken BEFORE waiting: this measures only the time the CPU
// spent issuing the calls, not the GPU execution time.
auto cpu_time_end = std::chrono::high_resolution_clock::now();
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter=0;
// Busy-wait poll; each iteration of "CPU work" is just an increment.
while (not end_event.has_occurred())
{
counter++;
}
// NOTE(review): time_elapsed_between presumably yields milliseconds (CUDA
// event-timing convention) — confirm; no unit is printed here.
std::cout << "time spent executing by the GPU: " << std::setprecision(2)
<< cuda::event::time_elapsed_between(start_event, end_event).count() << '\n';
// NOTE(review): .count() here is raw high_resolution_clock ticks (typically
// nanoseconds, but implementation-defined) — not the same unit as above.
std::cout << "time spent by CPU in CUDA calls: " << std::setprecision(2)<< (cpu_time_end - cpu_time_start).count() << '\n';
std::cout << "CPU executed " << counter << " iterations while waiting for GPU to finish\n";
// Safe to read `a` on the host: end_event was enqueued after the D2H copy,
// so its occurrence implies the copy has completed.
auto bFinalResults = correct_output(a, value);
std::cout << (bFinalResults ? "SUCCESS" : "FAILURE") << '\n';
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}