main.cu
#include <cstdio>
#include <cstring>
#include <vector>
#include <algorithm>
#include "src/main.hxx"
using std::vector;
using std::max;
const int THREADS = 4;  // maximum threads per block (sizes the shared cache)
// Sum of squares: 1² + 2² + ... + x² = x(x+1)(2x+1)/6.
inline int sumSquares(int x) {
  return x * (x + 1) * (2 * x + 1) / 6;
}
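// For example, sumSquares(3) = 1 + 4 + 9 = 14. It is used below to compute
// the expected dot product: since a[i] = 2i and b[i] = i, the result is
// 2 * sumSquares(N-1).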
// Each thread computes the pairwise product of multiple components of the
// vectors. Since the vectors have 10 components but there may be fewer
// total threads, each thread computes the pairwise product at its own
// index, then shifts forward by a stride of the total number of threads.
// This continues as long as the index does not exceed the length of the
// vectors. Each thread maintains the running sum of the pairwise products
// it calculates.
//
// Once pairwise product calculation completes, the per-thread sum is
// stored in a shared cache, and all threads in a block sync up to compute
// the sum for the entire block in binary tree fashion (in log N steps).
// The sum of each block is then stored in an array of partial sums, and
// the reduction of these partial sums is completed on the CPU. Hence, our
// dot product is complete.
//
// 1. Compute sum of pairwise products at the respective index, while within bounds.
// 2. Shift to the next component, by a stride of the total number of threads.
// 3. Store per-thread sum in shared cache (for further reduction).
// 4. Wait for all threads within the block to finish.
// 5. Reduce the sums in the cache to a single value in binary tree fashion
//    (a worked trace follows below).
// 6. Store this per-block sum into the partial sums array.
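// For illustration, a trace of the binary tree reduction in step 5,
// assuming a block of 4 threads with per-thread sums s0..s3 in the cache:
//   T=2: cache = {s0+s2, s1+s3, s2, s3}
//   T=1: cache = {s0+s1+s2+s3, s1+s3, s2, s3}
//   T=0: loop ends; cache[0] holds the block's sum (2 = log2(4) steps).
// Note: this halving scheme assumes blockDim.x is a power of two.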
__global__ void kernel(float *c, float *a, float *b, int N) {
  __shared__ float cache[THREADS];
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  int t = threadIdx.x;
  float sum = 0;                           // per-thread sum of float products
  while (i < N) {                          // 1
    sum += a[i] * b[i];                    // 1
    i += blockDim.x * gridDim.x;           // 2
  }
  cache[t] = sum;                          // 3
  __syncthreads();                         // 4
  int T = blockDim.x / 2;                  // 5
  while (T != 0) {                         // 5
    if (t < T) cache[t] += cache[t + T];   // 5
    __syncthreads();                       // 5
    T /= 2;                                // 5
  }
  if (t == 0) c[blockIdx.x] = cache[0];    // 6
}
// 1. Allocate space for 2 vectors A, B (of length 10).
// 2. Define vectors A and B.
// 3. Allocate space for partial sums C (of length "blocks").
// 4. Copy A, B from host memory to device memory (GPU).
// 5. Execute kernel with 2 threads per block, and at least 2 blocks
//    (see the launch arithmetic below).
// 6. Copy partial sums C from device to host memory; cudaMemcpy also
//    waits for the kernel to complete.
// 7. Reduce the partial sums C to a single value, the dot product (on CPU).
// 8. Validate the dot product against the expected value (on CPU).
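// With N = 10 and threads = 2, and assuming ceilDiv() (from src/main.hxx)
// is ceiling division, blocks = max(ceilDiv(10, 2), 2) = 5, so the launch
// below is effectively kernel<<<5, 2>>>: 10 threads striding over the 10
// components, producing 5 per-block partial sums.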
int main() {
  int N = 10;                       // 1
  vector<float> a(N), b(N);         // 1
  size_t NB = N * sizeof(float);    // 1
  for (int i=0; i<N; i++) {         // 2
    a[i] = 2*i;                     // 2
    b[i] = i;                       // 2
  }                                 // 2
  int threads = 2;                  // 3
  int blocks = max(ceilDiv(N, threads), 2);    // 3
  vector<float> cpartial(blocks);   // 3
  size_t NC = blocks * sizeof(float);          // 3
  float *aD, *bD, *cpartialD;       // 4
  TRY( cudaMalloc(&aD, NB) );       // 4
  TRY( cudaMalloc(&bD, NB) );       // 4
  TRY( cudaMalloc(&cpartialD, NC) );           // 4
  TRY( cudaMemcpy(aD, a.data(), NB, cudaMemcpyHostToDevice) );  // 4
  TRY( cudaMemcpy(bD, b.data(), NB, cudaMemcpyHostToDevice) );  // 4
  kernel<<<blocks, threads>>>(cpartialD, aD, bD, N);            // 5
  TRY( cudaMemcpy(cpartial.data(), cpartialD, NC, cudaMemcpyDeviceToHost) );  // 6
  float c = sum(cpartial);          // 7
  printf("a = "); println(a);
  printf("b = "); println(b);
  printf("a .* b = %.1f\n", c);
  float cexpected = 2 * sumSquares(N-1);  // 8
  if (c != cexpected) {                   // 8
    fprintf(stderr, "ERROR: a .* b != %.1f\n", cexpected);
  }
  TRY( cudaFree(cpartialD) );  // free device memory
  TRY( cudaFree(bD) );
  TRY( cudaFree(aD) );
  printf("\n");
  return 0;
}
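
// Sample run, assuming src/main.hxx provides the TRY, ceilDiv, sum, and
// println helpers used above (compile with something like `nvcc main.cu`).
// The dot product of a = {0, 2, 4, ..., 18} and b = {0, 1, ..., 9} is
// 2 * sumSquares(9) = 2 * 285 = 570, so the program should print:
//   a .* b = 570.0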