/*!
* \file alg_vector_dot_product.cu
* \brief Vector dot product: h_result = SUM(A * B).
*/

#include "cuda_util.h"

#include <cstdio>
#include <cstdlib>
#include <ctime>

// Initialize the input data.
void GenArray(const int len, float *arr) {
  for (int i = 0; i < len; i++) {
    arr[i] = 1; //(float)rand() / RAND_MAX + (float)rand() / (RAND_MAX*RAND_MAX);
  }
}
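
// Note: with the random initialization above commented out, every element is 1.0f,
// so the expected dot product is simply the vector length (1024000 for the data_len
// used in main below), which makes the CPU and GPU results easy to check by eye.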

// CPU version: 965ms
// Plain CPU implementation, used as a reference.
float VectorDotProductCPU(const float *vec_a, const float *vec_b, const int len) {
  float h_result = 0;
  for (int i = 0; i < len; i++) {
    h_result += vec_a[i] * vec_b[i];
  }
  return h_result;
}

// CUDA kernel v1 : 283ms
// Multiply element-wise into shared memory, reduce each block's products to a
// single partial sum, then accumulate the per-block sums into res with atomicAdd.
// Requires BLOCK_SIZE == blockDim.x (a power of two) and len to be a multiple of
// blockDim.x (see the data_len comment in main), so every smem slot is written
// before the reduction starts.
template <int BLOCK_SIZE>
__global__ void VectorDotProductKernelv1(const float *vec_a, const float *vec_b, const int len, float &res) {
  // The grid-stride loop prevents out-of-bounds memory accesses.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < len;
       i += blockDim.x * gridDim.x) {
    __shared__ float smem[BLOCK_SIZE];
    smem[threadIdx.x] = vec_a[i] * vec_b[i];
    __syncthreads();

    //// Naive alternative: let thread 0 sum the whole block serially. Much slower.
    //if (threadIdx.x == 0) {
    //  float sum = 0;
    //  for (int i = 0; i < BLOCK_SIZE; i++)
    //    sum += smem[i];
    //  atomicAdd(&res, sum);
    //}

    // Tree reduction in shared memory: each round halves the number of active
    // threads, so after log2(BLOCK_SIZE) rounds smem[0] holds the block's partial sum.
    int count = BLOCK_SIZE / 2;
    while (count >= 1) {
      if (threadIdx.x < count) {
        smem[threadIdx.x] += smem[count + threadIdx.x];
      }
      // Synchronize the threads within the block,
      // then go to the next round together.
      __syncthreads();
      count /= 2;
    }
    if (threadIdx.x == 0)
      atomicAdd(&res, smem[0]);
  }
}

// CUDA kernel v2 : 201ms
// Each thread loads and multiplies two elements (i and i + len/2) before the
// reduction, so one block now covers two blocks' worth of data. This halves the
// number of shared-memory reductions and atomicAdds compared with v1.
template <int BLOCK_SIZE>
__global__ void VectorDotProductKernelv2(const float *vec_a, const float *vec_b, const int len, float &res) {
  // The grid-stride loop prevents out-of-bounds memory accesses.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < len / 2;
       i += blockDim.x * gridDim.x) {
    __shared__ float smem[BLOCK_SIZE];
    smem[threadIdx.x] = vec_a[i] * vec_b[i] + vec_a[i + len / 2] * vec_b[i + len / 2]; // The main change is here.
    __syncthreads();

    // Same tree reduction as v1.
    int count = BLOCK_SIZE >> 1;
    while (count >= 1) {
      if (threadIdx.x < count) {
        smem[threadIdx.x] += smem[count + threadIdx.x];
      }
      // Synchronize the threads within the block,
      // then go to the next round together.
      __syncthreads();
      count >>= 1;
    }
    if (threadIdx.x == 0)
      atomicAdd(&res, smem[0]);
  }
}

// CUDA kernel v3 : 179ms
// Condition: the block size must be larger than 32.
// Like v2, but the last warp of the reduction is unrolled.
template <int BLOCK_SIZE>
__global__ void VectorDotProductKernelv3(const float *vec_a, const float *vec_b, const int len, float &res) {
  // The grid-stride loop prevents out-of-bounds memory accesses.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < len / 2;
       i += blockDim.x * gridDim.x) {
    __shared__ float smem[BLOCK_SIZE];
    smem[threadIdx.x] = vec_a[i] * vec_b[i] + vec_a[i + len / 2] * vec_b[i + len / 2];
    __syncthreads();

    // Tree reduction down to 64 elements; the remaining rounds fit in one warp.
    for (int count = BLOCK_SIZE >> 1; count > 32; count >>= 1) {
      if (threadIdx.x < count) {
        smem[threadIdx.x] += smem[count + threadIdx.x];
      }
      __syncthreads();
    }

    ////// The main change is here: unroll the last warp.
    ////// (Would it still need __syncthreads() within a single warp?)
    //if (threadIdx.x < 32) {
    //  smem[threadIdx.x] += smem[threadIdx.x + 32]; __syncthreads();
    //  smem[threadIdx.x] += smem[threadIdx.x + 16]; __syncthreads();
    //  smem[threadIdx.x] += smem[threadIdx.x + 8]; __syncthreads();
    //  smem[threadIdx.x] += smem[threadIdx.x + 4]; __syncthreads();
    //  smem[threadIdx.x] += smem[threadIdx.x + 2]; __syncthreads();
    //  smem[threadIdx.x] += smem[threadIdx.x + 1]; __syncthreads();
    //}
    // Regarding the version above, where the last warp still keeps __syncthreads():
    // when writing warp-synchronous code, the shared-memory pointer must be
    // qualified with the volatile keyword; otherwise compiler optimizations may
    // reorder the memory operations and produce incorrect results.
    if (threadIdx.x < 32) {
      volatile float *smem_t = smem;
      if (blockDim.x > 32) {
        smem_t[threadIdx.x] += smem_t[threadIdx.x + 32];
      }
      smem_t[threadIdx.x] += smem_t[threadIdx.x + 16];
      smem_t[threadIdx.x] += smem_t[threadIdx.x + 8];
      smem_t[threadIdx.x] += smem_t[threadIdx.x + 4];
      smem_t[threadIdx.x] += smem_t[threadIdx.x + 2];
      smem_t[threadIdx.x] += smem_t[threadIdx.x + 1];
    }
    if (threadIdx.x == 0)
      atomicAdd(&res, smem[0]);
  }
}
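
// A further variant, sketched here for illustration and not part of the original
// timing comparison: on CUDA 9+ the unrolled last warp can instead use warp shuffle
// intrinsics, which exchange registers directly and need neither the volatile
// shared-memory trick nor extra synchronization within the warp. Like v3, this
// sketch assumes BLOCK_SIZE == blockDim.x is a power of two, at least 64, and that
// len / 2 is a multiple of blockDim.x.
template <int BLOCK_SIZE>
__global__ void VectorDotProductKernelShfl(const float *vec_a, const float *vec_b, const int len, float &res) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < len / 2;
       i += blockDim.x * gridDim.x) {
    __shared__ float smem[BLOCK_SIZE];
    smem[threadIdx.x] = vec_a[i] * vec_b[i] + vec_a[i + len / 2] * vec_b[i + len / 2];
    __syncthreads();

    // Same shared-memory tree reduction as v3, down to 64 elements.
    for (int count = BLOCK_SIZE >> 1; count > 32; count >>= 1) {
      if (threadIdx.x < count) {
        smem[threadIdx.x] += smem[count + threadIdx.x];
      }
      __syncthreads();
    }

    // The first warp pulls its two values into a register and finishes with shuffles.
    if (threadIdx.x < 32) {
      float val = smem[threadIdx.x] + smem[threadIdx.x + 32];
      for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
      }
      if (threadIdx.x == 0)
        atomicAdd(&res, val);
    }
    // Make sure smem is no longer being read before the next grid-stride
    // iteration overwrites it (only matters if the loop runs more than once).
    __syncthreads();
  }
}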

float VectorDotProductCUDA(const int loops, const float *vec_a, const float *vec_b, const int len, float &result) {
  // Time recorder.
  cjmcv_cuda_util::GpuTimer gpu_timer;

  const int threads_per_block = 1024; // data_len % threads_per_block == 0
  const int blocks_per_grid = (len + threads_per_block - 1) / threads_per_block;

  // Warm up.
  VectorDotProductKernelv3<threads_per_block> <<<blocks_per_grid, threads_per_block>>>
    (vec_a, vec_b, len, result);

  gpu_timer.Start();
  for (int i = 0; i < loops; i++) {
    // result is a reference to device memory, so this clears the accumulator
    // on the device before each run.
    cudaMemset(&result, 0, sizeof(float));
    VectorDotProductKernelv3<threads_per_block> <<<blocks_per_grid, threads_per_block>>>
      (vec_a, vec_b, len, result);
  }
  gpu_timer.Stop();

  return gpu_timer.ElapsedMillis();
}
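
// Note on the kernels' `float &res` parameter: main passes *d_result, which only
// binds the reference to the device address (the pointer is never dereferenced on
// the host), so &result inside VectorDotProductCUDA is again the device pointer.
// A more conventional signature would take `float *res` and call atomicAdd(res, ...).
// Also note that the per-block partial sums arrive via atomicAdd in an unspecified
// order; since float addition is not associative, results for general data may differ
// slightly from the CPU sum (with the all-ones input used here they match exactly).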

int main() {
  int ret = cjmcv_cuda_util::InitEnvironment(0);
  if (ret != 0) {
    printf("Failed to initialize the CUDA environment.\n");
    return -1;
  }

  const int loops = 100;
  const int data_len = 1024000; // data_len % threads_per_block == 0
  const int data_mem_size = sizeof(float) * data_len;
  float *h_vector_a = (float *)malloc(data_mem_size);
  float *h_vector_b = (float *)malloc(data_mem_size);
  if (h_vector_a == NULL || h_vector_b == NULL) {
    printf("Failed to allocate host memory.\n");
    return -1;
  }

  // Initialize.
  srand(0);
  GenArray(data_len, h_vector_a);
  GenArray(data_len, h_vector_b);

  // CPU.
  clock_t t = clock();
  float h_result = 0;
  for (int i = 0; i < loops; i++)
    h_result = VectorDotProductCPU(h_vector_a, h_vector_b, data_len);
  float cpu_msec_total = (float)(clock() - t) * 1000 / CLOCKS_PER_SEC;
  printf("\nIn cpu, msec_total = %f, h_result = %f\n", cpu_msec_total, h_result);

  // GPU.
  // Allocate memory on the device.
  float msec_total;
  float *d_vector_a = NULL, *d_vector_b = NULL;
  float *d_result = NULL;
  CUDA_CHECK(cudaMalloc((void **)&d_vector_a, data_mem_size));
  CUDA_CHECK(cudaMalloc((void **)&d_vector_b, data_mem_size));
  CUDA_CHECK(cudaMalloc((void **)&d_result, sizeof(float)));

  // Copy host memory to device.
  CUDA_CHECK(cudaMemcpy(d_vector_a, h_vector_a, data_mem_size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vector_b, h_vector_b, data_mem_size, cudaMemcpyHostToDevice));

  msec_total = VectorDotProductCUDA(loops, d_vector_a, d_vector_b, data_len, *d_result);

  CUDA_CHECK(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
  printf("\nIn gpu, msec_total = %f, h_result = %f\n", msec_total, h_result);

  free(h_vector_a);
  free(h_vector_b);

  cudaFree(d_vector_a);
  cudaFree(d_vector_b);
  cudaFree(d_result);
  cjmcv_cuda_util::CleanUpEnvironment();

  system("pause");
  return 0;
}