-
Notifications
You must be signed in to change notification settings - Fork 7
/
vector_scan.cpp
113 lines (96 loc) · 3.36 KB
/
vector_scan.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/*!
* \brief Scan. Prefix Sum.
* \example: input: 1,2,3,4
* operation: Add
* output: 1,3,6,10 (out[i] = in[0] + ... + in[i], i.e. an inclusive scan)
*/
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <vector>
#include <immintrin.h>
#include "time.h"
#include "xmmintrin.h"
// Fill the first `len` entries of `vec` with the constant 2.0f.
// A constant input keeps every prefix sum exactly representable, so the
// scalar and SIMD results can be compared directly. (A rand()-based
// initializer used to live here and is currently disabled.)
// A non-positive `len` leaves `vec` untouched.
void GenVector(const int len, float *vec) {
    int idx = 0;
    while (idx < len) {
        vec[idx] = 2.0f;
        ++idx;
    }
}
// Scalar reference implementation of the inclusive prefix sum:
//   vec_out[i] = vec_in[0] + vec_in[1] + ... + vec_in[i]
//
// \param vec_in   input array, at least `len` floats
// \param len      number of elements to scan; values <= 0 are a no-op
// \param vec_out  output array, at least `len` floats
void VectorScanNormal(const float *vec_in, const int len, float *vec_out) {
    // Nothing to scan: do not touch element 0 of either buffer.
    if (len <= 0) {
        return;
    }
    vec_out[0] = vec_in[0];
    for (int i = 1; i < len; i++) {
        // Each output is the current input plus the previous running total.
        vec_out[i] = vec_in[i] + vec_out[i - 1];
    }
}
// Inclusive prefix sum of the four floats packed in `x`, done in-register.
//
// The byte-shift intrinsic only exists for integer vectors, so each step
//   1. reinterprets the floats as __m128i (a cast, no conversion),
//   2. shifts the whole register left by N bytes, shifting in zeros
//      (4 bytes = one float, so element k moves to element k+1),
//   3. reinterprets the result back to __m128 and adds it to the input.
//
// \example (1 2 3 4) + (0 1 2 3) = (1 3 5 7)     // shift by one float
//          (1 3 5 7) + (0 0 1 3) = (1 3 6 10)    // shift by two floats
inline __m128 ScanM128(__m128 x) {
    // Step 1: add the vector moved up by one element (4 bytes).
    const __m128i raw_bits = _mm_castps_si128(x);
    const __m128 moved_by_one = _mm_castsi128_ps(_mm_slli_si128(raw_bits, 4));
    const __m128 pair_sums = _mm_add_ps(x, moved_by_one);

    // Step 2: add the partial sums moved up by two elements (8 bytes).
    const __m128i pair_bits = _mm_castps_si128(pair_sums);
    const __m128 moved_by_two = _mm_castsi128_ps(_mm_slli_si128(pair_bits, 8));
    return _mm_add_ps(pair_sums, moved_by_two);
}
// SSE inclusive prefix sum: vec_out[i] = vec_in[0] + ... + vec_in[i].
//
// Processes four floats per iteration with ScanM128 and carries the running
// total between iterations in `offset`. Elements left over when `len` is not
// a multiple of 4 are finished with a scalar loop, so no memory beyond
// `len` elements is read or written.
//
// \param vec_in   input array, at least `len` floats (no alignment required)
// \param len      number of elements to scan; values <= 0 are a no-op
// \param vec_out  output array, at least `len` floats (no alignment required)
void VectorScanSIMDv1(const float *vec_in, const int len, float *vec_out) {
    // Running total of everything before the current group, in all 4 lanes.
    __m128 offset = _mm_setzero_ps();

    // Largest multiple of 4 that does not exceed len (0 or less if len < 4).
    const int simd_end = len - (len % 4);

    int i = 0;
    for (; i < simd_end; i += 4) {
        __m128 x = _mm_loadu_ps(vec_in + i);
        __m128 y = ScanM128(x);
        y = _mm_add_ps(y, offset);
        // Unaligned store: the output pointer is not required to be 16-byte
        // aligned, matching the unaligned load above.
        _mm_storeu_ps(vec_out + i, y);
        // Broadcast lane 3 (the last element, i.e. the total so far) to all
        // lanes so it can be added to the next group.
        offset = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3, 3, 3, 3));
    }

    // Scalar tail for the remaining 0..3 elements.
    float running = _mm_cvtss_f32(offset);
    for (; i < len; i++) {
        running += vec_in[i];
        vec_out[i] = running;
    }
}
// Inclusive prefix sum of the eight floats packed in `x`, done in-register.
//
// AVX shuffles work inside each 128-bit lane, so a whole-register "shift up
// by k elements with zero fill" is assembled from three pieces:
//   t0 - rotate the elements inside each 128-bit lane by k positions,
//   t1 - t0 with its low lane moved to the high lane and the low lane zeroed
//        (permute2f128 control 41 = 0x29: bit 3 zeroes the low half, the
//        high nibble 2 selects the low half of the second operand),
//   blend - take from t1 the elements that wrapped around inside a lane,
//        and everything else from t0.
// Adding the shifted copy for k = 1, 2 and 4 yields the full prefix sum.
inline __m256 ScanM256(__m256 x) {
__m256 t0, t1;
// Shift by one element and add: per lane (a b c d) -> (d a b c); blend mask
// 0x11 replaces elements 0 and 4 with 0 and the previous lane's last element.
t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3));
t1 = _mm256_permute2f128_ps(t0, t0, 41);
x = _mm256_add_ps(x, _mm256_blend_ps(t0, t1, 0x11));
// Shift by two elements and add: per lane (a b c d) -> (c d a b); blend mask
// 0x33 replaces elements 0,1 with zeros and 4,5 with the previous lane's
// last two elements.
t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2));
t1 = _mm256_permute2f128_ps(t0, t0, 41);
x = _mm256_add_ps(x, _mm256_blend_ps(t0, t1, 0x33));
// Shift by four elements and add: the low lane is added onto the high lane
// (the low lane itself receives zeros and is unchanged).
x = _mm256_add_ps(x, _mm256_permute2f128_ps(x, x, 41));
return x;
}
// AVX inclusive prefix sum: vec_out[i] = vec_in[0] + ... + vec_in[i].
//
// Processes eight floats per iteration with ScanM256 and carries the running
// total between iterations in `offset`. Elements left over when `len` is not
// a multiple of 8 are finished with a scalar loop, so no memory beyond
// `len` elements is read or written.
//
// \param vec_in   input array, at least `len` floats (no alignment required)
// \param len      number of elements to scan; values <= 0 are a no-op
// \param vec_out  output array, at least `len` floats (no alignment required)
void VectorScanSIMDv2(const float *vec_in, const int len, float *vec_out) {
    // Running total of everything before the current group, in all 8 lanes.
    __m256 offset = _mm256_setzero_ps();

    // Largest multiple of 8 that does not exceed len (0 or less if len < 8).
    const int simd_end = len - (len % 8);

    int i = 0;
    for (; i < simd_end; i += 8) {
        __m256 x = _mm256_loadu_ps(vec_in + i);
        __m256 y = ScanM256(x);
        y = _mm256_add_ps(y, offset);
        _mm256_storeu_ps(vec_out + i, y);
        // Broadcast the last element (the total so far) to all lanes:
        // first copy the high 128-bit lane into both lanes, then replicate
        // element 3 of each lane.
        __m256 t0 = _mm256_permute2f128_ps(y, y, 0x11);
        offset = _mm256_permute_ps(t0, 0xff);
    }

    // Scalar tail for the remaining 0..7 elements.
    float running = _mm_cvtss_f32(_mm256_castps256_ps128(offset));
    for (; i < len; i++) {
        running += vec_in[i];
        vec_out[i] = running;
    }
}
// Benchmark driver: runs the scalar, SSE and AVX prefix-sum variants `loops`
// times each over the same input and prints the elapsed clock() ticks along
// with the last output element as a quick correctness check.
int main() {
    const int loops = 100;
    const int len = 1000000;

    // std::vector owns the buffers, so they are released on every exit path.
    std::vector<float> vec_in(len);
    std::vector<float> vec_out(len);

    std::srand(0);
    GenVector(len, vec_in.data());

    // clock() returns clock_t (processor time in ticks), not time_t.
    std::clock_t stime = std::clock();
    for (int i = 0; i < loops; i++)
        VectorScanNormal(vec_in.data(), len, vec_out.data());
    std::cout << "Normal -> time: " << std::clock() - stime << ", result: " << vec_out[len - 1] << std::endl;

    // Clear the output before each SIMD run so a stale result from the
    // previous variant cannot be mistaken for a correct one.
    vec_out.assign(len, 0.0f);
    stime = std::clock();
    for (int i = 0; i < loops; i++)
        VectorScanSIMDv1(vec_in.data(), len, vec_out.data());
    std::cout << "SIMDv1 -> time: " << std::clock() - stime << ", result: " << vec_out[len - 1] << std::endl;

    vec_out.assign(len, 0.0f);
    stime = std::clock();
    for (int i = 0; i < loops; i++)
        VectorScanSIMDv2(vec_in.data(), len, vec_out.data());
    std::cout << "SIMDv2 -> time: " << std::clock() - stime << ", result: " << vec_out[len - 1] << std::endl;

#ifdef _WIN32
    // "pause" is a Windows shell command; it keeps the console window open.
    std::system("pause");
#endif
    return 0;
}