-
Notifications
You must be signed in to change notification settings - Fork 724
/
patch-libs_ardour_x86__functions__avx512f.cc
109 lines (87 loc) · 4.49 KB
/
patch-libs_ardour_x86__functions__avx512f.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
--- libs/ardour/x86_functions_avx512f.cc.orig 2023-02-21 01:05:04 UTC
+++ libs/ardour/x86_functions_avx512f.cc
@@ -83,7 +83,7 @@ x86_avx512f_compute_peak(const float *src, uint32_t nf
}
while (frames >= 256) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 256), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -142,7 +142,7 @@ x86_avx512f_compute_peak(const float *src, uint32_t nf
}
while (frames >= 128) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 128), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -176,7 +176,7 @@ x86_avx512f_compute_peak(const float *src, uint32_t nf
}
while (frames >= 64) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 64), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -310,7 +310,7 @@ x86_avx512f_find_peaks(const float *src, uint32_t nfra
}
while (frames >= 256) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 256), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -371,7 +371,7 @@ x86_avx512f_find_peaks(const float *src, uint32_t nfra
}
while (frames >= 128) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 128), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -405,7 +405,7 @@ x86_avx512f_find_peaks(const float *src, uint32_t nfra
}
while (frames >= 64) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 64), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -532,7 +532,7 @@ x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t
// Process the remaining samples 128 at a time
while (frames >= 128) {
- _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(dst + 0);
__m512 x1 = _mm512_load_ps(dst + 16);
@@ -679,8 +679,8 @@ x86_avx512f_mix_buffers_with_gain(float *dst, const fl
// Process the remaining samples 128 at a time
while (frames >= 128) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
- _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 128), _MM_HINT_NTA);
+ _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -836,8 +836,8 @@ x86_avx512f_mix_buffers_no_gain(float *dst, const floa
// Process the remaining samples 128 at a time
while (frames >= 128) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 128), _mm_hint(0));
- _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 128), _MM_HINT_NTA);
+ _mm_prefetch(reinterpret_cast<void const *>(dst + 128), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -987,8 +987,8 @@ x86_avx512f_copy_vector(float *dst, const float *src,
// Process 256 samples at a time
while (frames >= 256) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 256), _mm_hint(0));
- _mm_prefetch(reinterpret_cast<void const *>(dst + 256), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 256), _MM_HINT_NTA);
+ _mm_prefetch(reinterpret_cast<void const *>(dst + 256), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
@@ -1033,8 +1033,8 @@ x86_avx512f_copy_vector(float *dst, const float *src,
// Process remaining samples 64 at a time
while (frames >= 64) {
- _mm_prefetch(reinterpret_cast<void const *>(src + 64), _mm_hint(0));
- _mm_prefetch(reinterpret_cast<void const *>(dst + 64), _mm_hint(0));
+ _mm_prefetch(reinterpret_cast<void const *>(src + 64), _MM_HINT_NTA);
+ _mm_prefetch(reinterpret_cast<void const *>(dst + 64), _MM_HINT_NTA);
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);