From e156651190fefcb9426e49cdc9f5075f142f5dcd Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 8 Jul 2011 11:21:11 +0100 Subject: [PATCH] Fix __load_masked_{32,64} to properly obey the mask. Fixes issue #28. Fixed the implementations of these builtin functions for targets that don't have native masked load instructions so that they do no loads if the vector mask is all off, and only do an (unaligned) vector load if both the first and last element of the mask are on. Otherwise they serialize and do scalar loads for only the active lanes. This fixes a number of potential sources of crashes due to accessing invalid memory. --- stdlib-sse.ll | 80 +++------------------------------------------ stdlib-sse4x2.ll | 51 +++-------------------------- stdlib.m4 | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 123 deletions(-) diff --git a/stdlib-sse.ll b/stdlib-sse.ll index 77d5287368..345fdded7a 100644 --- a/stdlib-sse.ll +++ b/stdlib-sse.ll @@ -401,82 +401,10 @@ define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounw ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts -define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline { - ; must not load if the mask is all off; the address may be invalid - %mm = call i32 @__movmsk(<4 x i32> %mask) - %any_on = icmp ne i32 %mm, 0 - br i1 %any_on, label %load, label %skip - -load: - %ptr = bitcast i8 * %0 to i32 * - %val = load i32 * %ptr - - %ret0 = insertelement <4 x i32> undef, i32 %val, i32 0 - %ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1 - %ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2 - %ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3 - ret <4 x i32> %ret3 - -skip: - ret <4 x i32> undef -} - -define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline { - ; must not load if the mask is all off; the address may be invalid - %mm = call i32 @__movmsk(<4 x i32> %mask) - %any_on = icmp ne i32 %mm, 0 - br i1 %any_on, label %load, label %skip - -load: - %ptr = bitcast i8 * %0 to i64 * - %val = load i64 * %ptr - - %ret0 = insertelement <4 x i64> undef, i64 %val, i32 0 - %ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1 - %ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2 - %ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3 - ret <4 x i64> %ret3 - -skip: - ret <4 x i64> undef -} - -define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline { - %mm = call i32 @__movmsk(<4 x i32> %mask) - %any_on = icmp ne i32 %mm, 0 - br i1 %any_on, label %load, label %skip - -load: - ; if any mask lane is on, just load all of the values - ; FIXME: there is a lurking bug here if we straddle a page boundary, the - ; next page is invalid to read, but the mask bits are set so that we - ; aren't supposed to be reading those elements... - %ptr = bitcast i8 * %0 to <4 x i32> * - %val = load <4 x i32> * %ptr, align 4 - ret <4 x i32> %val - -skip: - ret <4 x i32> undef -} - -define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline { - %mm = call i32 @__movmsk(<4 x i32> %mask) - %any_on = icmp ne i32 %mm, 0 - br i1 %any_on, label %load, label %skip - -load: - ; if any mask lane is on, just load all of the values - ; FIXME: there is a lurking bug here if we straddle a page boundary, the - ; next page is invalid to read, but the mask bits are set so that we - ; aren't supposed to be reading those elements... 
- %ptr = bitcast i8 * %0 to <4 x i64> * - %val = load <4 x i64> * %ptr, align 8 - ret <4 x i64> %val - -skip: - ret <4 x i64> undef -} - +load_and_broadcast(4, i32, 32) +load_and_broadcast(4, i64, 64) +load_masked(4, i32, 32, 4) +load_masked(4, i64, 64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll index 83baaecd0e..1b57cdc614 100644 --- a/stdlib-sse4x2.ll +++ b/stdlib-sse4x2.ll @@ -463,53 +463,10 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts -; FIXME: I think this and the next one need to verify that the mask isn't -; all off before doing the load!!! (See e.g. stdlib-sse.ll) - -define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast i8 * %0 to i32 * - %val = load i32 * %ptr - - %ret0 = insertelement <8 x i32> undef, i32 %val, i32 0 - %ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1 - %ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2 - %ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3 - %ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4 - %ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5 - %ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6 - %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7 - ret <8 x i32> %ret7 -} - - -define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast i8 * %0 to i64 * - %val = load i64 * %ptr - - %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0 - %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1 - %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2 - %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3 - %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4 - %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5 - %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6 - %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7 - ret <8 x i64> %ret7 -} - - -define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast i8 * %0 to <8 x i32> * - %val = load <8 x i32> * %ptr, align 4 - ret <8 x i32> %val -} - - -define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast i8 * %0 to <8 x i64> * - %val = load <8 x i64> * %ptr, align 8 - ret <8 x i64> %val -} +load_and_broadcast(8, i32, 32) +load_and_broadcast(8, i64, 64) +load_masked(8, i32, 32, 4) +load_masked(8, i64, 64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter diff --git a/stdlib.m4 b/stdlib.m4 index 50fbc9cc59..797aeb51a3 100644 --- a/stdlib.m4 +++ b/stdlib.m4 @@ -1055,6 +1055,90 @@ skip: ' ) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Emit code to safely load a scalar value and broadcast it across the +;; elements of a vector. Parameters: +;; $1: target vector width +;; $2: element type for which to emit the function (i32, i64, ...) +;; $3: suffix for function name (32, 64, ...) 
+
+
+define(`load_and_broadcast', `
+define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+  ; must not load if the mask is all off; the address may be invalid
+  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  %any_on = icmp ne i32 %mm, 0
+  br i1 %any_on, label %load, label %skip
+
+load:
+  %ptr = bitcast i8 * %0 to $2 *
+  %val = load $2 * %ptr
+
+  %ret0 = insertelement <$1 x $2> undef, $2 %val, i32 0
+  forloop(i, 1, eval($1-1), `
+  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
+  ret <$1 x $2> %ret`'eval($1-1)
+
+skip:
+  ret <$1 x $2> undef
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Emit general-purpose code to do a masked load for targets that don't have
+;; an instruction to do that.  Parameters:
+;; $1: target vector width
+;; $2: element type for which to emit the function (i32, i64, ...)
+;; $3: suffix for function name (32, 64, ...)
+;; $4: alignment for elements of type $2 (4, 8, ...)
+
+define(`load_masked', `
+define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+entry:
+  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  ; if the first lane and the last lane are on, then it is safe to do a vector load
+  ; of the whole thing--what the lanes in the middle want turns out not to matter...
+  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
+  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+  ; if we are not able to do a single vector load, we will accumulate lanes in this memory...
+  %retptr = alloca <$1 x $2>
+  %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
+  br i1 %can_vload, label %load, label %loop
+
+load:
+  %ptr = bitcast i8 * %0 to <$1 x $2> *
+  %valall = load <$1 x $2> * %ptr, align $4
+  ret <$1 x $2> %valall
+
+loop:
+  ; loop over the lanes and see if each one is on...
+  %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
+  %lanemask = shl i32 1, %lane
+  %mask_and = and i32 %mm, %lanemask
+  %do_lane = icmp ne i32 %mask_and, 0
+  br i1 %do_lane, label %load_lane, label %lane_done
+
+load_lane:
+  ; yes!  do the load and store the result into the appropriate place in the
+  ; alloca'ed memory above
+  %ptr32 = bitcast i8 * %0 to $2 *
+  %lane_ptr = getelementptr $2 * %ptr32, i32 %lane
+  %val = load $2 * %lane_ptr
+  %store_ptr = getelementptr $2 * %retptr32, i32 %lane
+  store $2 %val, $2 * %store_ptr
+  br label %lane_done
+
+lane_done:
+  %next_lane = add i32 %lane, 1
+  %done = icmp eq i32 %lane, eval($1-1)
+  br i1 %done, label %return, label %loop
+
+return:
+  %r = load <$1 x $2> * %retptr
+  ret <$1 x $2> %r
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
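
For reference, the behavior that the load_masked() macro generates can be
summarized in plain C.  This is only a rough sketch of the policy described
in the commit message, not the emitted code itself; the 4-wide width, the
int32_t element type, and the names (WIDTH, lane_is_on, load_masked_i32)
are illustrative assumptions, not ispc identifiers.

#include <stdint.h>
#include <string.h>

#define WIDTH 4

/* mask_bits is a movmsk-style bitmask with one bit per lane */
static int lane_is_on(uint32_t mask_bits, int lane) {
    return (mask_bits >> lane) & 1;
}

static void load_masked_i32(const int32_t *ptr, uint32_t mask_bits,
                            int32_t result[WIDTH]) {
    if (mask_bits == 0)
        return;  /* all lanes off: must not touch memory; result is left
                    undefined, mirroring the IR's "ret ... undef" */

    const uint32_t ends = 1u | (1u << (WIDTH - 1));
    if ((mask_bits & ends) == ends) {
        /* First and last lanes are on, so the first and last element
           addresses are both readable; a single unaligned vector load
           of the whole thing is safe. */
        memcpy(result, ptr, WIDTH * sizeof(int32_t));
        return;
    }

    /* Otherwise serialize: scalar loads for the active lanes only. */
    for (int i = 0; i < WIDTH; ++i)
        if (lane_is_on(mask_bits, i))
            result[i] = ptr[i];
}

The first/last check is sufficient because every lane in between lies
between two addresses that are known to be readable, so with page-granularity
memory protection the middle lanes cannot fault.  The load_and_broadcast()
macro needs only the all-off check, since it reads a single scalar element.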