Fix __load_masked_{32,64} to properly obey the mask. Fixes issue ispc#28

Fixed the implementations of these builtin functions for targets that don't have native masked load instructions so that they do no loads if the vector mask is all off, and only do an (unaligned) vector load if both the first and last element of the mask are on.  Otherwise they serialize and do scalar loads for only the active lanes.  This fixes a number of potential sources of crashes due to accessing invalid memory.
Matt Pharr committed Jul 8, 2011
1 parent 092d288 commit e156651
Showing 3 changed files with 92 additions and 123 deletions.
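
The commit message above describes the strategy; here is a minimal C sketch of the same logic for a hypothetical 4-wide i32 case (the names and types are illustrative only, not ispc's — the actual implementation is the LLVM IR emitted by the m4 macros added to stdlib.m4 below):

#include <stdint.h>
#include <string.h>

/* Sketch of the fixed masked-load behavior for one 4-wide i32 vector.
 * Lane i is active iff bit i of mm (the movmsk-style mask) is set. */
static void masked_load_i32x4(const int32_t *ptr, uint32_t mm, int32_t out[4]) {
    if (mm == 0)
        return;                            /* all lanes off: touch no memory */
    if ((mm & 0x1) && (mm & 0x8)) {
        /* first and last lanes on, so the whole 16-byte span is readable;
         * a single unaligned vector load is safe */
        memcpy(out, ptr, 4 * sizeof(int32_t));
        return;
    }
    for (int lane = 0; lane < 4; ++lane)   /* otherwise serialize: */
        if (mm & (1u << lane))             /* scalar loads for active lanes only */
            out[lane] = ptr[lane];
}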
80 changes: 4 additions & 76 deletions stdlib-sse.ll
@@ -401,82 +401,10 @@ define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip

load:
%ptr = bitcast i8 * %0 to i32 *
%val = load i32 * %ptr

%ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
%ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
%ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
%ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
ret <4 x i32> %ret3

skip:
ret <4 x i32> undef
}

define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip

load:
%ptr = bitcast i8 * %0 to i64 *
%val = load i64 * %ptr

%ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
%ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
%ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
%ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
ret <4 x i64> %ret3

skip:
ret <4 x i64> undef
}

define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip

load:
; if any mask lane is on, just load all of the values
; FIXME: there is a lurking bug here if we straddle a page boundary, the
; next page is invalid to read, but the mask bits are set so that we
; aren't supposed to be reading those elements...
%ptr = bitcast i8 * %0 to <4 x i32> *
%val = load <4 x i32> * %ptr, align 4
ret <4 x i32> %val

skip:
ret <4 x i32> undef
}

define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<4 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip

load:
; if any mask lane is on, just load all of the values
; FIXME: there is a lurking bug here if we straddle a page boundary, the
; next page is invalid to read, but the mask bits are set so that we
; aren't supposed to be reading those elements...
%ptr = bitcast i8 * %0 to <4 x i64> *
%val = load <4 x i64> * %ptr, align 8
ret <4 x i64> %val

skip:
ret <4 x i64> undef
}

load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
51 changes: 4 additions & 47 deletions stdlib-sse4x2.ll
@@ -463,53 +463,10 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

; FIXME: I think this and the next one need to verify that the mask isn't
; all off before doing the load!!! (See e.g. stdlib-sse.ll)

define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to i32 *
%val = load i32 * %ptr

%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
ret <8 x i32> %ret7
}


define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to i64 *
%val = load i64 * %ptr

%ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
%ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
%ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
%ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
%ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
%ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
%ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
%ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
ret <8 x i64> %ret7
}


define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to <8 x i32> *
%val = load <8 x i32> * %ptr, align 4
ret <8 x i32> %val
}


define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast i8 * %0 to <8 x i64> *
%val = load <8 x i64> * %ptr, align 8
ret <8 x i64> %val
}
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
84 changes: 84 additions & 0 deletions stdlib.m4
@@ -1055,6 +1055,90 @@ skip:
'
)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit code to safely load a scalar value and broadcast it across the
;; elements of a vector. Parameters:
;; $1: target vector width
;; $2: element type for which to emit the function (i32, i64, ...)
;; $3: suffix for function name (32, 64, ...)


define(`load_and_broadcast', `
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip

load:
%ptr = bitcast i8 * %0 to $2 *
%val = load $2 * %ptr

%ret0 = insertelement <$1 x $2> undef, $2 %val, i32 0
forloop(i, 1, eval($1-1), `
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
ret <$1 x $2> %ret`'eval($1-1)

skip:
ret <$1 x $2> undef
}
')
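
The broadcast helper only reads memory when at least one lane is on, then replicates the scalar across every lane. A rough C equivalent of what the macro emits for a 4-wide i32 target (illustrative names, not part of the commit):

#include <stdint.h>

/* Sketch of load-and-broadcast: read the scalar only if some lane is
 * active (the address may otherwise be invalid), then replicate it. */
static void load_and_broadcast_i32x4(const int32_t *ptr, uint32_t mm, int32_t out[4]) {
    if (mm == 0)
        return;                    /* all lanes off: do not touch the address */
    int32_t val = *ptr;
    for (int lane = 0; lane < 4; ++lane)
        out[lane] = val;
}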

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit general-purpose code to do a masked load for targets that don't have
;; an instruction to do that. Parameters:
;; $1: target vector width
;; $2: element type for which to emit the function (i32, i64, ...)
;; $3: suffix for function name (32, 64, ...)
;; $4: alignment for elements of type $2 (4, 8, ...)

define(`load_masked', `
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
entry:
%mm = call i32 @__movmsk(<$1 x i32> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing--what the lanes in the middle want turns out to not matter...
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
; if we are not able to do a single vector load, we will accumulate lanes in this memory...
%retptr = alloca <$1 x $2>
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
br i1 %can_vload, label %load, label %loop

load:
%ptr = bitcast i8 * %0 to <$1 x $2> *
%valall = load <$1 x $2> * %ptr, align $4
ret <$1 x $2> %valall

loop:
; loop over the lanes and see if each one is on...
%lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
%lanemask = shl i32 1, %lane
%mask_and = and i32 %mm, %lanemask
%do_lane = icmp ne i32 %mask_and, 0
br i1 %do_lane, label %load_lane, label %lane_done

load_lane:
; yes! do the load and store the result into the appropriate place in the
; allocated memory above
%ptr32 = bitcast i8 * %0 to $2 *
%lane_ptr = getelementptr $2 * %ptr32, i32 %lane
%val = load $2 * %lane_ptr
%store_ptr = getelementptr $2 * %retptr32, i32 %lane
store $2 %val, $2 * %store_ptr
br label %lane_done

lane_done:
%next_lane = add i32 %lane, 1
%done = icmp eq i32 %lane, eval($1-1)
br i1 %done, label %return, label %loop

return:
%r = load <$1 x $2> * %retptr
ret <$1 x $2> %r
}
')
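
As a worked example of the first/last-lane test in the macro above (an illustration, not part of the committed code): for a 4-wide target, eval(1 | (1<<(4-1))) expands to 9 (binary 1001), so the vector-load path is taken exactly when lanes 0 and 3 are both on, which guarantees that the first and last element addresses — and hence the whole vector's span — are readable.

#include <assert.h>

int main(void) {
    /* Mirrors the width-4 expansion: eval(1 | (1 << (4 - 1))) == 9 */
    const unsigned both_ends = 1u | (1u << 3);
    unsigned mm = 0x9;                          /* lanes 0 and 3 on */
    assert((mm & both_ends) == both_ends);      /* -> single vector load */
    mm = 0x6;                                   /* only lanes 1 and 2 on */
    assert((mm & both_ends) != both_ends);      /* -> per-lane scalar loop */
    return 0;
}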

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; packed load and store functions
;;
Expand Down
