Skip to content

Commit

Permalink
Use 64-bit arithmetic for computing array offsets.
Browse files Browse the repository at this point in the history
This is not a complete solution to #134, because individual array
dimensions must still fit in a signed 32-bit integer.  However, it
does allow the product of dimensions to be large.
  • Loading branch information
athas committed Aug 31, 2020
1 parent 9bba492 commit ec3009b
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 38 deletions.
2 changes: 1 addition & 1 deletion src/Futhark/CodeGen/Backends/GenericC.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1873,7 +1873,7 @@ compileExp = compilePrimExp compileLeaf
return [C.cexp|$id:src[$exp:iexp']|]

compileLeaf (SizeOf t) =
return [C.cexp|(typename int32_t)sizeof($ty:t')|]
return [C.cexp|(typename int64_t)sizeof($ty:t')|]
where t' = primTypeToCType t

-- | Tell me how to compile a @v@, and I'll Compile any @PrimExp v@ for you.
Expand Down
10 changes: 5 additions & 5 deletions src/Futhark/CodeGen/ImpCode.hs
Original file line number Diff line number Diff line change
Expand Up @@ -206,13 +206,13 @@ data Code a = Skip
-- all memory blocks will be freed with this statement.
-- Backends are free to ignore it entirely.
| Copy
VName (Count Bytes (TExp Int32)) Space
VName (Count Bytes (TExp Int32)) Space
VName (Count Bytes (TExp Int64)) Space
VName (Count Bytes (TExp Int64)) Space
(Count Bytes (TExp Int64))
-- ^ Destination, offset in destination, destination
-- space, source, offset in source, offset space, number
-- of bytes.
| Write VName (Count Elements (TExp Int32)) PrimType Space Volatility Exp
| Write VName (Count Elements (TExp Int64)) PrimType Space Volatility Exp
-- ^ @Write mem i t space vol v@ writes the value @v@ to
-- @mem@ offset by @i@ elements of type @t@. The
-- 'Space' argument is the memory space of @mem@
Expand Down Expand Up @@ -310,7 +310,7 @@ data ExpLeaf = ScalarVar VName
-- 'LeafExp' constructor itself.
| SizeOf PrimType
-- ^ The size of a primitive type.
| Index VName (Count Elements (TExp Int32)) PrimType Space Volatility
| Index VName (Count Elements (TExp Int64)) PrimType Space Volatility
-- ^ Reading a value from memory. The arguments have
-- the same meaning as with 'Write'.
deriving (Eq, Show)
Expand Down Expand Up @@ -356,7 +356,7 @@ vi32 :: VName -> TExp Int32
vi32 = TPrimExp . flip var (IntType Int32)

-- | Concise wrapper for using 'Index'.
index :: VName -> Count Elements (TExp Int32) -> PrimType -> Space -> Volatility -> Exp
index :: VName -> Count Elements (TExp Int64) -> PrimType -> Space -> Volatility -> Exp
index arr i t s vol = LeafExp (Index arr i t s vol) t

-- Prettyprinting definitions.
Expand Down
27 changes: 18 additions & 9 deletions src/Futhark/CodeGen/ImpGen.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1055,22 +1055,24 @@ destinationFromPattern pat =
return $ ScalarDestination name

fullyIndexArray :: VName -> [Imp.TExp Int32]
-> ImpM lore r op (VName, Imp.Space, Count Elements (Imp.TExp Int32))
-> ImpM lore r op (VName, Imp.Space, Count Elements (Imp.TExp Int64))
fullyIndexArray name indices = do
arr <- lookupArray name
fullyIndexArray' (entryArrayLocation arr) indices

fullyIndexArray' :: MemLocation -> [Imp.TExp Int32]
-> ImpM lore r op (VName, Imp.Space, Count Elements (Imp.TExp Int32))
-> ImpM lore r op (VName, Imp.Space, Count Elements (Imp.TExp Int64))
fullyIndexArray' (MemLocation mem _ ixfun) indices = do
space <- entryMemSpace <$> lookupMemory mem
let indices' = case space of
ScalarSpace ds _ ->
let (zero_is, is) = splitFromEnd (length ds) indices
in map (const 0) zero_is ++ is
_ -> indices
ixfun64 = fmap sExt64 ixfun
indices64 = fmap sExt64 indices'
return (mem, space,
elements $ IxFun.index ixfun indices')
elements $ IxFun.index ixfun64 indices64)

-- More complicated read/write operations that use index functions.

Expand All @@ -1083,9 +1085,9 @@ copy bt dest destslice src srcslice = do
defaultCopy :: CopyCompiler lore r op
defaultCopy bt dest destslice src srcslice
| Just destoffset <-
IxFun.linearWithOffset (IxFun.slice destIxFun destslice) bt_size,
IxFun.linearWithOffset (IxFun.slice dest_ixfun64 destslice64) bt_size,
Just srcoffset <-
IxFun.linearWithOffset (IxFun.slice srcIxFun srcslice) bt_size = do
IxFun.linearWithOffset (IxFun.slice src_ixfun64 srcslice64) bt_size = do
srcspace <- entryMemSpace <$> lookupMemory srcmem
destspace <- entryMemSpace <$> lookupMemory destmem
if isScalarSpace srcspace || isScalarSpace destspace
Expand All @@ -1098,8 +1100,15 @@ defaultCopy bt dest destslice src srcslice
copyElementWise bt dest destslice src srcslice
where bt_size = primByteSize bt
num_elems = Imp.elements $ product $ sliceDims srcslice
MemLocation destmem _ destIxFun = dest
MemLocation srcmem _ srcIxFun = src

MemLocation destmem _ dest_ixfun = dest
MemLocation srcmem _ src_ixfun = src

dest_ixfun64 = fmap sExt64 dest_ixfun
destslice64 = map (fmap sExt64) destslice
src_ixfun64 = fmap sExt64 src_ixfun
srcslice64 = map (fmap sExt64) srcslice

isScalarSpace ScalarSpace{} = True
isScalarSpace _ = False

Expand Down Expand Up @@ -1292,8 +1301,8 @@ compileAlloc pat _ _ =
typeSize :: Type -> Count Bytes (Imp.TExp Int64)
typeSize t =
Imp.bytes $
isInt64 (sExt Int64 (Imp.LeafExp (Imp.SizeOf $ elemType t) int32)) *
product (map (isInt64 . sExt Int64 . toExp' int32) (arrayDims t))
isInt64 (Imp.LeafExp (Imp.SizeOf $ elemType t) int64) *
product (map (sExt64 . toInt32Exp) (arrayDims t))

--- Building blocks for constructing code.

Expand Down
8 changes: 4 additions & 4 deletions src/Futhark/CodeGen/ImpGen/Kernels.hs
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ callKernelCopy bt
srcspace <- entryMemSpace <$> lookupMemory srcmem
destspace <- entryMemSpace <$> lookupMemory destmem
emit $ Imp.Copy
destmem (bytes destoffset) destspace
srcmem (bytes srcoffset) srcspace $
destmem (bytes $ sExt64 destoffset) destspace
srcmem (bytes $ sExt64 srcoffset) srcspace $
num_elems `Imp.withElemType` bt

| otherwise = sCopy bt destloc destslice srcloc srcslice
Expand Down Expand Up @@ -322,8 +322,8 @@ mapTransposeFunction bt =
sExt64 $
Imp.vi32 x * Imp.vi32 y * isInt32 (Imp.LeafExp (Imp.SizeOf bt) (IntType Int32))
in Imp.Copy
destmem (Imp.Count $ Imp.vi32 destoffset) space
srcmem (Imp.Count $ Imp.vi32 srcoffset) space
destmem (Imp.Count $ sExt64 $ Imp.vi32 destoffset) space
srcmem (Imp.Count $ sExt64 $ Imp.vi32 srcoffset) space
(Imp.Count num_bytes)

callTransposeKernel =
Expand Down
8 changes: 4 additions & 4 deletions src/Futhark/CodeGen/ImpGen/Kernels/Base.hs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ atomicUpdateLocking atomicBinOp lam

(arr', _a_space, bucket_offset) <- fullyIndexArray a bucket

case opHasAtomicSupport space old arr' bucket_offset op of
case opHasAtomicSupport space old arr' (sExt32 <$> bucket_offset) op of
Just f -> sOp $ f $ Imp.var y t
Nothing -> atomicUpdateCAS space t a old bucket x $
x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
Expand Down Expand Up @@ -544,14 +544,14 @@ atomicUpdateLocking _ op = AtomicLocking $ \locking space arrs bucket -> do
-- Critical section
let try_acquire_lock =
sOp $ Imp.Atomic space $
Imp.AtomicCmpXchg int32 old locks' locks_offset
Imp.AtomicCmpXchg int32 old locks' (sExt32 <$> locks_offset)
(untyped $ lockingIsUnlocked locking) (untyped $ lockingToLock locking)
lock_acquired = Imp.vi32 old .==. lockingIsUnlocked locking
-- Even the releasing is done with an atomic rather than a
-- simple write, for memory coherency reasons.
release_lock =
sOp $ Imp.Atomic space $
Imp.AtomicCmpXchg int32 old locks' locks_offset
Imp.AtomicCmpXchg int32 old locks' (sExt32 <$> locks_offset)
(untyped $ lockingToLock locking) (untyped $ lockingToUnlock locking)
break_loop = continue <-- false

Expand Down Expand Up @@ -633,7 +633,7 @@ atomicUpdateCAS space t arr old bucket x do_op = do
do_op
old_bits <- dPrim "old_bits" int32
sOp $ Imp.Atomic space $
Imp.AtomicCmpXchg int32 old_bits arr' bucket_offset
Imp.AtomicCmpXchg int32 old_bits arr' (sExt32 <$> bucket_offset)
(toBits (Imp.var assumed t)) (toBits (Imp.var x t))
old <~~ fromBits (Imp.var old_bits int32)
sWhen (isInt32 (toBits (Imp.var assumed t)) .==. Imp.vi32 old_bits)
Expand Down
6 changes: 3 additions & 3 deletions src/Futhark/CodeGen/ImpGen/Kernels/SegRed.hs
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,8 @@ reductionStageTwo constants segred_pes
sOp $ Imp.MemFence Imp.FenceGlobal
-- Increment the counter, thus stating that our result is
-- available.
sOp $ Imp.Atomic DefaultSpace $ Imp.AtomicAdd Int32 old_counter counter_mem counter_offset $
untyped (1::Imp.TExp Int32)
sOp $ Imp.Atomic DefaultSpace $ Imp.AtomicAdd Int32 old_counter counter_mem
(sExt32 <$> counter_offset) $ untyped (1::Imp.TExp Int32)
-- Now check if we were the last group to write our result. If
-- so, it is our responsibility to produce the final result.
sWrite sync_arr [0] $ untyped $ Imp.vi32 old_counter .==. groups_per_segment - 1
Expand All @@ -656,7 +656,7 @@ reductionStageTwo constants segred_pes
-- races in oclgrind.
sWhen (local_tid .==. 0) $
sOp $ Imp.Atomic DefaultSpace $
Imp.AtomicAdd Int32 old_counter counter_mem counter_offset $
Imp.AtomicAdd Int32 old_counter counter_mem (sExt32 <$> counter_offset) $
untyped $ negate groups_per_segment

sLoopNest (slugShape slug) $ \vec_is -> do
Expand Down
25 changes: 13 additions & 12 deletions src/Futhark/CodeGen/ImpGen/Kernels/Transpose.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ mapTranspose block_dim args t kind =
, dec index_out $ vi32 x_index * height + vi32 y_index

, when (vi32 get_global_id_0 .<. width * height * num_arrays)
(Write odata (elements $ vi32 odata_offset + vi32 index_out) t (Space "global") Nonvolatile $
index idata (elements $ vi32 idata_offset + vi32 index_in) t (Space "global") Nonvolatile)
(Write odata (elements $ sExt64 $ vi32 odata_offset + vi32 index_out) t (Space "global") Nonvolatile $
index idata (elements $ sExt64 $ vi32 idata_offset + vi32 index_in) t (Space "global") Nonvolatile)
]

TransposeLowWidth ->
Expand Down Expand Up @@ -83,10 +83,11 @@ mapTranspose block_dim args t kind =
let i = vi32 j * (tile_dim `quot` elemsPerThread)
in mconcat [ dec index_in $ (vi32 y_index + i) * width + vi32 x_index
, when (vi32 y_index + i .<. height) $
Write block (elements $ (vi32 get_local_id_1 + i) * (tile_dim+1)
+ vi32 get_local_id_0)
Write block (elements $ sExt64 $
(vi32 get_local_id_1 + i) * (tile_dim+1)
+ vi32 get_local_id_0)
t (Space "local") Nonvolatile $
index idata (elements $ vi32 idata_offset + vi32 index_in)
index idata (elements $ sExt64 $ vi32 idata_offset + vi32 index_in)
t (Space "global") Nonvolatile]
, Op $ Barrier FenceLocal
, SetScalar x_index $ untyped $ vi32 get_group_id_1 * tile_dim + vi32 get_local_id_0
Expand All @@ -96,10 +97,10 @@ mapTranspose block_dim args t kind =
let i = vi32 j * (tile_dim `quot` elemsPerThread)
in mconcat [ dec index_out $ (vi32 y_index + i) * height + vi32 x_index
, when (vi32 y_index + i .<. width) $
Write odata (elements $ vi32 odata_offset + vi32 index_out)
Write odata (elements $ sExt64 $ vi32 odata_offset + vi32 index_out)
t (Space "global") Nonvolatile $
index block (elements $ vi32 get_local_id_0 * (tile_dim+1)
+ vi32 get_local_id_1+i)
index block (elements $ sExt64 $
vi32 get_local_id_0 * (tile_dim+1) + vi32 get_local_id_1+i)
t (Space "local") Nonvolatile
]
]
Expand Down Expand Up @@ -166,18 +167,18 @@ mapTranspose block_dim args t kind =
, dec y_index y_in_index
, dec index_in $ vi32 y_index * width + vi32 x_index
, when (vi32 x_index .<. width .&&. vi32 y_index .<. height) $
Write block (elements $ vi32 get_local_id_1 * (block_dim+1) + vi32 get_local_id_0)
Write block (elements $ sExt64 $ vi32 get_local_id_1 * (block_dim+1) + vi32 get_local_id_0)
t (Space "local") Nonvolatile $
index idata (elements $ vi32 idata_offset + vi32 index_in)
index idata (elements $ sExt64 $ vi32 idata_offset + vi32 index_in)
t (Space "global") Nonvolatile
, Op $ Barrier FenceLocal
, SetScalar x_index $ untyped x_out_index
, SetScalar y_index $ untyped y_out_index
, dec index_out $ vi32 y_index * height + vi32 x_index
, when (vi32 x_index .<. height .&&. vi32 y_index .<. width) $
Write odata (elements $ vi32 odata_offset + vi32 index_out)
Write odata (elements $ sExt64 (vi32 odata_offset + vi32 index_out))
t (Space "global") Nonvolatile $
index block (elements $ vi32 get_local_id_0 * (block_dim+1) + vi32 get_local_id_1)
index block (elements $ sExt64 $ vi32 get_local_id_0 * (block_dim+1) + vi32 get_local_id_1)
t (Space "local") Nonvolatile
]

Expand Down
9 changes: 9 additions & 0 deletions tests/big.fut
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Testing big arrays.
-- ==
-- no_opencl compiled input { 2 1100000000 1 1073741823 } output { 255u8 }
-- no_opencl compiled input { 3 1073741824 2 1073741823 } output { 255u8 }
-- structure { Replicate 1 Iota 1 }

let main (n: i32) (m: i32) (i: i32) (j: i32) =
-- The opaque is just to force manifestation.
(opaque (replicate n (tabulate m (\i -> u8.i32 i))))[i,j]

0 comments on commit ec3009b

Please sign in to comment.