Only one inter-grup kernel gets generated that runs out of shared memory (need also a "safe" one) #1610

coancea · 2022-03-18T14:26:55Z

The TSP.fut program below would fail on some datasets because it runs out of shared memory.
The reason is that only one intra-group version is generated, which is of course not valid for any dataset.

-- ==
--
-- compiled input @ ../Data/berlin52.txt
--
-- compiled input @ ../Data/swiss.txt


module type rand ={
    type rng
    val init : i32 -> rng
    val rand : rng -> (rng, i32)
    val split : rng -> (rng,rng)
    val split_n : (n: i64) -> rng -> [n]rng
}

-- random module taken from Futharks webpage
module lcg : rand = {
    type rng = u32
    
    def addConst : u32 = 1103515245
    def multConst : u32 = 12345
    def modConst : u32 = 1<<31

    def rand rng = 
        let rng' = (addConst * rng + multConst) % modConst
        in (rng', i32.u32 rng')
    
    
    def init (x: i32) : u32 =
        let x = u32.i32 x
        let x =((x >> 16) ^ x) * 0x45d9f3b   
        let x =((x >> 16) ^ x) * 0x45d9f3b
        let x =((x >> 16) ^ x)
        in x
    def split (rng: rng) = 
        (init (i32.u32 (rand rng).0),
         init (i32.u32 rng))

    def split_n n rng = 
        tabulate n (\i -> init (i32.u32 rng ^ i32.i64 i))
}

--def rand_i32 (rng: lcg.rng) (bound: i32) =
--    let (rng,x) = lcg.rand rng
--    in (rng, x % bound)


-- This function swaps the two edges that produce the lowest cost
let swap [m] (i : i32) (j : i32) (tour : [m]i32) : [m]i32 =
    let minI = i+1 in 
    map i32.i64 (iota m) |>
    map(\ind ->
        if ind < minI || ind > j then
            tour[ind]
        else
            tour[j - (ind - minI)]
    ) 

def rand_nonZero (rng: lcg.rng) (bound: i32) =
    let (rng,x) = lcg.rand rng
    in  if (x % bound) > 0 then (rng, x % bound) else (rng, (x % bound) + 1)
    
let mkRandomTour (offset:i64) (cities:i32) : []i32 = 
    let rng = lcg.init (i32.i64 offset)
    --let randIndArr = map (\i -> 
    --    if i == 0 then rand_i32 rng cities
    --              else rand_i32 (i-1).0 cities ) iota cities 
    let initTour = map (\i -> if i == (i64.i32 cities) then 0
                              else i) (iota ((i64.i32 cities)+1)) |> map i32.i64
    let intialI = rand_nonZero rng (cities)
    let intialJ = rand_nonZero intialI.0 (cities)
    let randomSwaps = rand_nonZero intialJ.0 100
    let rs = loop (intialI,intialJ,initTour) for i < randomSwaps.1 do
                let intI = rand_nonZero intialI.0 (cities)
                let intJ = rand_nonZero intI.0 (cities)
                let swappedTour = swap intialI.1 intialJ.1 initTour
                in (intI, intJ, swappedTour)
    in rs.2

    
-- mkFlagArray is taken from PMPH lecture notes p. 48
let mkFlagArray 't [m] 
        (aoa_shp: [m]i32) (zero: t)
        (aoa_val: [m]t) : []t =
    let shp_rot = map (\i -> if i == 0 then 0
                             else aoa_shp[i-1]
                      ) (iota m)
    let shp_scn = scan (+) 0 shp_rot    
    let aoa_len = shp_scn[m-1]+ aoa_shp[m-1] |> i64.i32
    let shp_ind = map2 (\shp ind -> 
                        if shp == 0 then -1
                        else ind
                        ) aoa_shp shp_scn
    in scatter (replicate aoa_len zero) (map i64.i32 shp_ind) aoa_val

-- segmented_scan is taken from PMPH Futhark code 
let segmented_scan [n] 't (op: t -> t -> t) (ne: t)
                          (flags: [n]bool) (arr: [n]t) : [n]t =
    let (_, res) = unzip <|
        scan (\(x_flag,x) (y_flag,y) ->
                let fl = x_flag || y_flag
                let vl = if y_flag then y else op x y
                in  (fl, vl)
            ) (false, ne) (zip flags arr)
    in  res
     
-- Comparator function used in forLoops in the reduce part
let changeComparator (t1 : (i32, i32, i32)) (t2: (i32, i32, i32)) : (i32, i32, i32) =
    if t1.0 < t2.0 then t1
    else 
        if t1.0 == t2.0 then 
            if t1.1 < t2.1 then t1
            else 
                if t1.1 == t2.1 then
                    if t1.2 < t2.2 then t1
                    else t2
                else t2
        else t2

-- finds the best cost of two input costs
let costComparator (cost1: i32) (cost2 :( i32)) : i32 = 
    if cost1 < cost2 then cost1 else cost2

-- findMinChange is the parallel implementation of the two for loops
-- in the 2-opt move algorithm
let findMinChange [m] [n] [x] [y] (dist : [m]i32) (tour : [n]i32) 
                                (Iarr : [x]i32) (Jarr : [y]i32) 
                                (cities : i32) (totIter : i64) : (i32, i32, i32) =
    let changeArr = map (\ind -> 
                        let i = Iarr[ind]
                        let iCity = tour[i]
                        let iCityp1 = tour[i+1]
                        let j = Jarr[ind] + i + 2
                        let jCity = tour[j]
                        let jCityp1 = tour[j+1]
                        in  ((dist[iCity * cities + jCity] + 
                            dist[iCityp1 * cities + jCityp1] - 
                            (dist[iCity * cities + iCityp1] + 
                            dist[jCity * cities + jCityp1])), i, j) 
                        ) (iota totIter)
    in reduce changeComparator (2147483647, -1, -1) changeArr


-- 2-opt algorithm
let twoOptAlg [m] [n] [x] [y] (distM : [m]i32) (tour : [n]i32) 
                            (Iarr : [x]i32) (Jarr : [y]i32) 
                            (cities : i32) (totIter : i64) : []i32 =
    let twoOpt xs =
        let minChange = findMinChange distM xs Iarr Jarr cities totIter
        in 
            if minChange.0 < 0 then (swap minChange.1 minChange.2 xs, minChange.0) 
                else (xs, minChange.0)
    let rs = loop (xs, cond) = (tour, -1)  while cond < 0 do twoOpt xs 
    in rs.0

-- Compute cost of tour
let cost [n] [m] (tour : [n]i32) (distM : [m]i32) : i32 = 
        map (\i ->  distM[tour[i] * i32.i64(n-1) + tour[i+1]]
            ) (iota (n-1)) |> reduce (+) 0

-- Generate flagArray
let flagArrayGen (cities : i32) : ([]i32)=
    let len = i64.i32 (cities-2)
    let aoa_val = replicate len 1i32
    let temp = map (+1) (iota len) |> map i32.i64
    let aoa_shp = map (\i ->
                       temp[len - i - 1] 
                      ) (iota len)
    in mkFlagArray aoa_shp 0i32 aoa_val
--[m] 
let main [m] (cities : i32) (numRestarts : i64) 
         (distM : [m]i32) : i32 =
    --let cities = 5
    let totIter = ((cities-1)*(cities-2))/2 |> i64.i32
    --let initTour = (iota (i64.i32 cities+1)) |> 
                    --map(\i -> (i+1)*10)
     
    --let oldCost = cost tour distM  
    --let distM = [0,4,6,8,3,
    --             4,0,4,5,2,
    --             6,4,0,2,3,
    --             8,5,2,0,4,
    --             3,2,3,4,0]

    let flagArr = flagArrayGen cities
    let Iarr = scan (+) 0i32 flagArr |> map (\x -> x-1)
    let Jarr = segmented_scan (+) 0i32 (map bool.i32 (flagArr :> [totIter]i32)) (replicate totIter 1i32) |> map (\x -> x-1)
    let allCosts = map(\ind -> 
            let tour = mkRandomTour ((ind+1)*13) cities
            let minTour = twoOptAlg distM tour Iarr Jarr cities totIter
            in cost minTour distM
        )(iota numRestarts)
    in reduce costComparator 2147483647 allCosts
    --in mkRandomTour 125 cities

The compilation of the TSP program generates only one intra-group version

$ futhark cuda TSP.fut
$ ./TSP --print-params
builtin#replicate_i32.group_size_15895 (group_size)
builtin#replicate_i8.group_size_15798 (group_size)
main.segmap_group_size_12195 (group_size)
main.segmap_group_size_12209 (group_size)
main.segmap_group_size_12237 (group_size)
main.segmap_group_size_12261 (group_size)
main.segred_group_size_14034 (group_size)
main.segred_num_groups_14036 (num_groups)
main.segscan_group_size_12185 (group_size)
main.segscan_group_size_12199 (group_size)
main.segscan_group_size_12227 (group_size)
main.segscan_num_groups_12187 (num_groups)
main.segscan_num_groups_12201 (num_groups)
main.segscan_num_groups_12229 (num_groups)

The intra-group version fails because it runs out of shared memory:

$ ./TSP -t /dev/stderr -r 2 -P < swiss.txt 
371
883
1273i32
Peak memory usage for space 'device': 24257 bytes.
copy_dev_to_dev                       ran     0 times; avg:        0us; total:        0us
copy_dev_to_host                      ran     0 times; avg:        0us; total:        0us
copy_host_to_dev                      ran     0 times; avg:        0us; total:        0us
copy_scalar_to_dev                    ran     0 times; avg:        0us; total:        0us
copy_scalar_from_dev                  ran     2 times; avg:       13us; total:       26us
builtin#replicate_i32.replicate_15892 ran     1 times; avg:        5us; total:        5us
builtin#replicate_i8.replicate_15795  ran     3 times; avg:        4us; total:       12us
main.segmap_12193                     ran     1 times; avg:        4us; total:        4us
main.segmap_12222                     ran     1 times; avg:        4us; total:        4us
main.segmap_12252                     ran     1 times; avg:        4us; total:        4us
main.segmap_12275                     ran     1 times; avg:        3us; total:        3us
main.segmap_intragroup_14305          ran     1 times; avg:      293us; total:      293us
main.segred_nonseg_14818              ran     1 times; avg:        5us; total:        5us
main.segscan_12191                    ran     1 times; avg:        8us; total:        8us
main.segscan_12205                    ran     1 times; avg:        6us; total:        6us
main.segscan_12233                    ran     1 times; avg:        9us; total:        9us
15 operations with cumulative runtime:    379us

./TSP < berlin52.txt 
./TSP: TSP.c:7790: CUDA call
  cuLaunchKernel(ctx->mainzisegmap_intragroup_14305, grid[0], grid[1], grid[2], computed_group_sizze_14048, 1, 1, 0 + (shared_sizze_16355 + (8 - shared_sizze_16355 % 8) % 8) + (shared_sizze_16357 + (8 - shared_sizze_16357 % 8) % 8) + (shared_sizze_16359 + (8 - shared_sizze_16359 % 8) % 8) + (shared_sizze_16361 + (8 - shared_sizze_16361 % 8) % 8) + (shared_sizze_16363 + (8 - shared_sizze_16363 % 8) % 8) + (shared_sizze_16365 + (8 - shared_sizze_16365 % 8) % 8), ((void *)0), kernel_args_16352, ((void *)0))
failed with error code 1 (invalid argument)

The text was updated successfully, but these errors were encountered:

athas · 2022-03-18T15:00:53Z

I suspect this is because the other versions fail during memory expansion because they require thread-variant allocations. That's the only situation that could theoretically result in only the intra-group version being available.

(Unless it's not really a version, but instead something that is tiled.)

athas mentioned this issue Mar 20, 2022

Handle conditionals properly in memory expansion #1612

Open

athas closed this as completed in 4005204 Mar 20, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Only one inter-grup kernel gets generated that runs out of shared memory (need also a "safe" one) #1610

Only one inter-grup kernel gets generated that runs out of shared memory (need also a "safe" one) #1610

coancea commented Mar 18, 2022

athas commented Mar 18, 2022

Only one inter-grup kernel gets generated that runs out of shared memory (need also a "safe" one) #1610

Only one inter-grup kernel gets generated that runs out of shared memory (need also a "safe" one) #1610

Comments

coancea commented Mar 18, 2022

athas commented Mar 18, 2022