In [49]:
#
# summing elements in an array
# Accelerate! GPU course
#
# last-update:
#     30 aug 2022 -bjr: created
#
#


def above_power_2(x):
    i = 1
    while i<x:
        i *= 2
    return i

def total_sum_seq(a):
    s = 0
    for i in range(len(a)):
        s += a[i]
    return s


# this is a simulation of a GPU algorithm; 
# the phases are sequential, and should synchronize
# the threads are parallel 

# properties: 
#    threads used: n/2^i in phase i
#    memory accesss: fully independent

def total_sum_folding(a):
    l_a = above_power_2(len(a))
    
    phase = 0
    # the first fold takes care of arrays not a power of two
    d = l_a//2
    for thread in range(d):
        if (thread+d)<len(a):
            a[thread] += a[thread+d]

    # remaining phasess do not need the length check
    d = d//2
    while d>0:
        phase += 1 # just a notation;
        
        for thread in range(d):
            a[thread] += a[thread+d]
        d = d//2
            
    return

# partial sums

def partial_sum_seq(a):
    for i in range(1,len(a)):
        a[i] += a[i-1]
    return

#
# the ladder is the parity of the n'th bit. 
# in the i-th phase the blocks are of size 2^i, from 1 increasing
#   until n/2.
# blocks are 2^i aligned. this can be achieved by masking off the 
#   i-1 least significant bits.
# the predecessor value to a block is defined: it is the array location
#   one before the lowest array location in the block
# LI: each block individually achieces the partial sum
#   Loop update: add to every element in the block the value of the block predecessor
#
#
# properties:
#    threads: n/2 every phase
#    phases: log(n)
#    memory pattern: broadcast read, n/2 individual writes
#


def partial_sum_ladder(a):
    phase = 1
    phase_mask = ~(phase-1)
    while phase<len(a):
        print(phase)
        
        for thread in range(len(a)):
            if phase & thread != 0:
                prev_rung = (thread&phase_mask)-1
                print(f"\t{thread}, {prev_rung}")
                a[thread] += a[prev_rung]
                
            else:
                # thread sleeps
                None

        phase <<= 1
        phase_mask = ~(phase-1)
        
    return


# test procedures

def fill_array(a):
    for i in range(len(a)):
        a[i] = i
    return a

SIZE = 1024
a = [0]*SIZE
fill_array(a)
print(total_sum_seq(a))
total_sum_folding(a)
print(a[0])

SIZE = 1023
a = [0]*SIZE
fill_array(a)
print(total_sum_seq(a))
total_sum_folding(a)
print(a[0])

SIZE = 30
a = [1]*SIZE
#fill_array(a)
print(partial_sum_seq(a))

a = [2]*SIZE
partial_sum_ladder(a)
print(a)

523776
523776
522753
522753
None
1
	1, 0
	3, 2
	5, 4
	7, 6
	9, 8
	11, 10
	13, 12
	15, 14
	17, 16
	19, 18
	21, 20
	23, 22
	25, 24
	27, 26
	29, 28
2
	2, 1
	3, 1
	6, 5
	7, 5
	10, 9
	11, 9
	14, 13
	15, 13
	18, 17
	19, 17
	22, 21
	23, 21
	26, 25
	27, 25
4
	4, 3
	5, 3
	6, 3
	7, 3
	12, 11
	13, 11
	14, 11
	15, 11
	20, 19
	21, 19
	22, 19
	23, 19
	28, 27
	29, 27
8
	8, 7
	9, 7
	10, 7
	11, 7
	12, 7
	13, 7
	14, 7
	15, 7
	24, 23
	25, 23
	26, 23
	27, 23
	28, 23
	29, 23
16
	16, 15
	17, 15
	18, 15
	19, 15
	20, 15
	21, 15
	22, 15
	23, 15
	24, 15
	25, 15
	26, 15
	27, 15
	28, 15
	29, 15
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60]
