In [1]:


# partial sums

def partial_sum_seq(a):
    for i in range(1,len(a)):
        a[i] += a[i-1]
    return

#
# the ladder is the parity of the n'th bit. 
# in the i-th phase the blocks are of size 2^i, from 1 increasing
#   until n/2.
# blocks are 2^i aligned. this can be achieved by masking off the 
#   i-1 least significant bits.
# the predecessor value to a block is defined: it is the array location
#   one before the lowest array location in the block
# LI: each block individually achieves the partial sum over that block
# Loop update: double the block size but updating by total sum
#    of the preceeding block each element of the subsequent block
#
#
# properties:
#    threads: n/2 every phase
#    phases: log(n)
#    memory pattern: broadcast read, n/2 individual writes
#

# question: how is complement defined when the integer has no fixed length in python
# however on the GPU these are thread indices so are 16 bit integers, most likely
# however, this can be avoided by taking the mod and subtracting, among other ways

def partial_sum_ladder(a):
    phase = 1
    phase_mask = ~(phase-1)
    while phase<len(a):
        print(phase)
        
        for thread in range(len(a)):
            if phase & thread != 0:
                prev_rung = (thread&phase_mask)-1
                print(f"\t{thread}, {prev_rung}")
                a[thread] += a[prev_rung]
                
            else:
                # thread sleeps
                None

        phase <<= 1
        phase_mask = ~(phase-1)
        
    return


# the folding method has blocks of size 2^h, supposing there are 2^j 
# such blocks, stack them one above the other such that the 0 cell is
# the uppermost, leftmost cell. The Loop Invariant is that the i-th 
# entry is the sum of all elements that would have been above it in 
# the original array values. 
# Update is to divide each block in half, fold the right half under the
# left half, and add to a cell the value of the corresponding cell 
# just above the cell.
# This update is down in parallel, such that all threads read the current
# cell values and updates simultaneously to the new cell values. 
# As a sequential program this can be done by working from highest 
# indexed blocks to lowest indexed blocks


def partial_sum_folded(a):
    m = len(a)
    n = 1
    while n<m:
        n *= 2
    phase = n//2
    while True:
        print(f'fphase {phase}')
        for i in range(m):
            if (i+phase) < m :
                a[i] += a[i+phase]
        print(a)
        if phase==1:
            break
        phase = phase//2
    return a

# test procedures


#--------


SIZE = 30
a = [1]*SIZE
#fill_array(a)
print(partial_sum_seq(a))

a = [2]*SIZE
partial_sum_ladder(a)
print(a)

None
1
	1, 0
	3, 2
	5, 4
	7, 6
	9, 8
	11, 10
	13, 12
	15, 14
	17, 16
	19, 18
	21, 20
	23, 22
	25, 24
	27, 26
	29, 28
2
	2, 1
	3, 1
	6, 5
	7, 5
	10, 9
	11, 9
	14, 13
	15, 13
	18, 17
	19, 17
	22, 21
	23, 21
	26, 25
	27, 25
4
	4, 3
	5, 3
	6, 3
	7, 3
	12, 11
	13, 11
	14, 11
	15, 11
	20, 19
	21, 19
	22, 19
	23, 19
	28, 27
	29, 27
8
	8, 7
	9, 7
	10, 7
	11, 7
	12, 7
	13, 7
	14, 7
	15, 7
	24, 23
	25, 23
	26, 23
	27, 23
	28, 23
	29, 23
16
	16, 15
	17, 15
	18, 15
	19, 15
	20, 15
	21, 15
	22, 15
	23, 15
	24, 15
	25, 15
	26, 15
	27, 15
	28, 15
	29, 15
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60]


In [47]:
SIZE = 33
a = [1]*SIZE
#fill_array(a)
print(partial_sum_folded(a))

fphase 32
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
fphase 16
[3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
fphase 8
[5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1]
fphase 4
[9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1]
fphase 2
[17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1]
fphase 1
[33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
[33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]


In [10]:
def l_child(n):
    return 2*n+1

def r_child(n):
    return 2*n+2

def in_order(i,d,a):
    a[i] = i 
    lc = l_child(i)
    rc = r_child(i)
    print(f'left: {lc}, right {rc}')
    if d>0:
        in_order(lc,d-1,a)
        in_order(rc,d-1,a)
    return


a = [0]*64
in_order(0,3,a)

left: 1, right 2
left: 3, right 4
left: 7, right 8
left: 15, right 16
left: 17, right 18
left: 9, right 10
left: 19, right 20
left: 21, right 22
left: 5, right 6
left: 11, right 12
left: 23, right 24
left: 25, right 26
left: 13, right 14
left: 27, right 28
left: 29, right 30


In [11]:
a



[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]