## Maintain 2 heaps over a stream of integers so that the running median can be read in constant time at any point. The min-heap holds the larger half of the numbers and the max-heap the smaller half. When the heaps are the same size, or the max-heap holds one more item, the median is the root of the max-heap (the assignment's tiebreaker goes to the smaller of the middle 2 after an even number of items have been seen); when the min-heap holds one more item, the median is the root of the min-heap.

In [96]:
## Read the 'stream' of 10K integers into the notebook, one raw text line per entry
with open('Downloads/Median.txt') as f:
    ints = list(f)

In [97]:
len(ints)


10000

In [98]:
len(set(ints))  # are they unique?


10000

In [99]:
## ints still holds the raw text lines here, so max/min compare strings
## lexicographically ('9999\n' sorts after '10000\n'); no value is actually missing.
print(max(ints),min(ints))

9999
 1



In [100]:
'10000\n' in ints

True

In [101]:
# Turn every raw text line into an int, dropping its trailing newline first.
ints = list(map(lambda line: int(line.strip('\n')), ints))

In [102]:
10000 in ints

True

In [103]:
print(max(ints), min(ints))

10000 1


### Now create the 2 heaps and a running list of the median after each insert

In [112]:
def insert(num, heapmax, heapmin):
    '''
    Add @num to the two-heap structure and return the median of everything seen so far.

    @heapmax is a list used as a max-heap holding the smaller half of the values.
    @heapmin is a list used as a min-heap holding the larger half of the values.
    Both lists are modified in place, and their lengths are expected to differ by
    at most one on entry.  They may start out empty, or each seeded with an
    infinite sentinel (-inf in @heapmax, +inf in @heapmin).

    After an even number of items the smaller of the two middle values is returned.
    '''
    if len(heapmax) == len(heapmin):
        # An empty min-heap has no root to compare against (first insert without
        # sentinels), so the very first value always goes to the max-heap.
        if not heapmin or num < heapmin[0]:
            bubbleMaxUp(heapmax, num)
            return heapmax[0]
        # The min-heap now holds one more item, so its root is the median.
        bubbleMinUp(heapmin, num)
        return heapmin[0]
    if len(heapmax) < len(heapmin):
        if num < heapmin[0]:
            bubbleMaxUp(heapmax, num)
            return heapmax[0]
        bubbleMaxUp(heapmax, heapmin[0])  # add smallest big number to maxheap as the root
        bubbleMinDown(heapmin, num)  # new big number replaces the minheap root and sinks into place
        return heapmax[0]
    # From here on the max-heap holds one more item than the min-heap.
    if num > heapmax[0]:
        bubbleMinUp(heapmin, num)
        return heapmax[0]
    bubbleMinUp(heapmin, heapmax[0]) # add biggest small number to minheap as the root
    bubbleMaxDown(heapmax, num)  # new small number replaces the maxheap root and sinks into place
    return heapmax[0]

In [89]:
def swap(arr, i, j):
    '''Exchange the items at positions @i and @j of @arr, in place.'''
    arr[i], arr[j] = arr[j], arr[i]

In [90]:
def bubbleMaxUp(heap, n):
    '''
    Push @n onto the max-heap @heap (a list, modified in place).
    The new value starts as the last leaf and climbs while its parent is not
    strictly greater than it.
    '''
    heap.append(n)
    child = len(heap) - 1
    while child > 0:
        parent = (child - 1) // 2
        if heap[parent] > heap[child]:
            break  # parent already dominates: heap order restored
        heap[parent], heap[child] = heap[child], heap[parent]
        child = parent

In [91]:
def bubbleMinUp(heap, n):
    '''
    Push @n onto the min-heap @heap (a list, modified in place).
    The new value starts as the last leaf and climbs while its parent is not
    strictly smaller than it.
    '''
    heap.append(n)
    child = len(heap) - 1
    while child > 0:
        parent = (child - 1) // 2
        if heap[parent] < heap[child]:
            break  # parent is already smaller: heap order restored
        heap[parent], heap[child] = heap[child], heap[parent]
        child = parent

In [95]:
def bubbleMaxDown(heap, n):
    '''
    Overwrite the root of the max-heap @heap with @n and sift it down, in place.
    At each level the value is compared with its larger child (the right child
    wins ties) and sinks unless it is strictly greater than that child.
    '''
    heap[0] = n
    size = len(heap)
    pos = 0
    while True:
        left = 2 * pos + 1
        if left >= size:
            return  # reached a leaf
        right = left + 1
        # Take the left child only when it is strictly larger, or when it has no sibling.
        if right < size and not heap[left] > heap[right]:
            biggest = right
        else:
            biggest = left
        if heap[pos] > heap[biggest]:
            return
        heap[pos], heap[biggest] = heap[biggest], heap[pos]
        pos = biggest
    

In [93]:
def bubbleMinDown(heap, n):
    '''
    Overwrite the root of the min-heap @heap with @n and sift it down, in place.
    At each level the value is compared with its smaller child (the right child
    wins ties) and sinks unless it is strictly smaller than that child.
    '''
    heap[0] = n
    size = len(heap)
    pos = 0
    while True:
        left = 2 * pos + 1
        if left >= size:
            return  # reached a leaf
        right = left + 1
        # Take the left child only when it is strictly smaller, or when it has no sibling.
        if right < size and not heap[left] < heap[right]:
            smallest = right
        else:
            smallest = left
        if heap[pos] < heap[smallest]:
            return
        heap[pos], heap[smallest] = heap[smallest], heap[pos]
        pos = smallest
    

In [139]:
# Each heap starts with an infinite sentinel (-inf for the max-heap, +inf for the
# min-heap), so neither heap is ever empty; medians collects the value returned
# by every insert.
maxHeap = [-float('inf')]
minHeap = [float('inf')]
medians = []

In [140]:
# Feed the stream through the heaps one value at a time, recording the median
# after every insert.  Iterating directly avoids list.pop(0), which shifts the
# whole remaining list on every call and made this loop quadratic.
nums = ints
#import random
#nums = list(range(100000))  # takes about 4-5 secs to median this
#random.shuffle(nums)
for num in nums:
    medians.append(insert(num, maxHeap, minHeap))

In [141]:
len(medians) 

100000

In [143]:
print(minHeap[:3], maxHeap[:3])

[50000, 50041, 50001] [49999, 49994, 49998]


In [116]:
medians[:20]

[6331,
 2793,
 2793,
 2793,
 2793,
 1640,
 2793,
 2303,
 2793,
 2303,
 2793,
 2793,
 4292,
 4292,
 4479,
 4479,
 5147,
 4479,
 4479,
 4479]

In [37]:
## original ints list:
ints[:120]

['6331\n',
 '2793\n',
 '1640\n',
 '9290\n',
 '225\n',
 '625\n',
 '6195\n',
 '2303\n',
 '5685\n',
 '1354\n',
 '4292\n',
 '7600\n',
 '6447\n',
 '4479\n',
 '9046\n',
 '7293\n',
 '5147\n',
 '1260\n',
 '1386\n',
 '6193\n',
 '4135\n',
 '3611\n',
 '8583\n',
 '1446\n',
 '3480\n',
 '2022\n',
 '961\n',
 '7123\n',
 '7262\n',
 '2261\n',
 '8380\n',
 '2123\n',
 '1286\n',
 '1274\n',
 '1369\n',
 '831\n',
 '927\n',
 '993\n',
 '4484\n',
 '4865\n',
 '8473\n',
 '8587\n',
 '4200\n',
 '1216\n',
 '2454\n',
 '3371\n',
 '6471\n',
 '6303\n',
 '6837\n',
 '3365\n',
 '1733\n',
 '1875\n',
 '6239\n',
 '1009\n',
 '9058\n',
 '2833\n',
 '3555\n',
 '2329\n',
 '5901\n',
 '8765\n',
 '5317\n',
 '3505\n',
 '1310\n',
 '2050\n',
 '9305\n',
 '6941\n',
 '1267\n',
 '1801\n',
 '4181\n',
 '4854\n',
 '5549\n',
 '7990\n',
 '1058\n',
 '7066\n',
 '3719\n',
 '7080\n',
 '5874\n',
 '5706\n',
 '5397\n',
 '2746\n',
 '2246\n',
 '3172\n',
 '3561\n',
 '9407\n',
 '9207\n',
 '3912\n',
 '365\n',
 '6607\n',
 '1381\n',
 '5283\n',
 '1477\n',
 '3932

In [117]:
minHeap[:20]

[5001,
 5002,
 5013,
 5003,
 5027,
 5016,
 5017,
 5040,
 5004,
 5032,
 5029,
 5019,
 5034,
 5018,
 5024,
 5049,
 5053,
 5012,
 5005,
 5050]

In [118]:
maxHeap[:20]

[5000,
 4999,
 4957,
 4998,
 4821,
 4872,
 4956,
 4990,
 4997,
 4791,
 4819,
 4843,
 4870,
 4909,
 4955,
 4962,
 4989,
 4993,
 4996,
 4740]

In [119]:
len(minHeap)

5001

In [120]:
len(maxHeap)

5001

### This is hopefully the answer to the assignment:

In [121]:
sum(medians) % 10000

1213

## Compare to a search tree implementation for speed

In [144]:
len(ints)

10000

In [211]:
## First int will be root, and all following ints will be added as leaves
def add(parent, num):
    '''
    Insert the node @num as a new leaf of the search tree rooted at @parent.

    args are TreeNode objects.
    @num is the node being added to the search tree, and has as its key (.val) an integer, for this assignment.
    Keys strictly greater than the current node's key descend right; all others
    (including equal keys) descend left.  Returns None; the tree is modified in place.

    The descent is a loop rather than recursion so that a sorted (degenerate)
    stream cannot hit Python's recursion limit.
    '''
    current = parent
    while True:
        if num.val > current.val:
            if current.right is None:
                current.right = num
                return
            current = current.right
        else:
            if current.left is None:
                current.left = num
                return
            current = current.left
    

In [210]:
## traverse the tree to find the specified rank

def countKids(node, ind, arr):
    '''
    Append every key of the subtree rooted at @node to @arr in ascending (in-order) order.

    @node is a TreeNode object with a value pointer and left and right child pointers
    @ind is unused; it is kept only so existing callers keep working.
    @arr is the list that receives the keys; it is extended in place.  Returns None.

    The walk uses an explicit stack instead of recursion, so a very deep
    (degenerate) tree cannot hit Python's recursion limit.
    '''
    pending = []      # ancestors whose own key and right subtree are still to be visited
    current = node
    while pending or current is not None:
        ## minimum key will be as far left as possible, so slide in that direction first
        while current is not None:
            pending.append(current)
            current = current.left
        ## no more left children: the most recently stacked node has the next smallest key
        current = pending.pop()
        arr.append(current.val)
        ## after that node, the next smallest keys are those in its right subtree
        current = current.right
        
def _inorder_keys(root):
    '''Lazily yield the keys of the tree rooted at @root in ascending order.'''
    pending = []      # ancestors whose own key and right subtree are still to be visited
    current = root
    while pending or current is not None:
        while current is not None:
            pending.append(current)
            current = current.left
        current = pending.pop()
        yield current.val
        current = current.right


def nth(root, n):
    '''
    Return the key of rank @n (0-based, ascending) in the search tree rooted at @root.

    @root is a TreeNode object with a value pointer and left and right child pointers
    For n >= 0 the in-order walk stops as soon as the requested rank is reached,
    so only the first n + 1 keys are visited and no ordered array is built.
    A negative @n counts back from the largest key, as list indexing would;
    that case still walks the whole tree.
    Raises IndexError when the tree holds too few keys for @n.
    '''
    keys = _inorder_keys(root)
    if n < 0:
        return list(keys)[n]
    for rank, key in enumerate(keys):
        if rank == n:
            return key
    raise IndexError('rank %r is out of range for this tree' % (n,))


In [173]:
class TreeNode:
    '''Binary search tree node: a key (.val) plus left and right child pointers.'''

    def __init__(self, n):
        self.val = n                    # key stored at this node
        self.left = self.right = None   # no children until the node is linked into a tree

In [208]:
# Rebuild the running medians with the search tree: the first value becomes the
# root, every later value is added as a leaf, and after each insert the median
# is the key of rank i//2 among the i + 1 values seen so far.
numbers = ints[:]
root = TreeNode(numbers[0])
medians = numbers[:1]

for i, value in enumerate(numbers[1:], start=1):
    add(root, TreeNode(value))
    medians.append(nth(root, i // 2))

In [206]:
nth(root, 5000)

5001

In [209]:
medians[:20]

[6331,
 2793,
 2793,
 2793,
 2793,
 1640,
 2793,
 2303,
 2793,
 2303,
 2793,
 2793,
 4292,
 4292,
 4479,
 4479,
 5147,
 4479,
 4479,
 4479]