In [20]:
# we are given a list of items from 1...n
# find the set of items whose relative frequency is > 1/k (for k = 2: the item that shows up more than half of the time)
# this is done approximately using the Misra-Gries algorithm

# return at most k-1 candidate items (the ones whose counters survive until the end)
# if k = 2, at most one candidate survives

# Misra Gries is just the Majority Algorithm for more than one candidate
# NOTE: this just gives us a SUPERSET of the final answer
# but we can compute the true frequencies in a second pass
def misra_gries(elements, k):
    """Return the Misra-Gries candidate set for the frequent items of `elements`.

    A single pass keeps at most k-1 counters. The surviving keys are a
    SUPERSET of the items occurring more than a 1/k fraction of the time,
    so a second pass is needed to get exact frequencies.

    Args:
        elements: iterable of hashable items; it is consumed once.
        k: frequency parameter; at most k-1 candidates are tracked at a time.

    Returns:
        A set with the (at most k-1) candidates that still hold a counter
        after the last element has been processed.
    """
    # counter of every current candidate (the VIPs); the keys of this dict
    # ARE the candidate set, so no separate set has to be kept in sync
    counts = dict()

    for element in elements:
        # item is a VIP
        if element in counts:
            counts[element] += 1

        # item is not a VIP but there is space left
        elif len(counts) < k - 1:
            counts[element] = 1

        else:
            # the item is not a VIP but appears, so every VIP loses one count;
            # iterate over a snapshot of the keys because entries get deleted
            for vip in list(counts):
                counts[vip] -= 1

                # doesn't deserve to be a VIP anymore: drop the entry so the
                # dict never holds more than k-1 counters
                if counts[vip] == 0:
                    del counts[vip]

    return set(counts)

def heavy_hitters(elements, k):
    """Return the items that make up more than a 1/k fraction of `elements`.

    Two passes: misra_gries() selects the candidates, then the candidates
    are counted exactly and filtered against the threshold len(elements) / k.

    Args:
        elements: collection of hashable items; len() is taken and it is
            iterated twice, so a one-shot iterator will not work.
        k: frequency parameter; an item qualifies when its count is strictly
            greater than len(elements) / k.

    Returns:
        A set with every item whose exact count exceeds len(elements) / k.
    """
    n = len(elements)
    candidates = misra_gries(elements, k)

    # second pass: exact counts, but only for the candidates
    counts = dict.fromkeys(candidates, 0)
    for element in elements:
        if element in counts:
            counts[element] += 1

    # The comparison is strict on purpose: the candidate set is only
    # guaranteed to contain items with count > n/k. An item with count
    # exactly n/k may or may not survive depending on input order
    # (k=2: [2, 3, 1, 1] keeps 1 as a candidate, [1, 1, 2, 3] does not),
    # so ">=" would make the answer order-dependent.
    # count * k > n is count > n / k without the float division.
    return {item for item, count in counts.items() if count * k > n}

In [21]:
elements = [1, 1, 1, 2, 2, 1, 3, 3, 1]

# k = 3 => the threshold is 9/3 = 3
# only 1 (count 5) clears it
print("Misra Gries: ", misra_gries(elements, k=3))
print("Heavy hitters: ", heavy_hitters(elements, k=3))

# k = 2 => the item must make up an absolute majority
# 1 has 5/9 frequency
print("Misra Gries: ", misra_gries(elements, k=2))
print("Heavy hitters: ", heavy_hitters(elements, k=2))


# k = 4 => the threshold is 9/4 = 2.25
# 2 and 3 each occur only twice, so they fall short of it; misra_gries
# still reports them because it only returns a superset of the items
# whose frequency exceeds 1/k
print("Misra Gries: ", misra_gries(elements, k=4))
print("Heavy hitters: ", heavy_hitters(elements, k=4))


Misra Gries:  {1}
Heavy hitters:  {1}
Misra Gries:  {1}
Heavy hitters:  {1}
Misra Gries:  {1, 2, 3}
Heavy hitters:  {1}
