<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Problem-statement" data-toc-modified-id="Problem-statement-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Problem statement</a></span></li></ul></div>

In [1]:
import numpy as np

In [4]:
n = 100_000
x = np.random.rand(n)

In [5]:
%%time
np.argsort(x)

CPU times: user 8.22 ms, sys: 1.28 ms, total: 9.5 ms
Wall time: 7.99 ms


array([44812, 11689,  8420, ..., 16399, 78194, 19951])

In [15]:
import heapq
from heapq import heappush, heappop

def heapsort(iterable):
    h = []
    for value in iterable:
        heappush(h, value)
    return [heappop(h) for i in range(len(h))]

In [16]:
h = []

In [24]:
x_aux = heapsort(x)

### Problem statement

- We are given a bunch of liststs and for each of those we want to get the smallest values.

- We want to keep track of the `limit` smallest values acrosss all the lists that we visit.

- Solution 1:
  - gather all lists (concatenate) sort and get the `limit` smallest values.

- Solution 2:
  - Keep only `limit` elements in memory and update those



In [23]:
limit = 20
results = np.zeros(limit)
results

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [60]:
x = [1,4,7,11]
y = [2,3,9,13]
aux = heapq.merge(x,y)
top_k = [x for i,x in enumerate(aux) if i<=1]
top_k

[1, 2]

In [61]:
x = np.array([1,4,7,11])
y = np.array([2,3,9,13])
aux = heapq.merge(x,y)
top_k = [x for i,x in enumerate(aux) if i<=1]
top_k

[1, 2]

In [138]:
n_cells = 100
limit = 20
n_elements = 1000
partial_results = [np.sort(np.random.rand(n_elements)) for cell in range(n_cells)]


In [129]:
def sort_with_heapq(partial_results, limit=20):
    sorted_results = partial_results[0]
    for x in partial_results[1:]:
        sorted_results = heapq.merge(sorted_results, x)

    top_k_heap = [x for i,x in enumerate(sorted_results) if i<limit]
    return top_k_heap

In [130]:
%%timeit
top_k_heap = sort_with_heapq(partial_results)

28.3 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [131]:
def naive_sort(partial_results, limit=20):
    all_results = np.hstack(partial_results)
    return np.partition(all_results, limit)[0:limit]

In [132]:
%%timeit
top_k_naive = naive_sort(partial_results)

81.9 µs ± 7.85 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [141]:
top_limit = np.partition(partial_results[0],limit)[:limit]
top_limit

array([0.00076741, 0.00216987, 0.00239313, 0.0031574 , 0.00407168,
       0.0053202 , 0.00733216, 0.00836903, 0.00889813, 0.00920386,
       0.01241766, 0.01301133, 0.01463368, 0.01534017, 0.01725357,
       0.0173481 , 0.01976626, 0.0209453 , 0.02227847, 0.02581008])

In [161]:
def priority_queue_sorted(partial_results, limit):
    for partial_res in partial_results[1:]:
        for result in partial_res:
            # top_limit[-1]
            if result < top_limit[-1]:
                # find position j to put it into the top_limit
                for j in range(limit):
                    if result < top_k[j]:
                        break
                #top_limit[j] has to be result
                #top_limit[-1] se va a la calle
                #top_limit[j:] se mueve a la derecha
                top_limit[j+1:] = top_limit[j:-1]
                top_limit[j] = result


In [164]:
%%timeit
priority_queue_sorted(partial_results, limit)

21.3 ms ± 1.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [159]:
top_limit.sort()
top_limit

array([1.63230407e-05, 3.11500705e-05, 3.93326165e-05, 6.39855499e-05,
       9.01607294e-05, 9.38105685e-05, 9.95556330e-05, 1.05528635e-04,
       1.12312109e-04, 3.19490233e-04, 4.12880862e-04, 5.86107843e-04,
       7.12944248e-04, 7.16127697e-04, 7.62111263e-04, 8.40066867e-04,
       9.89657490e-04, 1.43738500e-03, 1.59005847e-03, 2.39598039e-03])

In [156]:
top_k_naive = naive_sort(partial_results)

In [157]:
top_k_naive

array([1.63230407e-05, 6.39855499e-05, 6.84044883e-05, 9.38105685e-05,
       5.16463262e-05, 9.69562463e-05, 9.68966784e-05, 9.01607294e-05,
       9.95290097e-05, 3.93326165e-05, 4.19230420e-05, 3.11500705e-05,
       9.95556330e-05, 1.14400970e-04, 1.05528635e-04, 1.31254954e-04,
       1.12312109e-04, 1.32316435e-04, 1.33277191e-04, 1.36219991e-04])