# Minimal example of "big arrays in loop" problem

I found that when a loop created big numpy arrays, there was a weird long pause after the last iteration of the loop had completed and before the line after the loop was executed.
Let's try to replicate the problem and track down its cause.

Later note: I was never able to replicate the problem.

In [None]:
import numpy as np
from time import time, sleep
import sortednp
from tqdm import tqdm, trange

from unconstrained import prune_arrivals, sample_points

In [None]:
# def prune_arrivals( times, locations ):
#     N = len(times)
#     indices = [0]
#     for i in range(1,N):
#         covered = False
#         x = locations[i]
#         t = times[i]
#         for j in indices:
#             vec = locations[j] - x
#             radius = t - times[j]
#             d2 = np.dot(vec,vec)
#             if d2 < radius*radius:
#                 covered=True
#                 break
#         if not covered:
#             indices.append(i)
#     return indices

In [None]:
# N = 1000000
# n_loops = 11

# start_time = time()
# for i in range(n_loops):
#     times = np.random.random(size=N)
#     seeds = np.random.random(size=(N,2))
#     arrived = prune_arrivals(times, seeds)
#     print(f'Step {i} completed at time {time()-start_time:.3f}')
# print(f'Loop exited at time {time()-start_time:.3f}')

In [None]:
# Experiment: generate and prune several large batches in a loop, printing the
# elapsed time after every step and once more after the loop, to see whether a
# pause appears between the last step and the first line after the loop.
# NOTE: get_arrival_times is not imported above; it is defined in a later cell
# of this file, so that cell has to be run before this one.
batch_rho = 1000000
batches = 5

start_time = time()

# One array of surviving arrival times / locations per batch.
times_processes = []
seeds_processes = []
rho = batch_rho * batches  # Combined rate of all batches; not used again in this cell.
for i in range(batches):
    times = get_arrival_times(batch_rho)
    seeds = sample_points(len(times))
    # `arrived` is used below to index both arrays; presumably the indices of
    # the arrivals that survive pruning (cf. the commented-out prune_arrivals).
    arrived = prune_arrivals(times, seeds) # pruning may be slow if rho is very large.
    times_processes.append(times[arrived])
    seeds_processes.append(seeds[arrived])
    print(f'Step {i} completed at time {time()-start_time:.3f}')
# Short fixed delay: without any extra pause, the time printed below should be
# roughly 0.1 s after the last 'Step ... completed' time.
sleep(0.1)
print(f'Loop exited at time {time()-start_time:.3f}')

Hm. I don't seem to be able to replicate the bug using any of my guesses at the cause. Here's the offending bit of code verbatim:
```
batch_rho = 1.0e6
n_batches = 1000
rho = batch_rho * n_batches
max_time = 1.5*( (2*np.log(rho) + 4*np.log(np.log(rho))) / (np.pi*rho) )**(1/3)
display(f'Running until max time {max_time:.5f}.')
times = get_arrival_times(batch_rho,max_time=max_time)
seeds = sample_points(len(times))
arrived = prune_arrivals(times, seeds)
times = times[arrived]
seeds = seeds[arrived]
progress = trange(n_batches-1)
for i in progress:
    progress.set_description("Finding new arrivals")
    new_times = get_arrival_times(batch_rho,max_time=max_time)
    new_seeds = sample_points(len(new_times))
    progress.set_description("Pruning new arrivals")
    arrived = prune_arrivals(new_times, new_seeds)
    new_times = new_times[arrived]
    new_seeds = new_seeds[arrived]
    progress.set_description("Merging all arrivals")
    times, seeds = merge_jm_arrivals(times,seeds,new_times,new_seeds) # Is it faster to just stick all the arrays together and sort them at the end? (Since we don't prune in the middle any more.)
    progress.set_description("Merged. Weird pause.")
    # Comment
display("Arrivals all generated, now for the last pruning...") # There's a weird pause after the loop but before this message is printed. Not sure why.
arrived = prune_arrivals(times,seeds)
times = times[arrived]
seeds = seeds[arrived]
print(f'We have a total of {len(seeds)} arrivals with rate {rho} (that\'s {rho:.0e}).')
```

In [None]:
def merge_jm_arrivals(t1,l1,t2,l2):
    """
    Given two sets of arrival times and locations from a time-homogeneous PPP,
    merges them into a single pair sorted by arrival time.

    The arguments are all numpy arrays, and both t1 and t2 should be sorted
    in increasing order. l1 and l2 hold one 2-d location per arrival, i.e.
    they have shapes (len(t1), 2) and (len(t2), 2).

    Returns a tuple (times, locations): a float array of length
    len(t1)+len(t2) with the merged, sorted arrival times, and an array of
    shape (len(t1)+len(t2), 2) with the locations in the same order. When an
    entry of t1 equals an entry of t2, the one from t2 is placed first.

    The merge is vectorised: the final position of every element is computed
    with np.searchsorted instead of stepping through both arrays in a Python
    loop, which matters when the arrays hold millions of arrivals.
    """
    t1 = np.asarray(t1)
    t2 = np.asarray(t2)
    n1 = len(t1)
    n2 = len(t2)
    outtimes = np.empty(n1 + n2)
    outseeds = np.empty((n1 + n2, 2))

    # t1[i] is preceded by the i earlier entries of t1 and by every entry of
    # t2 that is <= t1[i] (ties go to t2, hence side='right').
    dest1 = np.arange(n1) + np.searchsorted(t2, t1, side='right')
    # t2[j] is preceded by the j earlier entries of t2 and by every entry of
    # t1 that is strictly smaller (side='left'), consistently with the above.
    dest2 = np.arange(n2) + np.searchsorted(t1, t2, side='left')

    outtimes[dest1] = t1
    outtimes[dest2] = t2
    # Skip empty inputs so that an empty location array of any shape is accepted.
    if n1:
        outseeds[dest1] = l1
    if n2:
        outseeds[dest2] = l2
    return outtimes, outseeds

def get_arrival_times( rho, max_time=1.0, R=0 ):
    """
    Samples the arrival times, up to time max_time, of a homogeneous Poisson
    process with rate rho*(1+2*R)**2.

    Inter-arrival times are drawn as independent exponentials with mean
    1/rate and accumulated. They are generated in chunks: the first chunk is
    sized two standard deviations above the expected number of arrivals, and
    further chunks of the same size are appended for as long as the last
    generated arrival still falls at or before max_time.

    Returns a sorted 1-d numpy array (a fresh copy) of all arrival times that
    are <= max_time; it is empty if no arrival occurs in time.
    """
    rate = rho*(1+2*R)**2
    expected = max_time*rate
    # Two standard deviations above the mean, but never zero: a zero-sized
    # chunk could never reach max_time and the loop below would not terminate.
    chunk = max(1, int(expected + 2*np.sqrt(expected)))
    arrival_times = np.cumsum(np.random.exponential(scale=1/rate,size=chunk))
    # If we are unlucky and every point generated so far arrived by max_time,
    # the process is not complete yet, so keep generating. The test looks at
    # the current last arrival (not at the initial chunk size), so it stays
    # correct however many chunks have been appended.
    while arrival_times[-1] <= max_time:
        extra = np.random.exponential(scale=1/rate,size=chunk)
        arrival_times = np.append(arrival_times, arrival_times[-1] + np.cumsum(extra))
    too_late = np.searchsorted(arrival_times,max_time,side='right') # First index where the arrival time is greater than max_time
    return arrival_times[:too_late].copy()

In [None]:
# Re-run of the offending code with only a few batches, with the elapsed time
# printed right after the loop so that any pause at that point shows up in the
# output.
batch_rho = 1.0e6
n_batches = 5

start_time = time()
rho = batch_rho * n_batches  # Combined rate of all batches.
# Time horizon, computed from the combined rate; it is passed to
# get_arrival_times, which only returns arrivals up to this time.
max_time = 1.5*( (2*np.log(rho) + 4*np.log(np.log(rho))) / (np.pi*rho) )**(1/3)
print(f'Running until max time {max_time:.5f}.')
# First batch: generate, prune, and keep only the surviving arrivals.
times = get_arrival_times(batch_rho,max_time=max_time)
seeds = sample_points(len(times))
arrived = prune_arrivals(times, seeds)
times = times[arrived]
seeds = seeds[arrived]
# Remaining n_batches-1 batches: each is generated and pruned on its own, then
# merged into the running (times, seeds) pair.
progress = trange(n_batches-1)
for i in progress:
    # The progress-bar description is updated before each stage, so the bar
    # shows which stage the loop is currently in.
    progress.set_description("Finding new arrivals")
    new_times = get_arrival_times(batch_rho,max_time=max_time)
    new_seeds = sample_points(len(new_times))
    progress.set_description("Pruning new arrivals")
    arrived = prune_arrivals(new_times, new_seeds)
    new_times = new_times[arrived]
    new_seeds = new_seeds[arrived]
    progress.set_description("Merging all arrivals")
    times, seeds = merge_jm_arrivals(times,seeds,new_times,new_seeds) # Is it faster to just stick all the arrays together and sort them at the end? (Since we don't prune in the middle any more.)
    progress.set_description("Merged. Weird pause.")
    # Comment (placeholder kept as the last line of the loop body)
print(f'Arrivals all generated at time {time()-start_time:.3f}, now for the last pruning...') # There's a weird pause after the loop but before this message is printed. Not sure why.
# Final pruning pass over the merged arrivals of all batches.
arrived = prune_arrivals(times,seeds)
times = times[arrived]
seeds = seeds[arrived]
print(f'We have a total of {len(seeds)} arrivals with rate {rho} (that\'s {rho:.0e}).')

In [None]:
# Control experiment: a tqdm loop that only sleeps and does trivial arithmetic
# (no big arrays), to check whether tqdm by itself causes a pause between the
# end of the loop and the first line after it.
progress = trange(3)
a = 0
for i in progress:
    progress.set_description("Start of step")
    sleep(10)  # Stand-in for a slow loop body.
    a += i
    progress.set_description("  End of step")
# If tqdm were the culprit, there would be a noticeable delay before this line prints.
print("Thing after")
sleep(1)
a *= 2
print("Final thing")

If `tqdm` is the problem then the above should also have a long pause, but it seems not to.