## Realistic events are big
- n hits produce n² segments
- TrackML has about 1e5 hits per event
- 1e10 segments do not fit in memory

In [None]:
from segment.candidate import gen_seg_layered, gen_seg_all
from datasets import bman, trackml
import pandas as pd

In [None]:
# Load one TrackML event via get_one_event_by_volume() and display its length.
# NOTE(review): presumably this restricts the event to a single detector
# volume -- confirm in datasets.trackml.
event = trackml.get_one_event_by_volume()
len(event)

In [None]:
# Time gen_seg_layered on the current event, then print the number of
# generated segments (with '_' as thousands separator) and the pandas
# summary of the result viewed as a DataFrame.
%time seg = gen_seg_layered(event)
print(f'{len(seg):_}')
pd.DataFrame(seg).info()

In [None]:
# Same measurement as for gen_seg_layered, but with gen_seg_all on the
# same event: wall time, segment count and DataFrame summary.
%time seg = gen_seg_all(event)
print(f'{len(seg):_}')
pd.DataFrame(seg).info()

In [None]:
# Replace the event with the one returned by get_one_event() and display
# its length.
event = trackml.get_one_event()
len(event)

In [None]:
# Time gen_seg_layered on only the first 40000 rows of the event, then
# print the segment count and the DataFrame summary.
%time seg = gen_seg_layered(event.iloc[:40000])
print(f'{len(seg):_}')
pd.DataFrame(seg).info()
# Drop the reference to the segment result before moving on.
del seg

## try dask?

In [None]:
import logging

import dask
from dask.distributed import Client
from dask import dataframe as dd, array as da
import numpy as np
import pandas as pd
# Start a dask.distributed client with processes=False: one worker with two
# threads, a '1GB' memory limit, and silence_logs set to logging.ERROR.
client = Client(n_workers=1, threads_per_worker=2, processes=False,
                memory_limit='1GB', silence_logs=logging.ERROR)
# Bare expression: the notebook displays the client's summary.
client

In [None]:
from datasets import bman, trackml
# Load the event returned by get_one_event() for the dask experiment and
# display its length.
event = trackml.get_one_event()
len(event)

In [None]:
# Split the row range [0, len(event)) into consecutive (start, stop) windows
# of at most 6000 rows: first collect the boundaries, then pair neighbours.
chunks = list(range(0, len(event), 6000)) + [len(event)]
chunks = [(start, stop) for start, stop in zip(chunks, chunks[1:])]

In [None]:
# Extract the 'x', 'y', 'z' columns as a NumPy array with one row per event
# row and three columns.
pos = event[['x', 'y', 'z']].to_numpy()

In [None]:
from numba import njit, prange

@njit(parallel=True)
def dist_chunk(a, b, pos):
    """Return the flattened Euclidean distance matrix between two row ranges.

    Parameters
    ----------
    a, b : pair of int
        Half-open ``(start, stop)`` row ranges into ``pos``.
    pos : 2-D array
        Coordinates; columns 0, 1 and 2 are read as x, y and z.

    Returns
    -------
    1-D ``np.single`` array of length ``(a[1] - a[0]) * (b[1] - b[0])``.
        Row-major flattening: element ``i * (b[1] - b[0]) + j`` is the
        distance between rows ``a[0] + i`` and ``b[0] + j`` of ``pos``.
    """
    x, y, z = pos[:, 0], pos[:, 1], pos[:, 2]
    # Named n_a / n_b so they do not shadow the module alias `da`
    # (dask.array) used elsewhere in the notebook.
    n_a, n_b = a[1] - a[0], b[1] - b[0]
    result = np.empty((n_a, n_b), np.single)
    # Numba parallelises only the outermost prange; the inner loop is a
    # plain sequential range.
    for i in prange(n_a):
        ii = a[0] + i
        for j in range(n_b):
            jj = b[0] + j
            result[i, j] = np.sqrt((x[ii] - x[jj])**2 +
                                   (y[ii] - y[jj])**2 +
                                   (z[ii] - z[jj])**2)
    return result.ravel()
# Benchmark dist_chunk on a single pair of chunks (the first and the second).
%timeit dist_chunk(chunks[0], chunks[1], pos)

In [None]:
# Wrap pos once so that every task below receives the same delayed object.
delayed_pos = dask.delayed(pos)
# Build one lazy 1-D block per ordered pair of chunks (a, b) and concatenate
# them. The declared shape is the number of (row in a, row in b) pairs, which
# matches the length of dist_chunk's flattened result.
# Because both loops run over all chunks, a == b and both orders (a, b) and
# (b, a) are included: each row is also paired with itself, and every pair of
# distinct rows appears twice.
dist = da.concatenate([
    da.from_delayed(
        dask.delayed(dist_chunk)(a, b, delayed_pos),
        shape=((a[1] - a[0]) * (b[1] - b[0]),),
        dtype=np.single
    ) for a in chunks for b in chunks
])
# Bare expression: display the lazy array's summary (nothing is computed yet).
dist

In [None]:
# Evaluate the whole lazy distance array once to get its maximum, and time it.
%time dist.max().compute()

In [None]:
# Lazy histogram of all distances: 64 equal-width bins over [0, 6500].
# NOTE(review): as with np.histogram, values outside the range should not be
# counted -- confirm 6500 is above the maximum distance computed earlier.
hist, bins = da.histogram(dist, bins=64, range=(0, 6500))
hist

In [None]:
# Materialise the lazy histogram counts.
h = hist.compute()

In [None]:
from matplotlib import pyplot as plt
# Draw the precomputed histogram: each bin's left edge is passed as a single
# sample weighted by that bin's count, so plt.hist reproduces the counts in h.
plt.hist(bins[:-1], bins, weights=h)
# Trailing statement so the notebook does not echo plt.hist's return value.
pass

In [None]:
from matplotlib import pyplot as plt
# Cumulative version of the precomputed histogram: left bin edges weighted by
# the counts in h, accumulated by matplotlib via cumulative=True.
plt.hist(bins[:-1], bins, weights=h, cumulative=True)
# Trailing statement so the notebook does not echo plt.hist's return value.
pass

In [None]:
from matplotlib import pyplot as plt
# Cumulative histogram again, this time accumulating the counts with
# np.cumsum and plotting them as ordinary (non-cumulative) bin weights.
plt.hist(bins[:-1], bins, weights=np.cumsum(h))
# Trailing statement so the notebook does not echo plt.hist's return value.
pass

Now we can estimate how many segments will remain after a possible length cut.