# The algorithm as it stands

Take two catalogues as input:
- S3M is a simulated catalogue that contains a representative population of the solar system
- MPCOrb contains real data from actual detections

The goal is to combine the two to make a hybrid catalogue.

In order to do this we will:
1. Split catalogues into bins of absolute magnitude and for each bin
    1. Create a K-D Tree for the matching objects in the simulated catalogue (`tree`)
    2. Create a set of all objects that have already been assigned (`assigned`)
    3. For each object in the mpcorb catalogue with matching magnitudes:
        1. Set `k=100`
        2. Query the `tree` to find the nearest `k` objects in position within a distance `d_max`, save as `neighbours` and save length of `neighbours` as `count_nearby`
        3. Remove any objects that are already in `assigned`
        4. While `len(neighbours) < 1` and `count_nearby > 0` and `k < 500`
            1. `k += 100`
            2. Go back to step b
        5. If `len(neighbours) >= 1` then
            1. Find the object in `neighbours` with the closest velocity
            2. Save the pairing and add the object to `assigned`
        6. Otherwise
            1. Make a note that the real object has no match (so we will add it directly)
3. Output the list of pairings

In [111]:
import numpy as np

In [110]:
np.load("output/matched_24_25.npy")

array([183160., 162310., 140272., ..., 154940., 251413., 206844.])

In [112]:
from scipy.spatial import KDTree, cKDTree
import numpy as np
import pandas as pd

## Read in the data

In [113]:
s3m = pd.read_hdf("catalogues/s3m_propagated.h5", key="df")

In [114]:
mpcorb = pd.read_hdf("catalogues/mpcorb_propagated.h5", key="df")

In [126]:
min_mag = 24
max_mag = 25
mpcorb.H[np.logical_and(mpcorb.H >= min_mag, mpcorb.H <= max_mag)]

209214     24.20
367942     24.00
443103     24.30
469218     24.33
594981     25.00
           ...  
1132446    24.00
1132507    24.30
1132841    24.60
1133182    24.00
1133186    24.30
Name: H, Length: 3868, dtype: float64

In [123]:
mpcorb[np.logical_and(mpcorb.H >= 25, mpcorb.H <= 26)]

Unnamed: 0,id,x,y,z,vx,vy,vz,H,g,def
477718,477718.0,-2.928215,-0.174154,0.000822,-0.001597,-0.007541,0.000108,25.2,0.15,2010 SG15
478783,478783.0,0.786726,0.627914,-0.006685,-0.012451,0.011756,-0.000602,25.6,0.15,2012 UV136
594965,594965.0,-1.200930,0.302586,-0.145880,-0.006172,-0.013381,0.003101,26.0,0.15,1991 TT
594979,594979.0,-1.047440,0.172711,-0.110074,-0.004452,-0.015030,-0.007309,25.3,0.15,1992 DU
594981,594981.0,0.264205,-0.998782,0.220509,0.015886,0.004244,0.001827,25.0,0.15,1992 JD
...,...,...,...,...,...,...,...,...,...,...
1132194,1132194.0,0.860486,-1.507056,0.007435,0.007632,0.012363,0.000067,25.2,0.15,2017 WE29
1132400,1132400.0,0.675324,-0.657325,-0.048971,0.009537,0.017902,0.000155,25.4,0.15,2017 YU8
1132410,1132410.0,1.690850,0.235127,0.332680,-0.008927,0.011098,-0.001565,25.8,0.15,2017 YK14
1132469,1132469.0,0.918352,-1.720197,-0.064591,0.008580,0.005797,-0.000539,25.4,0.15,2018 BE11


In [4]:
mpcorb_xyz = np.array([mpcorb["x"].values, mpcorb["y"].values, mpcorb["z"].values]).T
s3m_xyz = np.array([s3m["x"].values, s3m["y"].values, s3m["z"].values]).T

## Split into absolute magnitude bins

In [36]:
s3m.vx.values

array([-0.00096639, -0.00460317, -0.00614024, ...,  0.0030317 ,
        0.00127532, -0.00246129])

In [76]:
v_sim = np.array([s3m.vx.values, s3m.vy.values, s3m.vz.values])

In [67]:
s3m.vx.values[:5]

array([-0.00096639, -0.00460317, -0.00614024,  0.00279847,  0.00261734])

In [69]:
v_sim

array([[-9.66389543e-04, -4.60317475e-03, -6.14023716e-03,
         2.79847253e-03,  2.61734222e-03],
       [ 1.35450038e-02,  1.19452146e-03,  2.98006585e-04,
        -2.74414253e-03, -3.00434514e-03],
       [-8.34065030e-04,  8.56008417e-04,  1.05181188e-03,
        -3.65415556e-04, -9.51059103e-05]])

In [75]:
v_real = np.array([mpcorb.vx.values, mpcorb.vy.values, mpcorb.vz.values])

In [82]:
np.sqrt(np.sum((v_sim[:, :50] - v_real[:, :50])**2, axis=0)).argmin()

40

In [87]:
np.where(mpcorb.id == 144)

(array([144]),)

In [33]:
%%timeit -n 1 -r 1

# loop every magnitude bin
magnitudes = np.arange(-2, 28)
for i in range(len(magnitudes) - 1):
    i = 14
    
    # get the matching simulated data and build a K-D Tree
    s3m_mag_mask = np.logical_and(s3m.H >= magnitudes[i], s3m.H < magnitudes[i + 1])
    s3m_id = s3m.id[s3m_mag_mask].values
    tree = cKDTree(s3m_xyz[s3m_mag_mask])
    
    # get the matching real data from MPCORB
    mpcorb_mag_mask = np.logical_and(mpcorb.H >= magnitudes[i], mpcorb.H <= magnitudes[i + 1])
    real_objects = mpcorb_xyz[mpcorb_mag_mask]
    
    print(len(s3m.H[s3m_mag_mask]), len(real_objects))
    
    # keep track of objects already assigned and a count of how many had no matches
    taken = []
    no_match_count = 0
    
    # iterate over every object in the real catalogue
    for obj in real_objects:
        
        # find the nearest 100 neighbours within 0.1 AU and mask further neighbours
        distances, inds = tree.query(obj, k=100, distance_upper_bound=0.1)
        distances, inds = distances[np.isfinite(distances)], inds[np.isfinite(distances)]
        
        ids = s3m_id[inds]
        
        # get only the options which haven't yet been assigned
        unassigned_options = np.setdiff1d(ids, taken, assume_unique=True)

        # if there are any matching objects
        if len(unassigned_options) > 0:
            
            # choose the one with the closest velocity
            match = unassigned_options[0]
            taken.append(match)
        else:
            no_match_count += 1
    
    print(len(taken), no_match_count)
            
    break

8404 5364
1910 3454
1.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [102]:
def merge_catalogues(sim, real, min_mag, max_mag, k=100, d_max=0.1):
    """Merge the simulated solar system catalogue with the real MPCORB data for a
    certain magnitude bin.
    
    Parameters
    ----------
    
    sim : `pandas DataFrame`
        Dataframe with the simulated catalogue (must contain x,y,z,vx,vy,vz and H)
    
    real : `pandas DataFrame`
        Dataframe with the real catalogue (must contain x,y,z,vx,vy,vz and H)
        
    min_mag : `float`
        Minimum magnitude to consider for this merge
        
    max_mag : `float`
        Maximum magnitude to consider for this merge
        
    k : `int`
        Maximum number of neighbours to find
        
    d_max : `float`
        Maxmimum distance within which to find neighbours
        
    Returns
    -------
    taken_ids : `float/array`
        An array of the ids that have been replaced by the real objects in this
        magnitude bin
        
    no_match_count : `int`
        A count of the number of systems that had no matches (and thus must be
        added directly to the simulated catalogue)
    """
    real_xyz = np.array([real["x"].values, real["y"].values, real["z"].values]).T
    sim_xyz = np.array([sim["x"].values, sim["y"].values, sim["z"].values]).T
    
    v_sim = np.array([sim.vx.values, sim.vy.values, sim.vz.values])
    v_real = np.array([real.vx.values, real.vy.values, real.vz.values])
    
    # get the matching simulated data and build a K-D Tree
    sim_mag_mask = np.logical_and(sim.H >= min_mag, sim.H < max_mag)
    sim_id = sim.id[sim_mag_mask].values
    tree = cKDTree(sim_xyz[sim_mag_mask])
    
    # get the matching real data from MPCORB
    real_mag_mask = np.logical_and(real.H >= min_mag, real.H <= max_mag)
    real_objects = real_xyz[real_mag_mask]
    
    # keep track of objects already assigned and a count of how many had no matches
    taken = []
    no_match_count = 0
    
    # iterate over every object in the real catalogue
    for obj in real_objects:
        
        # find the nearest k neighbours within d_max and mask further neighbours
        distances, inds = tree.query(obj, k=k, distance_upper_bound=d_max)
        distances, inds = distances[np.isfinite(distances)], inds[np.isfinite(distances)]
        
        # get only the options which haven't yet been assigned
        unassigned_inds = np.setdiff1d(inds, taken, assume_unique=True)

        
        # if there are many matching object
        if len(unassigned_inds) > 0:
            # find the closest velocity of the bunch and assign it
            best = np.sum((v_sim[:, unassigned_inds] - v_real[:, unassigned_inds])**2, axis=0).argmin()
            taken.append(unassigned_inds[best])
        
        # if only one then just immediately assign it
        elif len(unassigned_inds) == 1:
            taken.append(unassigned_inds[0])
        
        # otherwise then there was no match
        else:
            no_match_count += 1
            
    return np.array(sim_id[taken]), no_match_count

In [103]:
merge_catalogues(s3m, mpcorb, 12, 13)

False
True
False
True
True
True
True
False
False
False
True
True
True
True
True
True
True
False
False
False
False
True
True
True
True
False
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
False
True
True
False
True
False
True
True
False
True
True
True
True
False
True
True
True
True
True
False
True
True
True
True
True
True
False
True
True
True
False
True
True
True
True
False
True
True
False
True
True
False
True
False
True
False
True
True
True
True
True
False
True
False
True
True
True
False
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
False
True
False
True
True
True
True
True
True
True
False
True
True
False
False
True
False
True
False
False
True
False
True
False
True
True
True
True
True
True
True
True
True
False
True
True
True
True
False
False
True
True
True
True
True
True
False
False
False
False
True
True
True
True
True
True
True
False
False
True
True
True
True
True
True
True
Tr

True
True
True
True
False
False
True
False
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
False
False
False
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
False
True
True
True
True
True
True
False
True
True
False
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
False
False
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Tru

(array([287623., 314794., 337270., ..., 279835., 287040., 293720.]), 3459)

In [98]:
merge_catalogues(s3m, mpcorb, 12, 13)

(array([287623., 314794., 337270., ..., 279835., 287040., 293720.]), 3459)