In [1]:
from pathlib import Path
from trackml.dataset import load_event

In [2]:
# Directory holding the source events that the test sample is cut from.
indir = Path("/tigress/jdezoort/codalab/train_1")

In [3]:
# Gather every path in `indir` whose name starts with the id of event 21354.
truncated_files = [*Path(indir).glob("event000021354*")]

In [4]:
# Display the paths matched by the glob so the available event files can be checked by eye.
truncated_files

[PosixPath('/tigress/jdezoort/codalab/train_1/event000021354-truth.csv.gz'),
 PosixPath('/tigress/jdezoort/codalab/train_1/event000021354.csv'),
 PosixPath('/tigress/jdezoort/codalab/train_1/event000021354-hits.csv.gz'),
 PosixPath('/tigress/jdezoort/codalab/train_1/event000021354-cells.csv.gz'),
 PosixPath('/tigress/jdezoort/codalab/train_1/event000021354-particles.csv.gz')]

In [5]:
# Read event 21354 by its file-name prefix; the result is unpacked as
# hits, cells, particles and truth tables (in that order).
hits, cells, particles, truth = load_event(indir / "event000021354")

In [10]:
# Draw a random 5 % of the particle ids. The fixed random_state makes the draw
# repeatable, so re-running the notebook regenerates the same test event.
extracted_pids = (
    particles["particle_id"].sample(frac=0.05, random_state=42).to_list()
)

In [11]:
# Display the sampled particle ids.
extracted_pids

[378305117494976514,
 869200981554888704,
 4508616149172224,
 400828681892659200,
 851186239448023040,
 324274291455557632,
 229687360567115776,
 85604676803756032,
 535930417241391104,
 67556399592243200,
 680047254584688640,
 869199469743190018,
 495397333417074690,
 725080639518277632,
 135108263699021824,
 76564835797565440,
 157635470245756928,
 607998456639782912,
 108098348245843968,
 522447174869450752,
 639525990493585408,
 108087421849042944,
 608011169742979072,
 324273191943929856,
 707069608279937025,
 265716020147126272,
 626020826608566272,
 522427521099104256,
 108105426351947776,
 270217214592811008,
 725088267396976641,
 292737617911349248,
 256705384918548480,
 639520699093876736,
 630518653799890944,
 558451026735140865,
 680045124280909824,
 671039711732563968,
 815159985049698304,
 522427246254755841,
 810648894999363584,
 103588288987660288,
 157631621955059712,
 265714851932807170,
 684554290185895936,
 594479411420463104,
 522434186888347648,
 76561441055414681

In [12]:
# Fraction of the extracted hits that should be noise hits
# (roughly the same as initial dataset).
f_noise = 0.15

In [13]:
# Hits that belong to the sampled particles.
extracted_hit_ids = truth.loc[
    truth["particle_id"].isin(extracted_pids), "hit_id"
].to_list()

# Rows with particle_id == 0 are the pool the noise hits are drawn from.
noise_hit_ids = truth.loc[truth["particle_id"] == 0, "hit_id"]

# Choose n_noise so that noise makes up a fraction f_noise of the final sample:
#   n_noise / (n_signal + n_noise) = f_noise  =>  n_noise = f / (1 - f) * n_signal.
# Cap it at the pool size, because Series.sample raises ValueError when asked
# for more rows than exist (without replacement).
n_noise = min(
    int(f_noise / (1 - f_noise) * len(extracted_hit_ids)),
    len(noise_hit_ids),
)

# Fixed random_state so the same noise hits are picked on every run.
extracted_hit_ids += noise_hit_ids.sample(n=n_noise, random_state=42).to_list()

In [14]:
# Cut every table down to the selected hits / particles. `.copy()` detaches the
# subsets from the full-event frames.
s_hits = hits.loc[hits["hit_id"].isin(extracted_hit_ids)].copy()
s_cells = cells.loc[cells["hit_id"].isin(extracted_hit_ids)].copy()
# Particle id 0 is kept alongside the sampled ids.
# NOTE(review): presumably the noise id; confirm the particles table can contain it.
s_particles = particles.loc[
    particles["particle_id"].isin([*extracted_pids, 0])
].copy()
s_truth = truth.loc[truth["hit_id"].isin(extracted_hit_ids)].copy()

In [18]:
# Output location (relative to the working directory) and the file-name prefix
# shared by all files of the extracted event.
out_name = "test_event000000001"
out_dir = Path("test_data")
# Create the directory, including parents; do nothing if it already exists.
out_dir.mkdir(exist_ok=True, parents=True)

In [19]:
# Write each reduced table as a gzip-compressed CSV named
# "<out_name>-<part>.csv.gz", without the DataFrame index column.
for part, frame in (
    ("hits", s_hits),
    ("cells", s_cells),
    ("particles", s_particles),
    ("truth", s_truth),
):
    frame.to_csv(
        out_dir / f"{out_name}-{part}.csv.gz", index=False, compression="gzip"
    )

In [17]:
# Read the freshly written test event back to check that it loads.
# The prefix must be the one used when writing (out_dir / out_name); the old
# hard-coded "test_001" prefix matches none of the files written above.
# NOTE: this rebinds hits/cells/particles/truth to the reduced event, so the
# full source event must be reloaded before re-running the sampling cells.
hits, cells, particles, truth = load_event(out_dir / out_name)

# Round-trip sanity check: the reloaded tables have as many rows as were written.
assert len(hits) == len(s_hits), "reloaded hits differ from the written subset"
assert len(truth) == len(s_truth), "reloaded truth differs from the written subset"