# Analyze missing files in preprocessing of data

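**Objective**: Some input files in the CodaLab data are missing, which causes data preprocessing to fail. This notebook finds the events with incomplete input files and checks that the processed point clouds and graphs are complete and can be loaded.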

In [1]:
from pathlib import Path
import re
from collections import defaultdict
import torch
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

In [2]:
# Root directory holding the raw input and all derived data sets.
data_dir = Path("/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/")
# Raw input files, split into part_<n> sub-directories.
input_data_dir = data_dir.joinpath("codalab-data")
# Point clouds derived from the input data (version 1).
pc_data_dir = data_dir.joinpath("point_clouds_v1")
# Graphs derived from the input data (version 1).
graph_data_dir = data_dir.joinpath("graphs_v1")

In [3]:
def get_evtid(path: Path) -> int:
    """Return the event ID encoded in the file name of ``path``.

    The ID is the first run of decimal digits in ``path.name``, converted to
    an integer (so leading zeros are dropped), e.g.
    ``event000023161-truth.csv.gz`` -> ``23161``.

    Raises:
        IndexError: If the file name contains no digits at all.
    """
    digit_runs = re.findall(r"\d+", path.name)
    return int(digit_runs[0])

In [4]:
def get_evtids(path: Path) -> set:
    """Return the set of distinct event IDs found directly inside ``path``.

    Every entry returned by ``path.iterdir()`` is passed to ``get_evtid``;
    several files belonging to the same event collapse to one ID.
    """
    evtids = set()
    for entry in path.iterdir():
        evtids.add(get_evtid(entry))
    return evtids

In [5]:
def find_incomplete_evt_data(path: Path, n_files=4):
    """Print every event in ``path`` that does not have exactly ``n_files`` files.

    The entries of ``path`` are grouped by the event ID extracted from their
    names. For each event whose file count differs from ``n_files`` (too few
    or too many), the event ID and the names of its files are printed.

    Args:
        path: Directory whose direct entries are inspected.
        n_files: Expected number of files per event.

    Returns:
        None; the result is reported on stdout only.
    """
    files_by_evtid = {}
    for entry in path.iterdir():
        files_by_evtid.setdefault(get_evtid(entry), []).append(entry)

    for evtid, entries in files_by_evtid.items():
        if len(entries) == n_files:
            continue
        names = [entry.name for entry in entries]
        print(evtid, names)


## Checks

### Check for completeness of input data

In [33]:
# For each of the parts 1-9, list the events of the raw input data whose
# number of files differs from the default expected by
# find_incomplete_evt_data.
for part in range(1, 10):
    print(f"Part {part}")
    part_dir = input_data_dir.joinpath(f"part_{part}")
    find_incomplete_evt_data(part_dir)

Part 1
Part 2
Part 3
23161 ['event000023161-truth.csv.gz', 'event000023161-particles.csv.gz']
23157 ['event000023157-cells.csv.gz']
Part 4
Part 5
Part 6
Part 7
Part 8
Part 9


### Check that we processed all sectors for every event

In [37]:
# For each of the parts 1-9, list the events whose point-cloud output does
# not consist of exactly 32 files (presumably one file per sector, going by
# the section heading).
for part in range(1, 10):
    print(f"Part {part}")
    part_dir = pc_data_dir.joinpath(f"part_{part}")
    find_incomplete_evt_data(part_dir, n_files=32)

Part 1
Part 2
Part 3
Part 4
Part 5
Part 6
Part 7
Part 8
Part 9


In [40]:
# Same completeness check as for the point clouds, applied to the graph
# output: every event is expected to have exactly 32 files.
for part in range(1, 10):
    print(f"Part {part}")
    part_dir = graph_data_dir.joinpath(f"part_{part}")
    find_incomplete_evt_data(part_dir, n_files=32)

Part 1
Part 2
Part 3
Part 4
Part 5
Part 6
Part 7
Part 8
Part 9


### Check that no events were skipped in preprocessing

In [42]:
# Compare, per part, how many distinct events exist in the raw input, the
# point clouds and the graphs. Only the counts are compared; when they are
# not all equal, the triple (input, point clouds, graphs) is printed.
for part in range(1, 10):
    print(f"Part {part}")
    part_name = f"part_{part}"
    inpt_evtids = get_evtids(input_data_dir / part_name)
    pc_evtids = get_evtids(pc_data_dir / part_name)
    graph_evtids = get_evtids(graph_data_dir / part_name)
    counts = (len(inpt_evtids), len(pc_evtids), len(graph_evtids))
    # More than one distinct value means at least one stage lost events.
    if len(set(counts)) > 1:
        print(counts)

Part 1
Part 2
(950, 942, 942)
Part 3
(895, 736, 736)
Part 4
Part 5
Part 6
Part 7
Part 8
Part 9


### Check that no data files were broken

In [6]:
# Gather every file of the point-cloud and graph data sets for parts 1-9.
all_paths = []
for part in range(1, 10):
    part_name = f"part_{part}"
    for d in (pc_data_dir, graph_data_dir):
        all_paths += list((d / part_name).iterdir())
print(f"Total number of files: {len(all_paths)}")

def worker(path):
    """Try to load one saved file with ``torch.load`` and report a failure.

    The loaded object is discarded; only whether loading succeeds matters.

    Args:
        path: Location of the file to load.

    Returns:
        ``None`` if the file loaded without error, otherwise ``path`` (which
        is also printed).
    """
    try:
        torch.load(path)
    except Exception:
        # Deliberately broad: any load error marks the file as broken.
        # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit still
        # propagate, so a long scan can be interrupted.
        print(path)
        return path
    return None

# Load all files in parallel; worker returns None for files that load and
# the path for files that do not, so only the failing paths are kept.
broken = [
    bad_path
    for bad_path in process_map(worker, all_paths, max_workers=18, chunksize=100)
    if bad_path is not None
]

Total number of files: 548992


  0%|          | 0/548992 [00:00<?, ?it/s]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,