In [44]:
import pathlib as pl
import pandas as pd
import pickle as pck
import re as re
import collections as col
import os

# Two roots for the same project tree: `mount` is the prefix used to access
# files from this machine (e.g. for os.stat), `remote` is the prefix stored
# in the mapping and emitted in the grouping tables.
mount = pl.Path("/mounts/hilbert/project")
remote = pl.Path("/gpfs/project")

# Folder (relative to the roots above) that is searched for *.fofn listings.
sample_folder = pl.Path("projects/medbioinf/data/00_RESTRUCTURE/sample-centric")

# Base folder (relative to the roots above) that paths inside the *.fofn
# listings are resolved against.
data_root = pl.Path("projects/medbioinf/data/00_RESTRUCTURE")

# Pickle cache for the file/cell mapping; lives in ./.cache of the current
# working directory, which is created on demand.
cache_mapping = pl.Path(".").joinpath(".cache", "file_cell_map.pck")
cache_mapping.parent.mkdir(exist_ok=True, parents=True)

# Patterns used to pull a cell identifier out of a read file name.
# Raw strings are used so that "\-" reaches the regex engine unchanged
# without triggering Python's invalid-escape-sequence warning.
hifi_cell = re.compile(r"(m[0-9a-z_U]{16,24}|[ABCDEFSPL0-9_\-]{22,28})")
ont_cell = re.compile(r"(P|G)[A-Z0-9_\-]{8,16}")

# Date (20YYMMDD-like) and sample identifier substrings; these are removed
# from a file name before searching for the cell identifier.
contains_date = re.compile(r"20[0-9]{2}[0-9]{4}")
contains_sample = re.compile(r"(HG|NA|GM)[0-9]{5}")

# Combined pattern with one named group per read type. The `.pattern`
# attribute must be interpolated here: formatting the compiled object itself
# would embed its repr ("re.compile('...')") into the expression.
cells = re.compile(
    f"((?P<hifi>{hifi_cell.pattern})|(?P<ont>{ont_cell.pattern}))"
)

# Load the file/cell mapping from the pickle cache, or build it by scanning
# all *.fofn listings below the sample folder and cache the result.
#
# Layout of `mapping` (a defaultdict(set) with tuple keys):
#   ("size", file_name)                 -> file size in bytes (int)
#   ("remote", file_name)               -> path of the file below `remote`
#   (read_type, cell_id, fofn_name)     -> set of file names for that cell
#   (read_type, file_name, fofn_name)   -> set of cell ids for that file
# where read_type is "hifi" or "ont".
if cache_mapping.is_file():
    # NOTE: pickle must only be used on trusted data; this cache file is
    # written by this very script (see the dump at the end of the else branch).
    with open(cache_mapping, "rb") as dump:
        mapping = pck.load(dump)
else:
    mapping = col.defaultdict(set)
    for fofn in mount.joinpath(sample_folder).glob("**/*.fofn"):
        if "strandseq" in fofn.name:
            continue
        fofn_name = fofn.name
        with open(fofn, "r") as listing:
            for line in listing:
                # skip blank lines and comment lines
                if not line.strip() or line.startswith("#"):
                    continue

                file_rel_path = pl.Path(line.strip())
                file_name = file_rel_path.name
                file_path = mount.joinpath(data_root, file_rel_path)
                remote_path = remote.joinpath(data_root, file_rel_path)

                # The read type is inferred from the path and selects the
                # pattern used to extract the cell identifier.
                if "nanopore" in str(remote_path):
                    read_type, cell_pattern = "ont", ont_cell
                elif "pacbio_hifi" in str(remote_path):
                    read_type, cell_pattern = "hifi", hifi_cell
                else:
                    # A bare `raise` without an active exception would only
                    # produce an uninformative RuntimeError.
                    raise ValueError(
                        f"Cannot determine read type (ONT or HiFi) from path: "
                        f"{remote_path} (listed in {fofn})"
                    )
                file_size = os.stat(file_path).st_size

                # Remove date and sample identifier first so that they are
                # not picked up as (part of) the cell identifier.
                stripped_name = file_name
                for noise in (contains_date, contains_sample):
                    mobj = noise.search(stripped_name)
                    if mobj is not None:
                        stripped_name = stripped_name.replace(mobj.group(0), "")

                mobj = cell_pattern.search(stripped_name)
                if mobj is None:
                    raise ValueError(f"None: {file_name}")

                cell_id = mobj.group(0).strip("-_.")
                mapping[("size", file_name)] = file_size
                mapping[("remote", file_name)] = remote_path
                mapping[(read_type, cell_id, fofn_name)].add(file_name)
                mapping[(read_type, file_name, fofn_name)].add(cell_id)

    with open(cache_mapping, "wb") as dump:
        pck.dump(mapping, dump)

def to_gb(size_in_byte):
    """Convert a byte count to gigabytes (1 GB = 1e9 bytes).

    Returns the value rounded to one decimal place.
    """
    gigabytes = size_in_byte / 1e9
    return round(gigabytes, 1)


def determine_mean_file_size(data_files, read_type):
    """Return the mean file size over all files of one read type.

    Args:
        data_files: mapping of (sample, read_type) -> list of records whose
            first element is the file size.
        read_type: only entries whose key carries this read type are counted.

    Returns:
        Sum of all matching file sizes divided by the number of matching files.

    Raises:
        ValueError: if no file of the requested read type is present
            (instead of failing with an opaque ZeroDivisionError).
    """
    total_size = 0
    total_files = 0
    for (_, rtype), files in data_files.items():
        if rtype != read_type:
            continue
        total_size += sum(record[0] for record in files)
        total_files += len(files)
    if total_files == 0:
        raise ValueError(
            f"No files of read type '{read_type}' - cannot compute mean file size"
        )
    return total_size / total_files


def group_files(data_files, mean_size, read_type):
    """Partition the files of each sample into groups of similar total size.

    Samples with fewer than three files get one group per file (largest file
    first). For all other samples, files are visited in ascending order of
    their record tuple and accumulated into a group until the group size
    exceeds `mean_size`; files left over at the end form a final, smaller
    group.

    Args:
        data_files: mapping of (sample, read_type) -> list of
            (size, file_name, remote_path) records.
        mean_size: size threshold; a group is closed once its accumulated
            size is strictly greater than this value.
        read_type: only entries whose key carries this read type are grouped.

    Returns:
        pandas.DataFrame with one row per group and the columns
        "sample" (group name "<sample>-<read_type>-G<n>"), "cardinality"
        (number of files), "size_byte" (summed size) and "input"
        (comma-separated remote paths).

    Side effects:
        Prints `mean_size / 1e9` to stdout.
    """
    print(mean_size / 1e9)
    grouping = []
    for (smp, rtype), files in data_files.items():
        if rtype != read_type:
            continue

        if len(files) < 3:
            # Too few files to merge anything: one group per file.
            by_size_desc = sorted(files, reverse=True)
            for gnum, (size, _, remote_path) in enumerate(by_size_desc, start=1):
                grouping.append(
                    (f"{smp}-{read_type}-G{gnum}", 1, size, str(remote_path))
                )
            # Nothing else to add for this sample. In particular, do not
            # touch group state that belongs to a previously processed sample.
            continue

        gnum = 1
        gsize = 0
        gfiles = []
        for size, _, remote_path in sorted(files):
            gsize += size
            gfiles.append(str(remote_path))
            if gsize > mean_size:
                grouping.append(
                    (f"{smp}-{read_type}-G{gnum}", len(gfiles), gsize, ",".join(gfiles))
                )
                gnum += 1
                gsize = 0
                gfiles = []
        if gfiles:
            # Leftover files never exceeded the threshold (otherwise the group
            # would have been closed above), so they are emitted as a final,
            # smaller group rather than being rejected.
            grouping.append(
                (
                    f"{smp}-{read_type}-G{gnum}",
                    len(gfiles),
                    gsize,
                    ",".join(sorted(gfiles)),
                )
            )

    df = pd.DataFrame.from_records(
        grouping,
        columns=["sample", "cardinality", "size_byte", "input"]
    )
    return df
                    
                    
                    


# Collect, per (sample, read type), one (size, file name, remote path) record
# for every file that is registered under a cell identifier in `mapping`.
group_by_sample = col.defaultdict(list)
for k, v in mapping.items():
    if k[0] in ["size", "remote"]:
        continue
    read_type, cell_or_file, fofn_name = k
    # `mapping` holds both cell -> files and file -> cells entries under the
    # same key shape; only the former are wanted here. A file-keyed entry is
    # recognised by its recorded size, which works for any file extension
    # (testing for ".fastq.gz" would let other file types slip through).
    # The membership test does not trigger the defaultdict factory.
    if ("size", cell_or_file) in mapping:
        continue
    # the sample name is the leading "_"-separated token of the fofn name
    sample = fofn_name.split("_")[0]
    for file_name in v:
        file_size = mapping[("size", file_name)]
        remote_path = mapping[("remote", file_name)]
        group_by_sample[(sample, read_type)].append((file_size, file_name, remote_path))

mean_hifi_size = determine_mean_file_size(group_by_sample, "hifi")
print("HiFi ", mean_hifi_size / 1e9)
mean_ont_size = determine_mean_file_size(group_by_sample, "ont")
print("ONT ", mean_ont_size / 1e9)

# Groups are closed once they exceed half of the mean file size of the
# respective read type.
hifi_groups = group_files(group_by_sample, int(mean_hifi_size * 0.5), "hifi")
print(hifi_groups)
ont_groups = group_files(group_by_sample, int(mean_ont_size * 0.5), "ont")
print(ont_groups)

HiFi  22.71897417145091
ONT  23.479925036283827
11.359487085
              sample  cardinality    size_byte   
0    HG03248-hifi-G1            1  18391712531  \
1    HG03248-hifi-G2            1  19162977107   
2    HG03248-hifi-G3            1  20532393988   
3    HG03248-hifi-G4            1  20720474319   
4    HG03248-hifi-G5            1  23391545082   
..               ...          ...          ...   
252  HG01457-hifi-G2            1  19776302414   
253  HG01457-hifi-G3            1  22259037332   
254  HG01457-hifi-G4            1  24039217397   
255  HG01457-hifi-G5            1  28157039941   
256  HG01457-hifi-G6            1  28343468088   

                                                 input  
0    /gpfs/project/projects/medbioinf/data/00_RESTR...  
1    /gpfs/project/projects/medbioinf/data/00_RESTR...  
2    /gpfs/project/projects/medbioinf/data/00_RESTR...  
3    /gpfs/project/projects/medbioinf/data/00_RESTR...  
4    /gpfs/project/projects/medbioinf/data/00_RESTR..