In [1]:
import collections as col
import re
import datetime as dt
import pandas as pd
import pathlib as pl

import itertools as itt

# Run switches: with DRYRUN set, the driver loop below only reports (if
# VERBOSE) what it would create instead of writing the files.
DRYRUN, VERBOSE = False, False

# Minute-resolution timestamp written into the header of each verification file.
ts = dt.datetime.now()
TIMESTAMP = f"{ts:%Y%m%dT%H%M}"

# Location of the restructured data share and the sub-trees that get scanned.
MOUNT = pl.Path("/mounts/hilbert/project")
TOP_LEVEL = pl.Path("projects/medbioinf/data/00_RESTRUCTURE")
project_folders = [
    "hgsvc",
    "unknown",
    "PRJNA586863",
    "PRJNA731524",
    "PRJNA813010",
]
data_folders = [
    "nanopore",
    "pacbio_hifi",
]

# Folder holding the external cell annotation tables (one TSV per source).
cell_metadata = pl.Path("/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/annotations/external")

# Lazy listings of the annotation tables; both names are rebound to the
# loaded DataFrames further down.
hifi_cells = cell_metadata.glob("*hifi*.tsv")
ont_cells = cell_metadata.glob("*ont*.tsv")

relabel_file = cell_metadata / "relabel_sources.tsv"

# Output tables are written next to (not inside) the annotation folder.
clean_out = cell_metadata.parent / "hgsvc_cells.tsv"
error_out = cell_metadata.parent / "errors.tsv"

# Collects one record per file/cell of every sample that failed verification.
PROBLEM_SAMPLES = list()

def load_cell_metadata(fpath):
    """Load one tab-separated cell annotation table and normalize it.

    The data source is derived from the file name ("jax" -> JAX,
    "uwash"/"UW" -> UW, "umigs" -> UMIGS). If the file name carries none of
    these tags, the table itself must provide a "source" column. The read
    type is derived from "hifi" / "ont" in the file name.

    Args:
        fpath: pathlib.Path of the annotation table; column names are
            matched case-insensitively and must include "sample" and "cell".

    Returns:
        DataFrame with the columns sample, cell, source, sin, read_type and
        annotation_source (the file name of ``fpath``).

    Raises:
        ValueError: if the read type cannot be derived from the file name,
            or if the source is neither encoded in the file name nor given
            as a column of the table.
    """
    fname = fpath.name

    if "jax" in fname:
        source = "JAX"
    elif "uwash" in fname or "UW" in fname:
        source = "UW"
    elif "umigs" in fname:
        source = "UMIGS"
    else:
        # no known tag in the file name: the table has to state the source
        source = None

    if "hifi" in fname:
        read_type = "HiFi"
    elif "ont" in fname:
        read_type = "ONT"
    else:
        raise ValueError(f"Unknown read type: {fpath.name}")

    df = pd.read_csv(fpath, header=0, sep="\t")
    df.columns = [c.lower() for c in df.columns]
    if source is None:
        if "source" not in df.columns:
            raise ValueError(
                f"No source in file name and no 'source' column: {fpath.name}"
            )
        # copy: the columns assigned below must not write into a view
        df = df[["sample", "cell", "source"]].copy()
    else:
        df = df[["sample", "cell"]].copy()
        df["source"] = source

    df["sample"] = df["sample"].str.strip()
    # keep only the leading run of identifier characters of the cell label
    df["cell"] = df["cell"].str.strip().str.extract(
        r"([A-Z\-_a-z0-9]+)", expand=False
    )
    # "sin": first run of digits of the sample name, prefixed with "SIN:"
    df["sin"] = "SIN:" + df["sample"].str.extract(r"([0-9]+)", expand=False)
    df["read_type"] = read_type
    df["annotation_source"] = fpath.name
    return df


def relabel_sources(labeling, cell_table):
    """Overwrite the "source" of selected cells according to a relabel table.

    Args:
        labeling: path of a tab-separated table with the columns sample,
            cell, read_type and source.
        cell_table: DataFrame with the columns sample, cell, read_type and
            source; it is modified in place.

    Returns:
        The same ``cell_table`` object, with "source" replaced for every row
        whose sample and read type equal a relabel entry and whose cell
        label contains that entry's cell string.
    """
    new_labels = pd.read_csv(labeling, header=0, sep="\t")
    new_labels["cell"] = new_labels["cell"].str.strip()

    for row in new_labels.itertuples(index=False):
        select_sample = cell_table["sample"] == row.sample
        # literal substring match: a cell label is an identifier, not a
        # regular expression; rows without a cell label never match
        select_cell = cell_table["cell"].str.contains(
            row.cell, regex=False, na=False
        )
        select_reads = cell_table["read_type"] == row.read_type
        selector = select_sample & select_cell & select_reads
        cell_table.loc[selector, "source"] = row.source
    return cell_table


# HiFi: load every annotation table, stack them, apply the source relabeling,
# flag all cells as not yet verified and order the table by sample and cell.
hifi_cells = relabel_sources(
    relabel_file,
    pd.concat(
        [load_cell_metadata(tsv_path) for tsv_path in hifi_cells],
        axis=0, ignore_index=False
    ),
)
hifi_cells["HHU_complete"] = "no"
hifi_cells = hifi_cells.sort_values(["sin", "cell"]).reset_index(drop=True)

# ONT: same procedure as for the HiFi tables.
ont_cells = relabel_sources(
    relabel_file,
    pd.concat(
        [load_cell_metadata(tsv_path) for tsv_path in ont_cells],
        axis=0, ignore_index=False
    ),
)
ont_cells["HHU_complete"] = "no"
ont_cells = ont_cells.sort_values(["sin", "cell"]).reset_index(drop=True)

def group_files_by_sample_and_source(fastq_files, all_known):
    """Bucket FASTQ files by the (sample, source) of the cell they belong to.

    A file belongs to an annotated cell if the cell label is a substring of
    the file name. Files matching no annotated cell are dropped silently.

    Args:
        fastq_files: iterable of path objects (only ``.name`` is inspected).
        all_known: DataFrame with the columns sample, cell, source and
            annotation_source.

    Returns:
        defaultdict mapping (sample, source) to the list of matching paths.

    Raises:
        ValueError: if a file name contains more than one annotated cell
            label; the conflicting annotations are printed beforehand.
    """
    grouped = col.defaultdict(list)
    for fastq in fastq_files:
        hits = [
            (known.sample, known.cell, known.source, fastq, known.annotation_source)
            for known in all_known.itertuples()
            if known.cell in fastq.name
        ]
        if not hits:
            continue
        if len(hits) > 1:
            for hit_sample, hit_cell, hit_source, hit_path, hit_table in hits:
                print(hit_sample, ' - ', hit_cell, ' - ', hit_source, ' - ', hit_path.name, ' - ', hit_table)
            raise ValueError("Multi-match")
        hit_sample, _, hit_source, hit_path, _ = hits[0]
        grouped[(hit_sample, hit_source)].append(hit_path)
    return grouped


def _unique_sample_name(known_subset):
    """Return the one sample name annotated in known_subset; assert it is unique."""
    sample_names = known_subset["sample"].unique()
    assert len(sample_names) == 1, sample_names
    return sample_names[0]


def _record_matched_cells(sample_name, matched_table_records, data_folder):
    """Append a 'match-found' record to PROBLEM_SAMPLES for each matched (cell, file) pair."""
    for cell, file_path in sorted(matched_table_records):
        PROBLEM_SAMPLES.append(
            (
                sample_name,
                cell,
                str(file_path.relative_to(data_folder)),
                "match-found",
                "no-error"
            )
        )


def find_matching(sample_files, known_subset, data_folder):
    """Check that the files of one sample and its annotated cells pair up 1:1.

    A file matches an annotated cell if the cell label is a substring of the
    file name. If files without annotation or annotated cells without file
    remain, the complete state of the sample is appended to the module-level
    PROBLEM_SAMPLES list and a short report is printed.

    Args:
        sample_files: list of path objects located below ``data_folder``.
        known_subset: DataFrame with the columns sample, cell and source,
            holding the annotated cells of a single sample.
        data_folder: path the recorded file paths are made relative to.

    Returns:
        ``(sample_name, number_of_matched_cells, source)`` if every file and
        every annotated cell was matched; a leading "GM" of the sample name
        is returned as "NA". ``(None, None, None)`` if the sample is
        incomplete or contains unidentified files.

    Raises:
        ValueError: if a cell matches more than one file, or if no cell
            matches any file.
        AssertionError: if the matched cells come from different sources or
            ``known_subset`` holds more than one sample name.
    """
    missing = []
    matched_sources = []
    matched_table_records = []
    for row in known_subset.itertuples(index=False):
        hits = [sf for sf in sample_files if row.cell in sf.name]
        if not hits:
            missing.append(row.cell)
        elif len(hits) == 1:
            matched_sources.append(row.source)
            matched_table_records.append((row.cell, hits[0]))
        else:
            pprint_mmatch = "\n".join([sf.name for sf in sample_files])
            raise ValueError(f"Multi-match: {hits} - {row.cell}", pprint_mmatch)

    matched = len(matched_table_records)
    if matched == 0:
        raise ValueError("No files matched ", sample_files, known_subset)

    assert len(set(matched_sources)) == 1, matched_sources
    source = matched_sources[0]

    if matched < len(sample_files):
        # more files on the share than annotated cells could explain
        sample_name = _unique_sample_name(known_subset)
        _record_matched_cells(sample_name, matched_table_records, data_folder)
        matched_files = {file_path for _, file_path in matched_table_records}
        for sf in sample_files:
            if sf in matched_files:
                continue
            PROBLEM_SAMPLES.append(
                (
                    sample_name,
                    "no-cell",
                    str(sf.relative_to(data_folder)),
                    "no-match-found",
                    "unknown-file-on-share"
                )
            )
        print("ERROR")
        print("Unidentified sample files ", sample_name)
        return None, None, None

    if missing:
        # annotated cells for which no file exists on the share
        sample_name = _unique_sample_name(known_subset)
        _record_matched_cells(sample_name, matched_table_records, data_folder)
        matched_cells = {cell for cell, _ in matched_table_records}
        same_source = known_subset.loc[known_subset["source"] == source, :]
        for row in same_source.itertuples(index=False):
            if row.cell in matched_cells:
                continue
            assert row.sample == sample_name
            PROBLEM_SAMPLES.append(
                (
                    sample_name,
                    row.cell,
                    row.sample,
                    "no-match-found",
                    "missing-file-on-share"
                )
            )
        print("ERROR")
        print("Missing files on share ", sample_name)
        print("Sample files: ", len(sample_files))
        print("Known subset: ", known_subset.shape[0])
        return None, None, None

    sample_name = _unique_sample_name(known_subset)
    if sample_name.startswith("GM"):
        sample_name = sample_name.replace("GM", "NA")

    return sample_name, matched, source


def load_fastq_files(folder_path):
    """Collect the gzipped FASTQ files below a folder, excluding "fail" files.

    Args:
        folder_path: pathlib.Path of the folder that is searched recursively.

    Returns:
        List of paths matching ``**/*.fastq.gz`` whose file name does not
        contain "fail" (the list may be empty if only such files exist).

    Raises:
        ValueError: if no ``*.fastq.gz`` file exists below ``folder_path``.
    """
    all_files = list(folder_path.glob("**/*.fastq.gz"))
    if not all_files:
        raise ValueError(f"no fastqs at {folder_path}")
    # only the file name is inspected, not the names of parent folders
    return [fp for fp in all_files if "fail" not in fp.name]


def write_verified_file(check_file, ts, subset, sample_name, fastq_paths, fofn_path):
    """Write the verification table and the file-of-file-names of one sample.

    Args:
        check_file: path of the verification file; receives a "# <ts>"
            header line followed by ``subset`` as a tab-separated table.
        ts: timestamp string for the header line.
        subset: DataFrame of the verified cells.
        sample_name: sample name that must occur in every FASTQ path, either
            as given or with "NA" replaced by "GM".
        fastq_paths: FASTQ paths located below MOUNT/TOP_LEVEL.
        fofn_path: path of the FOFN file; parent folders are created.

    Raises:
        AssertionError: if neither spelling of the sample name occurs in
            all of the relative FASTQ paths.
    """
    share_root = MOUNT.joinpath(TOP_LEVEL)
    # sort as path objects first, then convert to text
    relpaths_fastq = [
        str(rel_path)
        for rel_path in sorted(fq.relative_to(share_root) for fq in fastq_paths)
    ]

    gm_name = sample_name.replace("NA", "GM")
    direct = all(sample_name in rel_path for rel_path in relpaths_fastq)
    indirect = all(gm_name in rel_path for rel_path in relpaths_fastq)
    assert direct or indirect, check_file

    with open(check_file, "w") as dump:
        print(f"# {ts}", file=dump)
        subset.to_csv(dump, sep="\t", header=True, index=False)

    fofn_path.parent.mkdir(exist_ok=True, parents=True)
    with open(fofn_path, "w") as fofn:
        print("\n".join(relpaths_fastq), file=fofn)

        
def build_fofn_path(sample_name, data_type, project, source, ds_year):
    """Compose the FOFN path of a sample below MOUNT/TOP_LEVEL/sample-centric.

    The file is placed in a folder named after the sample; its name encodes
    sample, data type, project, source and dataset year.
    """
    fofn_name = f"{sample_name}_{data_type}_fastq.{project}-{source}-{ds_year}.fofn"
    return MOUNT / TOP_LEVEL / "sample-centric" / sample_name / fofn_name

# Short data type label (used in FOFN file names) per data folder name.
DATA_TYPES = dict(nanopore="ont", pacbio_hifi="hifi")

# Finds "20" followed by two digits; applied to sample folder listing lines.
year = re.compile("20[0-9]{2}")
# Dataset years accepted when a year is found in a listing line.
possible_years = [str(accepted) for accepted in range(2018, 2024)]


# Driver: for every project / data type folder, read the "sample-folder.lst"
# listings, verify each listed sample folder against the cell annotation and,
# for complete samples, write the verification file and the FOFN.
for project, data_folder in itt.product(project_folders, data_folders):
    sample_folder_listings = MOUNT.joinpath(
        TOP_LEVEL,
        "project-centric",
        project,
        data_folder
    )
    cell_lut = hifi_cells if data_folder == "pacbio_hifi" else ont_cells
    data_type = DATA_TYPES[data_folder]
    if not sample_folder_listings.is_dir():
        print("skipping, no dir ", sample_folder_listings)
        # actually skip the folder instead of relying on the glob being empty
        continue
    for sample_folder_lst in sample_folder_listings.glob("**/sample-folder.lst"):
        with open(sample_folder_lst, "r") as listing:
            for line in listing:
                if not line.strip():
                    continue
                # each non-empty line is a folder path relative to TOP_LEVEL
                sample_folder = MOUNT.joinpath(TOP_LEVEL, line.strip())
                mobj = year.search(line)
                if mobj is None:
                    print("no year ", line.strip())
                    ds_year = "20XX"
                else:
                    ds_year = mobj.group(0)
                    assert ds_year in possible_years

                unsorted_fastq = load_fastq_files(sample_folder)
                # group sample files by data source to prevent
                # missing file exceptions
                sample_file_groups = group_files_by_sample_and_source(
                    unsorted_fastq,
                    cell_lut
                )
                for (sample, source), fastq_files in sample_file_groups.items():
                    # NOTE(review): assumes a two-character sample prefix,
                    # whereas load_cell_metadata builds "sin" from the first
                    # run of digits - confirm both agree for all sample names
                    sample_num = "SIN:" + sample[2:]
                    select_sample = cell_lut["sin"] == sample_num
                    select_source = cell_lut["source"] == source
                    subset = cell_lut.loc[select_sample & select_source, :]
                    sample_name, matched_files, matched_source = find_matching(
                        fastq_files, subset, sample_folder_listings
                    )
                    if sample_name is None:
                        # problems were recorded by find_matching
                        continue
                    # not raising = dataset complete
                    check_file = sample_folder.joinpath(
                        f"{sample_name}.{matched_files}-cells.verified"
                    )
                    fofn_path = build_fofn_path(
                        sample_name, data_type,
                        project, matched_source, ds_year
                    )
                    if DRYRUN:
                        if VERBOSE:
                            print("Would create VERIFY: ", check_file)
                            print("Would create FOFN: ", fofn_path)
                    else:
                        if VERBOSE:
                            print("Creating: ", check_file)
                            print("Creating: ", fofn_path)
                        write_verified_file(
                            check_file,
                            TIMESTAMP,
                            subset,
                            sample_name,
                            fastq_files,
                            fofn_path
                        )
                        # flag the verified cells in the annotation table
                        cell_lut.loc[subset.index, "HHU_complete"] = "yes"


# Dump every record collected for problematic samples, ordered by sample and cell.
prs = pd.DataFrame.from_records(
    PROBLEM_SAMPLES,
    columns=["sample_name", "cell_id", "path_or_annotated_sample", "file_cell_matched", "error_source"]
).sort_values(["sample_name", "cell_id"])
prs.to_csv(error_out, sep="\t", header=True, index=False)

# The combined cell table (with its HHU_complete flags) is only written for real runs.
if not DRYRUN:
    merged = pd.concat(
        [hifi_cells, ont_cells], axis=0, ignore_index=False
    ).sort_values(["sin", "read_type", "source", "cell"])
    merged.to_csv(clean_out, sep="\t", header=True, index=False)

ERROR
Missing files on share  HG02769
Sample files:  22
Known subset:  23
ERROR
Missing files on share  HG00732
Sample files:  3
Known subset:  9
no year  project-centric/unknown/nanopore/HG00733
no year  project-centric/unknown/nanopore/NA24385_HG002
no year  project-centric/unknown/pacbio_hifi/HG00733
skipping, no dir  /mounts/hilbert/project/projects/medbioinf/data/00_RESTRUCTURE/project-centric/PRJNA586863/nanopore
no year  project-centric/PRJNA586863/pacbio_hifi
skipping, no dir  /mounts/hilbert/project/projects/medbioinf/data/00_RESTRUCTURE/project-centric/PRJNA731524/nanopore
no year  project-centric/PRJNA731524/pacbio_hifi
skipping, no dir  /mounts/hilbert/project/projects/medbioinf/data/00_RESTRUCTURE/project-centric/PRJNA813010/nanopore
no year  project-centric/PRJNA813010/pacbio_hifi
