In [5]:
%run "match_sample_files.ipynb"

import pandas as pd
import pathlib as pl
import collections as col

# Master switch for this cell: when True, the per-read-group output files
# (".verified" markers and ".fofn" lists) are not written to disk.
DRY_RUN = False

# Fallback (data_source, project) pairs for file groups that have no entry in
# the MERGED table; intended to be looked up via the "curated_by" value of the
# file group in ALL_FILES.
MANUALLY_CURATED_SOURCES = dict(
    UW_WH=("UW", "hgsvc"),
)

# Destination of the summary table and the accumulator for its rows; each row
# appended below is a (sample name, read type, remote fofn path) tuple.
fofn_files_out = PROJECT_BASE.joinpath("samples", "fofn_table.tsv")
fofn_files = list()

def _pick_source_date(source_dates):
    """Return the date label used in the fofn file name of one file group.

    The first recorded entry is used if it converts to an integer; otherwise
    the most frequent entry of ``source_dates`` is returned unchanged.
    """
    try:
        return str(int(source_dates[0]))
    except ValueError:
        most_common_date, _count = col.Counter(source_dates).most_common(1)[0]
        return most_common_date


def _lookup_source_and_project(read_group):
    """Return the ``(data_source, project)`` pair for a file group.

    The pair is read from MERGED; if MERGED has no row for ``read_group``,
    the group's "curated_by" value in ALL_FILES is used as the key into
    MANUALLY_CURATED_SOURCES.

    Raises:
        ValueError: the fallback finds zero or several distinct curators.
        KeyError: the curator has no entry in MANUALLY_CURATED_SOURCES.
    """
    group_info = MERGED.loc[
        MERGED["file_group"] == read_group, ["data_source", "project"]
    ]
    if group_info.empty:
        curators = set(
            ALL_FILES.loc[ALL_FILES["file_group"] == read_group, "curated_by"].values
        )
        # A set is unhashable and cannot be used as a dict key; the lookup
        # needs the single curator label itself.
        if len(curators) != 1:
            raise ValueError(
                f"Expected exactly one curator for file group {read_group!r}, "
                f"found: {curators}"
            )
        data_source, project = MANUALLY_CURATED_SOURCES[curators.pop()]
    else:
        # If a group carries several distinct values, an arbitrary one is taken.
        data_source = set(group_info["data_source"].values).pop()
        project = set(group_info["project"].values).pop()

    if data_source == project:
        # Identical labels are expected to look like "HGSVC<suffix>"; split
        # them into the generic project name and the upper-cased suffix.
        assert data_source.startswith("HGSVC") and project.startswith("HGSVC")
        project = "hgsvc"
        data_source = data_source[5:].upper()
    return data_source, project


def _write_read_group_outputs(sample_obj, read_type, read_group, read_files):
    """Write the ".verified" marker and the ".fofn" list for one file group.

    Nothing is written when DRY_RUN is set. Returns the local path of the
    fofn file (whether or not it was actually written).
    """
    source_date = _pick_source_date(sample_obj.source_dates[read_group])
    data_source, project = _lookup_source_and_project(read_group)

    source_path = sample_obj.get_file_group_lca_path(read_files)
    verified_file_path = PATH_PREFIX.local.joinpath(DATA_ROOT, source_path)
    verified_file_name = f"{sample_obj.name}.{len(read_files)}-cells.verified"
    verified_read_files = sorted(fp.name for fp in read_files)
    if not DRY_RUN:
        assert verified_file_path.is_dir()
        with open(verified_file_path.joinpath(verified_file_name), "w") as dump:
            dump.write(f"# {TODAY}\n")
            dump.write("\n".join(verified_read_files))

    fofn_name = f"{sample_obj.name}_{read_type}_fastq.{project}-{data_source}-{source_date}.fofn"
    fofn_path = LOCAL_SAMPLE_ROOT.joinpath(sample_obj.name, fofn_name)
    if not DRY_RUN:
        with open(fofn_path, "w") as dump:
            for rf in sorted(read_files):
                assert str(rf).startswith("project-centric")
                dump.write(str(rf) + "\n")
    return fofn_path


for sample_name, sample_obj in SAMPLES.items():
    # Skip the entry whose key is the sample's alternative name; its files are
    # merged in below via FILES_EXIST_PER_SAMPLE[sample_obj.alt].
    if sample_name == sample_obj.alt:
        continue

    sample_files_main = FILES_EXIST_PER_SAMPLE[sample_name]
    sample_files_alt = FILES_EXIST_PER_SAMPLE[sample_obj.alt]

    # Merge the files found under both names, main name first, without
    # duplicates.
    sample_files = []
    for candidate in list(sample_files_main) + list(sample_files_alt):
        if candidate not in sample_files:
            sample_files.append(candidate)

    # Any duplicate (within or across the two lists) is treated as an error.
    assert len(sample_files) == len(sample_files_main) + len(sample_files_alt), (
        f"Duplicate file records for sample {sample_name}"
    )

    for fn, sample_file in enumerate(sample_files, start=0):
        key = sample_file.read_type, sample_file.file_group
        sample_obj.sample_files[key].append(sample_file.data_rel_path)
        group_dates = sample_obj.source_dates[sample_file.file_group]
        # NOTE(review): group_date is recorded only for the very first file of
        # the sample, so only that file's group starts with its group_date;
        # confirm this is intended rather than once per file group.
        if fn == 0:
            group_dates.append(sample_file.group_date)
        group_dates.append(sample_file.file_date)

    hifi_complete = sample_obj.hifi_complete
    ont_complete = sample_obj.ont_complete
    # Only samples complete for both read types are listed in the fofn table.
    both_complete = hifi_complete and ont_complete

    # All HiFi groups are processed before all ONT groups.
    for wanted_type, is_complete in (("hifi", hifi_complete), ("ont", ont_complete)):
        if not is_complete:
            continue
        for (read_type, read_group), read_files in sample_obj.sample_files.items():
            if read_type != wanted_type:
                continue
            fofn_path = _write_read_group_outputs(
                sample_obj, read_type, read_group, read_files
            )
            if both_complete:
                remote_fofn_path = str(fofn_path).replace(
                    str(PATH_PREFIX.local), str(PATH_PREFIX.remote)
                )
                fofn_files.append((sample_obj.name, read_type, remote_fofn_path))
            
# Turn the collected (sample, read_type, fofn_path) records into a table.
# Note that this rebinds `fofn_files` from a list of tuples to a DataFrame.
fofn_files = pd.DataFrame.from_records(fofn_files, columns=["sample", "read_type", "fofn_path"])

# Honor DRY_RUN like every other write in this cell: a dry run must not
# replace the table on disk with paths to fofn files that were never written.
if not DRY_RUN:
    with open(fofn_files_out, "w") as dump:
        # The first line is a "#"-prefixed date stamp, followed by the TSV
        # header and rows.
        _ = dump.write(f"# {TODAY}\n")
        fofn_files.to_csv(dump, sep="\t", header=True, index=False)



Skipping over 19320 - not enough data

Skipping over 19320 - not enough data

Skipping over 19320 - not enough data

Skipping over 19320 - not enough data
