In [3]:
%run "../00_project_config.ipynb"

import pathlib as pl
import collections as col
import pandas as pd
import os

# Project file system as it is mounted on this machine; the cell cannot
# proceed without it, hence the check right after.
local_mount = pl.Path("/mounts/hilbert/project")
assert local_mount.is_dir()

# Prefix that takes the place of `local_mount` in paths written to the
# sample sheet (see `replace_path`).
remote_prefix = pl.Path("/gpfs/project")

# Folder below `local_mount` that is searched for assembly FASTA files.
top_path = pl.Path(
    "projects/medbioinf/projects/assemblies/hybrids/verkko/wd/results/assemblies/verkko"
)

# Destination of the generated sample sheet (TSV).
sample_sheet_out = PROJECT_BASE / "samples" / "vrk_sseq_eval.tsv"


def replace_path(local_path, local_root=None, remote_root=None):
    """Translate a path below the local mount into its remote equivalent.

    Only a genuine leading path prefix is rewritten: the local root has to
    match whole path components at the start of ``local_path``. Occurrences
    of the same text further down the path, or partial component matches
    such as ``/mnt/ab`` against a root of ``/mnt/a``, are left alone.

    Args:
        local_path: Path (``str`` or ``pathlib.Path``) to translate.
        local_root: Prefix to strip; defaults to the module-level
            ``local_mount``.
        remote_root: Prefix to put in its place; defaults to the
            module-level ``remote_prefix``.

    Returns:
        The translated path as ``str``. If ``local_path`` is not located
        below ``local_root``, ``str(local_path)`` is returned unchanged.
    """
    local_root = pl.Path(local_mount if local_root is None else local_root)
    remote_root = pl.Path(remote_prefix if remote_root is None else remote_root)

    try:
        relative = pl.Path(local_path).relative_to(local_root)
    except ValueError:
        # Not below the local root: nothing to translate.
        return str(local_path)
    return str(remote_root.joinpath(relative))

def is_empty(file_path):
    """Return True if the file at ``file_path`` has a size of zero bytes."""
    size_in_bytes = os.stat(file_path).st_size
    return size_in_bytes == 0


# Per sample: one entry per assembly unit (remote FASTA path or "n/a")
# plus the sample sex; `file_count` tracks how many files were seen.
files_by_sample = col.defaultdict(dict)
file_count = col.Counter()
assembly_root = local_mount.joinpath(top_path)
for fasta_file in assembly_root.glob("**/*.fasta.gz"):
    if "exemplar" in fasta_file.name:
        continue
    # A sibling "<name>.fasta.gz.EMPTY" file means the path is recorded as "n/a".
    set_file_na = fasta_file.with_suffix(".gz.EMPTY").is_file()

    # The parent folder name is the sample; its part before the first dot
    # is looked up in the HGSVC sample table.
    sample = fasta_file.parent.name
    plain = sample.split(".")[0]
    try:
        is_sample_row = HGSVC_SAMPLES["sample"] == plain
        sex = HGSVC_SAMPLES.loc[is_sample_row, "sex"].values[0]
    except IndexError:
        # No row for this sample in the table.
        raise ValueError(sample)
    sample = sample.replace("ps-", "vrk-ps-")

    # Third dot-separated field of the file name, with "-" turned into "_".
    asm_unit = fasta_file.name.split(".")[2].replace("-", "_")
    remote_full_path = "n/a" if set_file_na else replace_path(fasta_file)

    file_count[sample] += 1
    files_by_sample[sample].update({asm_unit: remote_full_path, "sex": sex})
    
# Build one single-row table per sample and stack them into the sample sheet.
sample_sheet = []
# most_common() iterates from the highest to the lowest file count, so the
# first sample with fewer than 5 files ends the loop for all remaining ones.
for sample, num_files in file_count.most_common():
    if num_files < 5:
        break
    row = pd.DataFrame.from_records(
        files_by_sample[sample], index=[sample])
    sample_sheet.append(row)

sample_sheet = pd.concat(sample_sheet, axis=0, ignore_index=False)
# sort_index() returns a new, sorted frame; the result has to be rebound,
# otherwise the rows stay in file-count order.
sample_sheet = sample_sheet.sort_index()
    
# Write the sample sheet as TSV; the first line is a "#"-prefixed comment
# holding the value of TODAY.
with open(sample_sheet_out, mode="w") as dump:
    dump.write(f"# {TODAY}\n")
    # Header row and index column are written by default.
    sample_sheet.to_csv(dump, sep="\t", index_label="sample")