# In [20]:
import io
import json as json
import pathlib as pl
import shlex

# Prefix of the project tree on the remote side, and the location where the
# same tree is mounted locally. Local paths are rewritten from the latter to
# the former when rsync source paths are generated.
remote_root = "/gpfs/project"
local_mount = pl.Path("/mounts/hilbert/project")

# Folders below the mount point: the per-sample "*.output.json" files and
# the assembly summary statistics.
json_output_folder = pl.Path("projects/medbioinf/projects/assemblies/hybrids/verkko/wd/proc/assemblies/verkko")
stats_output_folder = pl.Path("projects/medbioinf/projects/assemblies/hybrids/verkko/wd/results/statistics/assemblies")

# Every processed file must belong to one of these samples.
pilot_samples = [
    "HG00733",
    "NA21487",
    "HG00171",
    "HG02666",
    "HG02953",
]

# Keys of the output JSON for which no copy command is generated.
skip_outfiles = ["sample", "wg_fasta", "wg_layout"]

# Root folder below which the per-sample target folders are created.
out_root_folder = pl.Path(
    "/gpfs/project/projects/medbioinf/data/00_RESTRUCTURE"
).joinpath("shares/globus/outgoing/hgsvc/ebi_upload/20230711_verkko_pilot")


def load_stats_files(stats_path):
    """Collect the summary statistics files and their upload destinations.

    Globs ``*summary.tsv`` in ``stats_path`` and skips every file whose name
    contains ``ps-none`` or ``verkko-asm-wg``. The sample name is taken from
    the part of the file name before the first dot.

    Args:
        stats_path: ``pathlib.Path`` of the folder holding the summary files.

    Returns:
        A list of ``(source_path, target_path)`` tuples, sorted by local file
        path. ``source_path`` is a string in which the leading ``local_mount``
        has been replaced by ``remote_root``; ``target_path`` is
        ``out_root_folder / <sample> / statistics / <file name>``.

    Raises:
        AssertionError: if a file belongs to a sample that is not listed in
            ``pilot_samples`` (the sample name is the exception message).
    """
    excluded_tags = ("ps-none", "verkko-asm-wg")
    mount_prefix = str(local_mount)

    stats_files = []
    # glob() gives no ordering guarantee; sort so that the generated script
    # is identical between runs.
    for stats_file in sorted(stats_path.glob("*summary.tsv")):
        if any(tag in stats_file.name for tag in excluded_tags):
            continue
        sample = stats_file.name.split(".")[0]
        # Explicit raise instead of ``assert``: assert statements are removed
        # when Python runs with -O, which would let unknown samples through.
        # The exception type is kept for backward compatibility.
        if sample not in pilot_samples:
            raise AssertionError(sample)
        target_path = out_root_folder.joinpath(
            sample, "statistics", stats_file.name
        )
        # Only rewrite the first occurrence (the mount prefix), not a later
        # path component that happens to contain the same text.
        source_path = str(stats_file).replace(mount_prefix, remote_root, 1)
        stats_files.append((source_path, target_path))
    return stats_files

# Target layout built for each sample:
#   <sample> / sequences
#   <sample> / statistics

def _rsync_command(source, target):
    """Return one rsync line copying ``source`` to ``target``.

    Both paths are shell-quoted so that whitespace or shell metacharacters
    in a path cannot break the generated script; paths made up of safe
    characters only are emitted unchanged.
    """
    return (
        "rsync --progress --checksum "
        f"{shlex.quote(str(source))} {shlex.quote(str(target))}\n"
    )


# Location of the generated shell script; its folder is created if missing.
data_mover_script = pl.Path(
    "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/scripts/datamng",
    "data_mover_pilot.sh"
)
data_mover_script.parent.mkdir(exist_ok=True, parents=True)

# The script is assembled in memory and written to disk in one go at the end.
data_mover = io.StringIO()
data_mover.write("#!/usr/bin/bash\n\n")

stats_files = load_stats_files(
    local_mount.joinpath(stats_output_folder)
)

# glob() gives no ordering guarantee; sort for a reproducible script.
json_files = sorted(
    local_mount.joinpath(json_output_folder).glob("*ps-sseq.output.json")
)
for json_file in json_files:
    # Keep the file open only for as long as it takes to parse it.
    with open(json_file, "r") as dump:
        outfiles = json.load(dump)

    sample = outfiles["sample"]
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if sample not in pilot_samples:
        raise AssertionError(
            f"unexpected sample {sample!r} in {json_file}"
        )

    target_root = out_root_folder.joinpath(sample)
    for subfolder in ("sequences", "statistics"):
        target_folder = shlex.quote(str(target_root.joinpath(subfolder)))
        data_mover.write(f"mkdir -p {target_folder}\n")

    for outkey, outpaths in outfiles.items():
        if outkey in skip_outfiles:
            continue
        # Keys ending in "_fasta" lose that trailing part; all other keys
        # are used verbatim in the target file name.
        suffix = outkey
        if suffix.endswith("_fasta"):
            suffix = suffix.rsplit("_", 1)[0]
        source_path = pl.Path(outpaths["abs_path"])
        # NOTE(review): ``.suffix`` is only the final extension (".gz" for
        # "x.fasta.gz") - confirm that this is the intended target name.
        file_ext = source_path.suffix
        target_name = f"{sample}.ps-sseq.verkko.{suffix}{file_ext}"
        target_path = target_root.joinpath(
            "sequences", target_name
        )
        data_mover.write(_rsync_command(source_path, target_path))

for src, trg in stats_files:
    data_mover.write(_rsync_command(src, trg))

with open(data_mover_script, "w") as dump:
    dump.write(data_mover.getvalue())