In [29]:
import polars as pl
from pathlib import Path
import os

In [30]:
# Lookup keyed by EB identifier, valued by site sample ID.
# NOTE: despite the "site_to_..." name, the direction is EB identifier -> site;
# the name (including its spelling) is kept because the FASTQ-parsing cell
# indexes this dict by the EB identifier taken from each file name.
site_to_acccession_map = {
    "EB100001": "CCH_0001_WB_01",
    "EB100002": "CCH_0002_WB_01",
    "EB100003": "CCH_0003_WB_01",
    "EB100004": "CKD_0001_WB_01",
    "EB100005": "CKD_0002_WB_01",
    "EB100006": "CKD_0003_WB_01",
    "EB100007": "EXP_0001_WB_01",
    "EB100008": "EXP_0002_WB_01",
    "EB100009": "EXP_0003_WB_01",
    "EB100010": "MOM_0001_WB_01",
    "EB100011": "MOM_0002_WB_01",
    "EB100012": "MOM_0003_WB_01",
    "EB100013": "UIC_0004_WB_01",
    "EB100014": "UIC_0005_WB_01",
    "EB100015": "UIC_0006_WB_01",
}

print(site_to_acccession_map)

{'EB100001': 'CCH_0001_WB_01', 'EB100002': 'CCH_0002_WB_01', 'EB100003': 'CCH_0003_WB_01', 'EB100004': 'CKD_0001_WB_01', 'EB100005': 'CKD_0002_WB_01', 'EB100006': 'CKD_0003_WB_01', 'EB100007': 'EXP_0001_WB_01', 'EB100008': 'EXP_0002_WB_01', 'EB100009': 'EXP_0003_WB_01', 'EB100010': 'MOM_0001_WB_01', 'EB100011': 'MOM_0002_WB_01', 'EB100012': 'MOM_0003_WB_01', 'EB100013': 'UIC_0004_WB_01', 'EB100014': 'UIC_0005_WB_01', 'EB100015': 'UIC_0006_WB_01'}


In [33]:
# Build the gemBS sample sheet: one row per barcode, carrying the mapped
# dataset name and the paths of its R1/R2 FASTQ files, then write it to CSV.
FASTQ_DIR = Path("/zata/zippy/ramirezc/gembs_smk/.test3")
METADATA_CSV = Path("/zata/zippy/ramirezc/gembs_smk/results/gembs_metadata.csv")

fastq_files = [str(path) for path in sorted(FASTQ_DIR.glob("*.fastq.gz"))]
if not fastq_files:
    # Fail here, naming the directory, instead of handing polars an empty record list.
    raise FileNotFoundError(f"No *.fastq.gz files found in {FASTQ_DIR}")

records = []
for file in fastq_files:
    basename = os.path.basename(file)
    # The indexing below expects names shaped like <Project>_<Barcode>_<Read>.<ext>.
    parts = basename.split("_")
    if len(parts) < 3:
        raise ValueError(
            f"Unexpected FASTQ name {basename!r}: expected at least three "
            "'_'-separated fields (<Project>_<Barcode>_<Read>...)"
        )

    barcode = parts[1]
    if barcode not in site_to_acccession_map:
        raise KeyError(
            f"Barcode {barcode!r} (from {basename!r}) has no entry in site_to_acccession_map"
        )

    records.append(
        {
            "Project": parts[0],
            "Assay": "WGBS",
            "Barcode": barcode,
            "Dataset": site_to_acccession_map[barcode],
            # Strip the extension(s): keep only the text before the first "."
            "Read": parts[2].split(".")[0],
            "File": file,
        }
    )

# Pivot long -> wide so each distinct "Read" value becomes its own column of
# file paths. The rename requires both an "R1" and an "R2" column to exist.
wgbs_metadata = (
    pl.from_dicts(records)
    .pivot(
        index=["Barcode", "Project", "Assay", "Dataset"],
        on="Read",
        values="File",
        maintain_order=True,
    )
    .rename({"R1": "File1", "R2": "File2"})
    .drop("Project", "Assay")
)

# Make sure the output directory exists before writing the sample sheet.
METADATA_CSV.parent.mkdir(parents=True, exist_ok=True)
wgbs_metadata.write_csv(METADATA_CSV)
print(wgbs_metadata)

shape: (15, 4)
┌──────────┬────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ Barcode  ┆ Dataset        ┆ File1                           ┆ File2                           │
│ ---      ┆ ---            ┆ ---                             ┆ ---                             │
│ str      ┆ str            ┆ str                             ┆ str                             │
╞══════════╪════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ EB100001 ┆ CCH_0001_WB_01 ┆ /zata/zippy/ramirezc/gembs_smk… ┆ /zata/zippy/ramirezc/gembs_smk… │
│ EB100002 ┆ CCH_0002_WB_01 ┆ /zata/zippy/ramirezc/gembs_smk… ┆ /zata/zippy/ramirezc/gembs_smk… │
│ EB100003 ┆ CCH_0003_WB_01 ┆ /zata/zippy/ramirezc/gembs_smk… ┆ /zata/zippy/ramirezc/gembs_smk… │
│ EB100004 ┆ CKD_0001_WB_01 ┆ /zata/zippy/ramirezc/gembs_smk… ┆ /zata/zippy/ramirezc/gembs_smk… │
│ EB100005 ┆ CKD_0002_WB_01 ┆ /zata/zippy/ramirezc/gembs_smk… ┆ /zata/zippy/ramirezc/gembs_smk… │
│ …  