In [1]:
import re
from pathlib import Path

In [2]:
# Example FASTQ paths, one per filename layout handled by the patterns
# defined in the next cell. Only the basename is matched; the directory
# part is ignored by the matching loop.
fastqs = [
    # <plate>_<library name>_lig-ss_<fragment> layout (nanopore directory)
    "igvf_003/nanopore/igvf003_8A_lig-ss_1.fastq.gz",
    # <plate>_<library name>_<read> layout
    "igvf_b01/next1/B01_13E_R1.fastq.gz",
    # Sublibrary_<index>_S<sample>_<lane>_<read>_<fragment> layout
    "igvf_003/nova1/Sublibrary_10_S9_L001_R1_001.fastq.gz",
    # <plate>_<library name>_S<sample>_<lane>_<read>_<fragment> layout
    "igvf_012/nextseq/012_13A_S1_L001_R1_001.fastq.gz",
]

In [3]:
# Reusable regex fragments. Each one captures a single filename field in a
# named group so that match.groupdict() returns the parsed metadata.
plate_re = r"(igvf)?(?P<plate_id>([\d]{3})|(B[\d]{2}))"  # optional "igvf" prefix, then 3 digits or B + 2 digits
library_id_name_re = r"(?P<library_id>[\d]+[A-Z]+)"  # digits followed by letters, e.g. "13A"
library_id_index_re = r"(?P<library_id>[\d]+)"  # digits only, e.g. "10"
sample_re = r"S(?P<sample_id>[\d]+)"  # the leading "S" is not captured
lane_re = r"(?P<lane_id>L[\d]+)"  # the leading "L" is captured
read_re = r"(?P<read_id>[RI][\d])"  # "R" or "I" plus one digit
fragment_re = r"(?P<fragment>[\d]+)"
compression_re = r"(?P<compression>gz|bz2|xz|zstd)"

# Full filename layouts. The dots around "fastq" are escaped so they only
# match a literal "."; unescaped, "." would accept any character and let
# names such as "B01_13E_R1xfastqxgz" through.
nanopore_re = re.compile(
    rf"{plate_re}_{library_id_name_re}_lig-ss_{fragment_re}\.fastq\.{compression_re}"
)
library_name_re = re.compile(
    rf"{plate_re}_{library_id_name_re}_{read_re}\.fastq\.{compression_re}"
)
library_index_re = re.compile(
    rf"Sublibrary_{library_id_index_re}_{sample_re}_{lane_re}_{read_re}_{fragment_re}\.fastq\.{compression_re}"
)
library_name_lane_re = re.compile(
    rf"{plate_re}_{library_id_name_re}_{sample_re}_{lane_re}_{read_re}_{fragment_re}\.fastq\.{compression_re}"
)

# Order matters to readers of the output: the position of a pattern in this
# list is the index printed next to each match.
patterns = [nanopore_re, library_name_re, library_index_re, library_name_lane_re]


In [7]:
# Try every known filename layout against each FASTQ's basename and print
# the pattern index, the parsed fields, and the basename for each hit.
for fastq in map(Path, fastqs):
    for i, pattern in enumerate(patterns):
        # fullmatch rather than match: match only anchors at the start, so
        # a name with trailing text after the compression suffix (for
        # example "x.fastq.gz.md5") would otherwise be reported as a FASTQ.
        match = pattern.fullmatch(fastq.name)
        if match is not None:
            print(i, match.groupdict(), fastq.name)
            

0 {'plate_id': '003', 'library_id': '8A', 'fragment': '1', 'compression': 'gz'} igvf003_8A_lig-ss_1.fastq.gz
1 {'plate_id': 'B01', 'library_id': '13E', 'read_id': 'R1', 'compression': 'gz'} B01_13E_R1.fastq.gz
2 {'library_id': '10', 'sample_id': '9', 'lane_id': 'L001', 'read_id': 'R1', 'fragment': '001', 'compression': 'gz'} Sublibrary_10_S9_L001_R1_001.fastq.gz
3 {'plate_id': '012', 'library_id': '13A', 'sample_id': '1', 'lane_id': 'L001', 'read_id': 'R1', 'fragment': '001', 'compression': 'gz'} 012_13A_S1_L001_R1_001.fastq.gz
