## Imports

In [1]:
import re
from pathlib import Path

import pandas as pd

## Set parameters to find samples

In [None]:
# Output paths for the generated sample tables (tab-separated files)
SBS_SAMPLES_DF_FP = Path("config/sbs_samples.tsv")
PHENOTYPE_SAMPLES_DF_FP = Path("config/phenotype_samples.tsv")

# Directories that hold the SBS/phenotype sample nd2 files
SBS_DATA_FP = Path("data/input/sbs/")
PHENOTYPE_DATA_FP = Path("data/input/phenotype/")


# Regex patterns used to pull sample metadata out of nd2 file names.
#
# Example nd2 file names:
#   SBS:       P001_SBS_10x_C9_Wells-A1_Points-100__Channel_Cy7,Cy5,AF594,Cy3_SBS,DAPI_SBS.nd2
#   Phenotype: P001_Pheno_20x_Wells-A1_Points-001__Channel_AF750,Cy3,GFP,DAPI.nd2
#
# Capture groups, in order:
#   SBS_SAMPLE_PATTERN:       cycle (digits after "C"), well, tile
#   PHENOTYPE_SAMPLE_PATTERN: well, tile
# A well is one uppercase letter followed by digits. The tile must be exactly
# three digits immediately followed by "__Channel"; names with any other digit
# count do not match.
SBS_SAMPLE_PATTERN = r"C(\d+)_Wells-([A-Z]\d+)_Points-(\d{3})__Channel"
PHENOTYPE_SAMPLE_PATTERN = r"Wells-([A-Z]\d+)_Points-(\d{3})__Channel"

## Find SBS samples

In [3]:
# Build one record per SBS file whose name matches SBS_SAMPLE_PATTERN.
# Entries in SBS_DATA_FP whose names do not match are skipped.
sbs_data = []

for file in SBS_DATA_FP.iterdir():
    match = re.search(SBS_SAMPLE_PATTERN, file.name)
    if match is None:
        continue

    # Pattern groups, in order: cycle, well, tile
    sbs_data.append(
        {
            "sample_fp": str(file),
            "well": match.group(2),
            "tile": int(match.group(3)),
            "cycle": int(match.group(1)),
        }
    )

# An empty record list would produce a column-less DataFrame and an opaque
# KeyError in sort_values below; fail early with an actionable message instead.
if not sbs_data:
    raise ValueError(
        f"No SBS files in {SBS_DATA_FP} matched pattern {SBS_SAMPLE_PATTERN!r}"
    )

# Create a DataFrame and sort by well, tile, and cycle
sbs_samples = pd.DataFrame(sbs_data)
sbs_samples = sbs_samples.sort_values(by=["well", "tile", "cycle"])
sbs_samples = sbs_samples.reset_index(drop=True)

# Make sure the output directory exists before writing the table
SBS_SAMPLES_DF_FP.parent.mkdir(parents=True, exist_ok=True)
sbs_samples.to_csv(SBS_SAMPLES_DF_FP, sep="\t", index=False)
sbs_samples

Unnamed: 0,sample_fp,well,tile,cycle
0,data/input/sbs/P001_SBS_10x_C1_Wells-A1_Points...,A1,1,1
1,data/input/sbs/P001_SBS_10x_C2_Wells-A1_Points...,A1,1,2
2,data/input/sbs/P001_SBS_10x_C3_Wells-A1_Points...,A1,1,3
3,data/input/sbs/P001_SBS_10x_C4_Wells-A1_Points...,A1,1,4
4,data/input/sbs/P001_SBS_10x_C5_Wells-A1_Points...,A1,1,5
5,data/input/sbs/P001_SBS_10x_C6_Wells-A1_Points...,A1,1,6
6,data/input/sbs/P001_SBS_10x_C7_Wells-A1_Points...,A1,1,7
7,data/input/sbs/P001_SBS_10x_C8_Wells-A1_Points...,A1,1,8
8,data/input/sbs/P001_SBS_10x_C9_Wells-A1_Points...,A1,1,9
9,data/input/sbs/P001_SBS_10x_C10_Wells-A1_Point...,A1,1,10


## Find phenotype samples

In [4]:
# Build one record per phenotype file whose name matches PHENOTYPE_SAMPLE_PATTERN.
# Entries in PHENOTYPE_DATA_FP whose names do not match are skipped.
phenotype_data = []

for file in PHENOTYPE_DATA_FP.iterdir():
    match = re.search(PHENOTYPE_SAMPLE_PATTERN, file.name)
    if match is None:
        continue

    # Pattern groups, in order: well, tile
    phenotype_data.append(
        {
            "sample_fp": str(file),
            "well": match.group(1),
            "tile": int(match.group(2)),
        }
    )

# An empty record list would produce a column-less DataFrame and an opaque
# KeyError in sort_values below; fail early with an actionable message instead.
if not phenotype_data:
    raise ValueError(
        f"No phenotype files in {PHENOTYPE_DATA_FP} matched pattern "
        f"{PHENOTYPE_SAMPLE_PATTERN!r}"
    )

# Create a DataFrame and sort by well and tile
phenotype_samples = pd.DataFrame(phenotype_data)
phenotype_samples = phenotype_samples.sort_values(by=["well", "tile"])
phenotype_samples = phenotype_samples.reset_index(drop=True)

# Make sure the output directory exists before writing the table
PHENOTYPE_SAMPLES_DF_FP.parent.mkdir(parents=True, exist_ok=True)
phenotype_samples.to_csv(PHENOTYPE_SAMPLES_DF_FP, sep="\t", index=False)
phenotype_samples

Unnamed: 0,sample_fp,well,tile
0,data/input/phenotype/P001_Pheno_20x_Wells-A1_P...,A1,1
1,data/input/phenotype/P001_Pheno_20x_Wells-A1_P...,A1,100
2,data/input/phenotype/P001_Pheno_20x_Wells-A2_P...,A2,1
3,data/input/phenotype/P001_Pheno_20x_Wells-A2_P...,A2,100
