# Configure Preprocessing Params

This notebook should be used to set up preprocessing params.
Cells marked with `SET PARAMETERS` contain crucial variables that need to be set according to your specific experimental setup and data organization.
Please review and modify these variables as needed before proceeding with the analysis.

## Imports

In [19]:
import re
from pathlib import Path
from itertools import chain

import pandas as pd
import yaml

from lib.shared.configuration_utils import CONFIG_FILE_HEADER

## SET PARAMETERS

### Fixed parameters for preprocessing

- `CONFIG_FILE_PATH`: Path to a Brieflow config file used during processing*.
- `ROOT_FP`: Path to root of Brieflow output directory*.

*Note: Paths can be absolute or relative to where workflows are run from.

In [20]:
# Where the Brieflow config file is written by the final cell of this notebook
CONFIG_FILE_PATH = "config/config.yml"
# Root of the Brieflow output directory; stored as `root_fp` in the config's "all" section
ROOT_FP = "analysis_root/"

## SET PARAMETERS

### Paths to dataframes with sample information

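- `SBS_SAMPLES_DF_FP`/`PHENOTYPE_SAMPLES_DF_FP`: Path to dataframe with SBS/phenotype samples location and metadata*.
- `SBS_REAL_IMAGES_FP`/`SBS_EMPTY_IMAGES_FP` and `PHENOTYPE_REAL_IMAGES_FP`/`PHENOTYPE_EMPTY_IMAGES_FP`: Paths to the directories with real/empty SBS and phenotype sample nd2 files*.
- `SBS_SAMPLE_PATTERN`/`PHENOTYPE_SAMPLE_PATTERN`: Regex patterns to match SBS/phenotype sample nd2 files. The SBS pattern must capture cycle, well, and tile (in that order); the phenotype pattern must capture well and tile (in that order).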

*Note: Paths can be absolute or relative to where workflows are run from.

In [21]:
# paths to sample dataframes (tab-separated tables written by the sample-finding cells)
SBS_SAMPLES_DF_FP = "config/sbs_samples.tsv"
PHENOTYPE_SAMPLES_DF_FP = "config/phenotype_samples.tsv"

# directories with SBS/phenotype sample nd2 files
# (both the "real" and the "empty" directory of each type are scanned)
SBS_REAL_IMAGES_FP = Path("data/real_images/sbs/")
SBS_EMPTY_IMAGES_FP = Path("data/empty_images/sbs/")
PHENOTYPE_REAL_IMAGES_FP = Path("data/real_images/phenotype/")
PHENOTYPE_EMPTY_IMAGES_FP = Path("data/empty_images/phenotype/")

# patterns to match SBS/phenotype sample nd2 files
# These match the following example nd2 file names:
#   SBS: P001_SBS_10x_C9_Wells-A1_Points-100__Channel_Cy7,Cy5,AF594,Cy3_SBS,DAPI_SBS.nd2
#   Phenotype: P001_Pheno_20x_Wells-A1_Points-001__Channel_AF750,Cy3,GFP,DAPI.nd2
# Capture groups are read positionally by the sample-finding cells:
#   SBS: (cycle, well, tile); phenotype: (well, tile)
# NOTE: `Points-(\d{3})__Channel` requires exactly three tile digits; file names
# with a different digit count will not match and are silently skipped.
SBS_SAMPLE_PATTERN = r"C(\d+)_Wells-([A-Z]\d+)_Points-(\d{3})__Channel"
PHENOTYPE_SAMPLE_PATTERN = r"Wells-([A-Z]\d+)_Points-(\d{3})__Channel"

## Find SBS samples

In [22]:
# Build the SBS samples table: one row per nd2 file whose name matches
# SBS_SAMPLE_PATTERN, then write it to SBS_SAMPLES_DF_FP as a TSV.

# List to store file information
sbs_data = []

# Iterate over files in both the real and empty image directories
for file in chain(SBS_REAL_IMAGES_FP.iterdir(), SBS_EMPTY_IMAGES_FP.iterdir()):
    match = re.search(SBS_SAMPLE_PATTERN, file.name)
    if match:
        # Capture groups are positional: (cycle, well, tile)
        cycle = int(match.group(1))
        well = match.group(2)
        tile = int(match.group(3))

        sbs_data.append(
            {"sample_fp": str(file), "well": well, "tile": tile, "cycle": cycle}
        )

# Surface the "nothing matched" case explicitly instead of failing later with
# an opaque KeyError from sort_values on a column-less frame
if not sbs_data:
    print(
        f"WARNING: no files in {SBS_REAL_IMAGES_FP} or {SBS_EMPTY_IMAGES_FP} "
        "matched SBS_SAMPLE_PATTERN; writing an empty SBS samples table."
    )

# Create a DataFrame with a fixed column schema (so it is well-formed even when
# empty) and sort by well, tile, and cycle.
# Note: `well` is a string, so wells sort lexicographically ("A10" before "A2").
sbs_samples = (
    pd.DataFrame(sbs_data, columns=["sample_fp", "well", "tile", "cycle"])
    .sort_values(by=["well", "tile", "cycle"])
    .reset_index(drop=True)
)

sbs_samples.to_csv(SBS_SAMPLES_DF_FP, sep="\t", index=False)
sbs_samples

Unnamed: 0,sample_fp,well,tile,cycle
0,data/real_images/sbs/P001_SBS_10x_C1_Wells-A1_...,A1,1,1
1,data/real_images/sbs/P001_SBS_10x_C2_Wells-A1_...,A1,1,2
2,data/real_images/sbs/P001_SBS_10x_C3_Wells-A1_...,A1,1,3
3,data/real_images/sbs/P001_SBS_10x_C4_Wells-A1_...,A1,1,4
4,data/real_images/sbs/P001_SBS_10x_C5_Wells-A1_...,A1,1,5
...,...,...,...,...
61,data/real_images/sbs/P001_SBS_10x_C7_Wells-A2_...,A2,100,7
62,data/real_images/sbs/P001_SBS_10x_C8_Wells-A2_...,A2,100,8
63,data/real_images/sbs/P001_SBS_10x_C9_Wells-A2_...,A2,100,9
64,data/real_images/sbs/P001_SBS_10x_C10_Wells-A2...,A2,100,10


## Find phenotype samples

In [23]:
# Build the phenotype samples table: one row per nd2 file whose name matches
# PHENOTYPE_SAMPLE_PATTERN, then write it to PHENOTYPE_SAMPLES_DF_FP as a TSV.

# List to store file information
phenotype_data = []

# Iterate over files in both the real and empty image directories
for file in chain(PHENOTYPE_REAL_IMAGES_FP.iterdir(), PHENOTYPE_EMPTY_IMAGES_FP.iterdir()):
    match = re.search(PHENOTYPE_SAMPLE_PATTERN, file.name)
    if match:
        # Capture groups are positional: (well, tile)
        well = match.group(1)
        tile = int(match.group(2))

        phenotype_data.append({"sample_fp": str(file), "well": well, "tile": tile})

# Surface the "nothing matched" case explicitly instead of failing later with
# an opaque KeyError from sort_values on a column-less frame
if not phenotype_data:
    print(
        f"WARNING: no files in {PHENOTYPE_REAL_IMAGES_FP} or {PHENOTYPE_EMPTY_IMAGES_FP} "
        "matched PHENOTYPE_SAMPLE_PATTERN; writing an empty phenotype samples table."
    )

# Create a DataFrame with a fixed column schema (so it is well-formed even when
# empty) and sort by well and tile (phenotype samples have no cycle).
# Note: `well` is a string, so wells sort lexicographically ("A10" before "A2").
phenotype_samples = (
    pd.DataFrame(phenotype_data, columns=["sample_fp", "well", "tile"])
    .sort_values(by=["well", "tile"])
    .reset_index(drop=True)
)

phenotype_samples.to_csv(PHENOTYPE_SAMPLES_DF_FP, sep="\t", index=False)
phenotype_samples

Unnamed: 0,sample_fp,well,tile
0,data/real_images/phenotype/P001_Pheno_20x_Well...,A1,1
1,data/empty_images/phenotype/P003_Pheno_20x_Wel...,A1,2
2,data/real_images/phenotype/P001_Pheno_20x_Well...,A1,100
3,data/real_images/phenotype/P001_Pheno_20x_Well...,A2,1
4,data/empty_images/phenotype/P003_Pheno_20x_Wel...,A2,2
5,data/real_images/phenotype/P001_Pheno_20x_Well...,A2,100


## Create config file with params

In [24]:
# Assemble the configuration in one literal:
#   "all"        -> root of the Brieflow output directory
#   "preprocess" -> locations of the SBS and phenotype sample tables
config = {
    "all": {
        "root_fp": ROOT_FP,
    },
    "preprocess": {
        "sbs_samples_fp": SBS_SAMPLES_DF_FP,
        "phenotype_samples_fp": PHENOTYPE_SAMPLES_DF_FP,
    },
}

# Write the config file (mode "w" replaces any existing file at CONFIG_FILE_PATH)
with open(CONFIG_FILE_PATH, "w") as config_file:
    # CONFIG_FILE_HEADER goes first, ahead of the YAML body
    config_file.write(CONFIG_FILE_HEADER)

    # Serialize the sections below the header in block (non-flow) YAML style
    yaml.dump(config, config_file, default_flow_style=False)