# Configure Aggregate Module Params

Use this notebook to test and confirm the aggregate parameters before running aggregate processing.
Cells marked with <font color='red'>SET PARAMETERS</font> contain crucial variables that need to be set according to your specific experimental setup and data organization.
Please review and modify these variables as needed before proceeding with the analysis.

## <font color='red'>SET PARAMETERS</font>

### Fixed parameters for aggregate module

- `CONFIG_FILE_PATH`: Path to the Brieflow config file used during processing. The path can be absolute or relative to the directory the workflows are run from.

In [1]:
# Path to the Brieflow config file (YAML). It is read with yaml.safe_load in the
# data-loading cell to look up the analysis root directory.
CONFIG_FILE_PATH = "config/config.yml"

In [2]:
from pathlib import Path

import yaml
import pandas as pd
import matplotlib.pyplot as plt

from lib.shared.file_utils import get_filename
from lib.aggregate.load_format_data import clean_cell_data, load_parquet_subset
from lib.aggregate.feature_processing import feature_transform, grouped_standardization
from lib.aggregate.collapse_data import collapse_to_sgrna, collapse_to_gene
from lib.aggregate.eval_aggregate import suggest_parameters
from lib.aggregate.cell_classification import (
    plot_mitotic_distribution_hist,
    plot_mitotic_distribution_scatter,
    split_mitotic_simple,
)
from lib.aggregate.montage_utils import create_cell_montage, add_filenames
from lib.shared.configuration_utils import CONFIG_FILE_HEADER

## <font color='red'>SET PARAMETERS</font>

### Testing on subset of data

- `TEST_PLATE`: Plate used for testing configuration 
- `TEST_WELL`: Well identifier used for testing configuration 
- `POPULATION_FEATURE`: The column name that identifies your perturbation groups (e.g., 'gene_symbol_0' for CRISPR screens, 'treatment' for drug screens)
- `FILTER_SINGLE_GENE`: Whether to keep ONLY cells with `mapped_single_gene=True`.

In [3]:
# Plate and well used to build the filename of the merge_final parquet that is
# loaded for testing.
TEST_PLATE = 1
TEST_WELL = "A2"

# Column that identifies perturbation groups; used to count unique populations
# and passed to clean_cell_data.
POPULATION_FEATURE = "gene_symbol_0"
# Forwarded to clean_cell_data as filter_single_gene; per the parameter notes
# above, True keeps only cells with mapped_single_gene=True.
FILTER_SINGLE_GENE = False

In [4]:
# Read the Brieflow config and resolve the analysis root directory from it
with open(CONFIG_FILE_PATH) as config_file:
    config = yaml.safe_load(config_file)
ROOT_FP = Path(config["all"]["root_fp"])

# Locate the merge_final parquet for the test plate/well
test_wildcards = {"plate": TEST_PLATE, "well": TEST_WELL}
merge_final_name = get_filename(test_wildcards, "merge_final", "parquet")
merge_final_fp = ROOT_FP / "merge" / "parquets" / merge_final_name

# Load a subset of the merged data (takes ~1 minute)
merge_final = load_parquet_subset(merge_final_fp)
n_populations = merge_final[POPULATION_FEATURE].nunique()
print(f"Unique populations: {n_populations}")

# Remove unassigned cells
clean_df = clean_cell_data(
    merge_final,
    POPULATION_FEATURE,
    filter_single_gene=FILTER_SINGLE_GENE,
)

# NOTE: these counts describe the loaded (unfiltered) frame, not clean_df
n_cells = len(merge_final)
n_features = len(merge_final.columns)
print(f"Loaded {n_cells} cells with {n_features} features")

Reading first 50,000 rows from analysis_root/merge/parquets/P-1_W-A2__merge_final.parquet
Unique populations: 4435
Found 19265 cells with assigned perturbations
Loaded 50000 cells with 1684 features


## Add aggregate parameters to config file

In [None]:
# NOTE: This cell is currently disabled (every line is commented out).
# It references names that are not defined in the visible part of this notebook
# (TRANFORMATIONS_FP [sic -- confirm spelling where it is defined], FEATURE_START,
# CONTROL_PREFIX, GROUP_COLUMNS, INDEX_COLUMNS, CAT_COLUMNS, THRESHOLD_CONDITIONS);
# make sure they are defined before uncommenting.
# When enabled, the file at CONFIG_FILE_PATH is opened in "w" mode, so its
# previous contents are replaced by the header plus the dumped `config` mapping.

# # Add aggregate section
# config["aggregate"] = {
#     "transformations_fp": TRANFORMATIONS_FP,
#     "population_feature": POPULATION_FEATURE,
#     "filter_single_gene": FILTER_SINGLE_GENE,
#     "feature_start": FEATURE_START,
#     "control_prefix": CONTROL_PREFIX,
#     "group_columns": GROUP_COLUMNS,
#     "index_columns": INDEX_COLUMNS,
#     "cat_columns": CAT_COLUMNS,
#     "threshold_conditions": THRESHOLD_CONDITIONS,
# }

# # Write the updated configuration
# with open(CONFIG_FILE_PATH, "w") as config_file:
#     # Write the introductory comments
#     config_file.write(CONFIG_FILE_HEADER)

#     # Dump the updated YAML structure, keeping markdown comments for sections
#     # NOTE(review): yaml.dump only serializes the `config` mapping; presumably any
#     # section comments come from CONFIG_FILE_HEADER -- confirm.
#     yaml.dump(config, config_file, default_flow_style=False)