# Configure Cluster Module Params

Use this notebook to set and verify the cluster parameters before running cluster processing.
Cells marked with `SET PARAMETERS` contain crucial variables that need to be set according to your specific experimental setup and data organization.
Please review and modify these variables as needed before proceeding with the analysis.

## SET PARAMETERS

### Fixed parameters for cluster processing

- `CONFIG_FILE_PATH`: Path to a Brieflow config file used during processing. Absolute or relative to where workflows are run from.

In [1]:
# Path to the Brieflow config file; this notebook loads it and later rewrites it
# with the cluster_process section added.
CONFIG_FILE_PATH = "config/config.yml"

In [2]:
from pathlib import Path

import yaml
import pandas as pd

from lib.shared.configuration_utils import CONFIG_FILE_HEADER

In [3]:
# Parse the YAML config, then read the project root path from its "all" section
with open(CONFIG_FILE_PATH) as config_file:
    config = yaml.safe_load(config_file)

ROOT_FP = Path(config["all"]["root_fp"])

In [4]:
# Minimum cell cutoff per cell class
# NOTE(review): exact semantics are defined by the downstream cluster workflow — confirm there
MIN_CELL_CUTOFFS = {
    "mitotic": 0,
    "interphase": 3,
    "all": 3,
}

# Analysis parameters (each is stored under the config's cluster_process section)
CORRELATION_THRESHOLD = 0.99
VARIANCE_THRESHOLD = 0.001
MIN_UNIQUE_VALUES = 5
LEIDEN_RESOLUTION = 5.0

In [9]:
# Destination paths for the reference databases; these paths are recorded in the config
UNIPROT_DATA_FP = "config/uniprot_data.tsv"
STRING_DATA_FP = "config/string_data.tsv"
CORUM_DATA_FP = "config/corum_data.tsv"

# Directory holding the source database files. This is an environment-specific
# absolute path: change it here (one place) to match your setup.
# TODO: create data below with API
DATABASES_DIR = Path("/lab/barcheese01/screens/denali-etna-fuji/cluster_5/databases")


def export_database_as_tsv(source_fp, dest_fp, **read_csv_kwargs):
    """Read a delimited database file, rewrite it as a TSV, and print its shape.

    Args:
        source_fp: Path of the source file to read with `pd.read_csv`.
        dest_fp: Path of the tab-separated file to write (index omitted).
        **read_csv_kwargs: Extra keyword arguments forwarded to `pd.read_csv`
            (e.g. `sep="\t"` for tab-separated sources).

    Returns:
        The loaded DataFrame.
    """
    database = pd.read_csv(source_fp, **read_csv_kwargs)
    # Make sure the destination directory exists so the write cannot fail on a fresh checkout
    Path(dest_fp).parent.mkdir(parents=True, exist_ok=True)
    database.to_csv(dest_fp, sep="\t", index=False)
    print(database.shape)
    return database


# The UniProt source is read with pandas' default (comma) separator; the other two are tab-separated
uniprot_data = export_database_as_tsv(
    DATABASES_DIR / "uniprot_complete_data.csv", UNIPROT_DATA_FP
)
string_data = export_database_as_tsv(
    DATABASES_DIR / "9606.protein.links.v12.0.txt", STRING_DATA_FP, sep="\t"
)
corum_data = export_database_as_tsv(
    DATABASES_DIR / "corum_humanComplexes.txt", CORUM_DATA_FP, sep="\t"
)

(20461, 5)
(117118, 3)
(5125, 28)


## Add cluster parameters to config file

In [12]:
# Add cluster_process section
config["cluster_process"] = {
    "min_cell_cutoffs": MIN_CELL_CUTOFFS,
    "correlation_threshold": CORRELATION_THRESHOLD,
    "variance_threshold": VARIANCE_THRESHOLD,
    "min_unique_values": MIN_UNIQUE_VALUES,
    "leiden_resolution": LEIDEN_RESOLUTION,
    "uniprot_data_fp": UNIPROT_DATA_FP,
    "string_data_fp": STRING_DATA_FP,
    "corum_data_fp": CORUM_DATA_FP,
}

# Serialize BEFORE opening the file: opening with "w" truncates it immediately,
# so a failure inside yaml.dump must not be able to leave the config empty or
# half-written. With no stream argument, yaml.dump returns the YAML text.
# Note: comments from the loaded file are not round-tripped by PyYAML; only
# CONFIG_FILE_HEADER is re-added at the top.
config_yaml = yaml.dump(config, default_flow_style=False)

# Write the updated configuration
with open(CONFIG_FILE_PATH, "w") as config_file:
    # Write the introductory comments
    config_file.write(CONFIG_FILE_HEADER)

    # Write the updated YAML structure
    config_file.write(config_yaml)