## Set parameters

In [None]:
import os

# Name of the TOML configuration file; it is joined to DATA_ROOT to build `config_file`
CONFIG_FILE_NAME = "braian_config.toml"                     # assumes the file is in DATA_ROOT directory
# Select ONE experiment by leaving a single assignment uncommented.
# USE_REMOTE_DATA -> if True, DATA_ROOT and PLOTS_ROOT are replaced by the directories returned
#                    by BraiAn.remote_dirs(), i.e. it tries to read the data on the laboratory's server
# EXPERIMENT_DIRECTORY, USE_REMOTE_DATA = "p6", False
# EXPERIMENT_DIRECTORY, USE_REMOTE_DATA = "experiment", False
# EXPERIMENT_DIRECTORY, USE_REMOTE_DATA = "proof", False
EXPERIMENT_DIRECTORY, USE_REMOTE_DATA = "rebased_on_mjd", False
# EXPERIMENT_DIRECTORY, USE_REMOTE_DATA = "Cariplo_NRe/IEGs Experiment", True

# ###################################### REMOTE DIRECTORIES #####################################
# Both values are only forwarded to BraiAn.remote_dirs(), which is called when USE_REMOTE_DATA is True
IS_COLLABORATION_PROJ = False
COLLABORATION_DIRECTORY = os.path.join("Mathias Schmidt", "soumnya")

# ###################################### LOCAL DIRECTORIES ######################################
# Experiment directories are written with "/" above: convert them to the separator of the running OS
EXPERIMENT_DIRECTORY = EXPERIMENT_DIRECTORY.replace("/", os.sep)
# DATA_ROOT  = f"../data/experiments/sowmya/{EXPERIMENT_DIRECTORY}"
# PLOTS_ROOT = f"../plots/sowmya/{EXPERIMENT_DIRECTORY}"
# Relative paths: they are resolved against the current working directory
DATA_ROOT  = f"../data/experiments/{EXPERIMENT_DIRECTORY}"
PLOTS_ROOT = f"../plots/{EXPERIMENT_DIRECTORY}"

In [None]:
# Which comparison of CONFIG_FILE_NAME the PLS analysis is run on.
# It is looked up as config["comparison"][str(COMPARISON_ID)] once the configuration is loaded.
COMPARISON_ID = 1

In [None]:
# ####################################### GENERAL OPTIONS #######################################
# Extension appended to the name of every saved plot. It also selects how the figure is written:
# ".html" (case-insensitive) goes through fig.write_html(), anything else through fig.write_image().
SAVED_PLOT_EXTENSION = ".html"                      # '.html' for interactive plot
                                                    # '.svg' for vectorized image
                                                    # '.png'/'.jpg'/... for rasterized image

# ######################################### PLS OPTIONS #########################################
# If True, the permutation cell shows the figure returned by BraiAn.plot_permutation()
PLOT_DISTRIBUTION_OF_SINGULAR_VALUES = True

# ##################################### SALIENCE SCORE PLOT #####################################
# SHOW_/SAVE_ toggle fig.show() and the writing of the figure into plots_output_path.
# The remaining options are forwarded to BraiAn.plot_salient_regions() as, respectively:
# title_size, axis_size, use_acronyms, use_acronyms_in_mjd, mjd_opacity, width, barheight
SHOW_SALIENCE_SCORES_PLOT = True
SAVE_SALIENCE_SCORES_PLOT = True
SALIENCE_TITLE_TEXT_SIZE = 40
SALIENCE_AXIS_TEXT_SIZE = 22
SALIENCE_USE_ACRONYMS = True
SALIENCE_USE_ACRONYMS_IN_MJD = False
SALIENCE_MJD_BG_OPACITY = 0.3
SALIENCE_WIDTH = 1000
SALIENCE_BARHEIGHT = 30

# ########################################## PIE CHART ##########################################
# PIE_SAVE_PLOT/PIE_SHOW_PLOT apply to both pie charts (all the regions, and the salient ones only).
# The remaining options are forwarded to BraiAn.plot_pie() as: use_acronyms, hole, text_size
PIE_SAVE_PLOT = True
PIE_SHOW_PLOT = False
PIE_USE_ACRONYMS = False
PIE_HOLE = 0.4                                          # a value between 0 (no hole) and 1 (just a hole, no plot)
PIE_TEXT_SIZE = 25

# ########################################## BAR PLOT ###########################################
# BAR_SAVE_PLOT/BAR_SHOW_PLOT toggle the writing and the displaying of the figure.
# The remaining options are forwarded to BraiAn.plot_groups() as, respectively:
# animal_size, title_size, axis_size, barheight, width, plot_title, use_acronyms
BAR_SAVE_PLOT = True
BAR_SHOW_PLOT = True
BAR_ANIMAL_SIZE = 8
BAR_TITLE_TEXT_SIZE = 40
BAR_AXIS_TEXT_SIZE = 22
BAR_HEIGHT = 30
BAR_WIDTH = 1_500
BAR_TITLE = ""
BAR_USE_ACRONYMS = True

## Script's code

In [None]:
import pandas as pd
import sys
from plotly.colors import DEFAULT_PLOTLY_COLORS

# The project root is taken to be the parent of the current working directory
working_directory = os.path.abspath(os.getcwd())
project_path = os.path.dirname(working_directory)
# Make the project root importable, so that the BraiAn package imported right below can be found
sys.path.append(project_path)
import BraiAn

In [None]:
# When working on remote data, both roots set in the parameters are replaced
# by the pair of directories returned by BraiAn.remote_dirs()
if USE_REMOTE_DATA:
    DATA_ROOT, PLOTS_ROOT = BraiAn.remote_dirs(EXPERIMENT_DIRECTORY, IS_COLLABORATION_PROJ, COLLABORATION_DIRECTORY)

# The configuration file is expected directly inside DATA_ROOT
config_file = os.path.join(DATA_ROOT, CONFIG_FILE_NAME)

In [None]:
import tomllib

# tomllib.load() only accepts a file object opened in binary mode, hence "rb"
with open(config_file, "rb") as f:
    config = tomllib.load(f)
# NOTE: with IPython's default settings only the LAST expression of a cell is displayed,
# so this bare expression has no visible effect here
config
# ######################################### LOAD CONFIG #########################################
EXPERIMENT_NAME = config["experiment"]["name"]

ATLAS_VERSION = config["atlas"]["version"]
BRANCHES_TO_EXCLUDE = config["atlas"]["excluded-branches"]

NORMALIZATION = config["brains"]["normalization"]
MIN_AREA = config["pls"]["min-area"]                                    # area in mm². If a region of one animal is smaller, that same region won't be considered in the PLS
                                                                        # That is because the PLS only considers the brain regions that appear in every animal of the groups
REGIONS_TO_PLOT_SELECTION_METHOD = config["pls"]["regions-to-plot"]     # Available options are: "summary structures", "major divisions", "smallest", "depth <n>", "structural level <n>"
                                                                        # where <n> is an integer of the depth/level desired
PLS_SALIENCE_THRESHOLD = config["pls"]["salience-threshold"]            # Only brain regions with a salience higher than this threshold are shown. 2 is the significance threshold.
PLS_RANK = config["pls"]["rank"]                                        # forwarded to pls.bootstrap_salience_scores()
PLS_NUM_BOOTSTRAP = config["pls"]["num-bootstrap"]                      # number of bootstrap samples
PLS_NUM_PERMUTATIONS = config["pls"]["num-permutations"]                # number of random permutations

from collections import namedtuple

# One GroupDirectory per `group<N>` table found in the "experiment" section of the configuration.
# `id` is zero-based (N-1); `name` and `dirs` are copied verbatim from the table.
GroupDirectory = namedtuple("GroupDirectory", "id name dirs")
groups = [
    GroupDirectory(
        id=int(group[len("group"):])-1,
        name=config["experiment"][group]["name"],
        dirs=config["experiment"][group]["dirs"]
    ) for group in config["experiment"] if group.startswith("group") and group[len("group"):].isdigit()
]

comparison_config = config["comparison"][str(COMPARISON_ID)]
comparison_groups = comparison_config["groups"]             # 1-based group numbers, as in `group<N>`
# Explicit raise instead of `assert`: assertions are stripped when Python runs with -O
if len(comparison_groups) != 2:
    raise ValueError(f"The selected comparison is between {len(comparison_groups)} groups. PLS can only be computed between two groups!")

# Groups are resolved by their id and NOT by their position in `groups`: the list follows the order
# in which the tables appear in the config file, so positional indexing would select the wrong group
# whenever the tables are out of order or not contiguous (and group number 0 would silently wrap
# around to the last one).
groups_by_id = {group.id: group for group in groups}
undefined_groups = [number for number in comparison_groups if number-1 not in groups_by_id]
if undefined_groups:
    raise ValueError(f"Comparison {COMPARISON_ID} refers to group(s) {undefined_groups}, "
                     "but the configuration has no matching 'group<N>' table in its 'experiment' section")

# group1/group2: the two groups being compared; dir: name of the sub-directory for this comparison's outputs
Comparison = namedtuple("Comparison", "group1 group2 dir")
comparison = Comparison(
    group1=groups_by_id[comparison_groups[0]-1],
    group2=groups_by_id[comparison_groups[1]-1],
    dir=comparison_config["dir"]
)

In [None]:
# The input data are read from DATA_ROOT/BraiAn_output. Everything produced for the selected
# comparison goes into a sub-directory named after it, both for the data and for the plots.
data_input_path = os.path.join(DATA_ROOT, "BraiAn_output")
data_output_path = os.path.join(data_input_path, comparison.dir)
plots_output_path = os.path.join(PLOTS_ROOT, comparison.dir)

# Create the output directories that do not exist yet
for output_dir in (data_output_path, plots_output_path):
    if os.path.exists(output_dir):
        continue
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# The ontology is the Allen Mouse Brain structure graph, downloaded
# from https://help.brain-map.org/display/api/Downloading+an+Ontology%27s+Structure+Graph
# StructureGraph id=1
path_to_allen_json = os.path.join(project_path, "data", "AllenMouseBrainOntology.json")
# presumably downloads the URL into path_to_allen_json when the file is missing — verify against BraiAn.cache
BraiAn.cache(path_to_allen_json, "http://api.brain-map.org/api/v2/structure_graph_download/1.json")
# The branches listed under "excluded-branches" in the configuration are handed over to the hierarchy
brain_onthology = BraiAn.AllenBrainHierarchy(path_to_allen_json, BRANCHES_TO_EXCLUDE, version=ATLAS_VERSION)

In [None]:
match REGIONS_TO_PLOT_SELECTION_METHOD:
    case "summary structures":
        # selects the Summary Strucutures
        path_to_summary_structures = os.path.join(project_path, "data", "AllenSummaryStructures.csv")
        brain_onthology.select_from_csv(path_to_summary_structures)
    case "major divisions":
        brain_onthology.select_regions(BraiAn.MAJOR_DIVISIONS)
    case "smallest":
        brain_onthology.select_leaves()
    case s if s.startswith("depth"):
        n = REGIONS_TO_PLOT_SELECTION_METHOD.split(" ")[-1]
        try:
            depth = int(n)
        except Exception:
            raise Exception("Could not retrieve the <n> parameter of the 'depth' method for 'REGIONS_TO_PLOT_SELECTION_METHOD'")
        brain_onthology.select_at_depth(depth)
    case s if s.startswith("structural level"):
        n = REGIONS_TO_PLOT_SELECTION_METHOD.split(" ")[-1]
        try:
            level = int(n)
        except Exception:
            raise Exception("Could not retrieve the <n> parameter of the 'structural level' method for 'REGIONS_TO_PLOT_SELECTION_METHOD'")
        brain_onthology.select_at_structural_level(level)
    case _:
        raise Exception(f"Invalid value '{REGIONS_TO_PLOT_SELECTION_METHOD}' for REGIONS_TO_PLOT_SELECTION_METHOD")
selected_regions = brain_onthology.get_selected_regions()
print(f"You selected {len(selected_regions)} regions to do PLS analysis over.")

In [None]:
# Each group is read from data_input_path, from a CSV named after the group and the normalization
# in use. Right after loading, MIN_AREA is applied through remove_smaller_subregions().
group_1_csv = f"cell_counts_{comparison.group1.name}_{NORMALIZATION.lower()}.csv"
group_1 = BraiAn.AnimalGroup.from_csv(comparison.group1.name, data_input_path, group_1_csv)
group_1.remove_smaller_subregions(MIN_AREA, brain_onthology)

group_2_csv = f"cell_counts_{comparison.group2.name}_{NORMALIZATION.lower()}.csv"
group_2 = BraiAn.AnimalGroup.from_csv(comparison.group2.name, data_input_path, group_2_csv)
group_2.remove_smaller_subregions(MIN_AREA, brain_onthology)

# Stop here when the two groups cannot be compared
groups_are_comparable = group_1.is_comparable(group_2)
if not groups_are_comparable:
    raise ImportError(
        "Group 1 and Group 2 are not comparable!\n"
        "Please check that you're reading two groups that normalized on the same brain regions and on the same marker"
    )

The data are stored in ```group_1.data``` and ```group_2.data```:

In [None]:
# Display the data of the first group as the output of the cell
group_1.to_pandas()

# Partial Least Squares  

The analysis done below is taken from the tutorial written by [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074).  
Run the 2 cells below to get started.

In [None]:
# Create a PLS object over the selected regions of the two groups.
# Only the first marker of group_1 is analysed.
pls = BraiAn.PLS(selected_regions, group_1, group_2, marker=group_1.markers[0])

# Show the matrix X (displayed as the output of the cell)
pls.X

In [None]:
# Show the matrix Y (displayed as the output of the cell)
pls.Y

The two matrices printed above (X and Y) illustrate the data on which the PLS is done.  
- ```X:``` The rows in this matrix are the mice. The columns in the matrix are the regions selected for analysis. The values in the matrix are the **normalized value of marked cells in that region, relative to the whole brain.**
The normalization methods are either:
  + Density
  + Percentage (on the total number of detected marked cells outside of excluded regions)
  + RelativeDensity
- ```Y:``` The rows in this matrix are the mice. The columns in the matrix are the 2 groups. **A value in this matrix is 1 if the mouse belongs to the specified group**.

In brief, PLS analyzes the relationship (correlation) between the columns of ```X``` and ```Y```. In our specific case, there will be 2 important outputs:
- **Salience scores**: Each brain region has a salience score. A high salience score means that the brain region explains much of the correlation between ```X``` and ```Y```.  
- **Singular values**: These are the singular values of the correlation matrix $R = Y^TX$, i.e. the square roots of the eigenvalues of $R^TR$.

## Random permutations to see whether we can differentiate signal from noise. 
Here, we randomly shuffle the group to which a mouse belongs, and calculate the singular values of the permuted dataset.  
From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> The set of all the (permuted) singular values provides a sampling distribution of the singular values under the null hypothesis and, therefore can be used as a null hypothesis test.

*Note: running the cell below will take a few minutes.*

In [None]:
print(f"Randomly permuting singular values {PLS_NUM_PERMUTATIONS} times...")
# `singular_values` holds the singular values of the permuted datasets (presumably one row per
# permutation — the p-value cell reads its first column). `s` is what they are compared against there.
# NOTE(review): the plot below uses `pls.s[0]` while the p-value cell uses `s[0]` — presumably the same value; confirm.
s,singular_values = pls.randomly_permute_singular_values(PLS_NUM_PERMUTATIONS)
# Plot distribution of singular values
if PLOT_DISTRIBUTION_OF_SINGULAR_VALUES:
    fig = BraiAn.plot_permutation(pls.s[0], singular_values, PLS_NUM_PERMUTATIONS)
    fig.show()

In [None]:
# Calculate p-value = Probability(experiment | H0):
# the fraction of permutations whose first singular value is strictly greater than s[0]
num_greater_permutations = (singular_values[:,0] > s[0]).sum()
p = num_greater_permutations / PLS_NUM_PERMUTATIONS
print(f"p-value = {p!s}")

In [None]:
# Plot the salience of the groups for `component=1` (presumably the first PLS component — verify the indexing convention)
BraiAn.plot_groups_salience(pls, component=1)

In [None]:
# Plot the latent variables of the matrix X
BraiAn.plot_latent_variables(pls, of="X")

## Bootstrap to identify stable salience scores

Here, we use [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) (= sampling of the mice in the dataset, with replacement) to get an estimate of which salience scores are stable.

From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> When a vector of saliences is considered generalizable and is kept for further analysis, we need to identify its elements that are stable through resampling. In practice, the stability of an element is evaluated by dividing it by its standard error. [...] To estimate the standard errors, we create bootstrap samples which are obtained by sampling with replacement the observations in $X$ and $Y$ (Efron and Tibshirani, 1986). A salience standard error is then estimated as the standard error of the saliences from a large number of these bootstrap samples (say 1000 or 10000). **The ratios are akin to a Z-score, therefore when they are larger than 2 the corresponding saliences are considered significantly stable.**

*Note: Running the cell below will take a few minutes.*

In [None]:
print(f"Bootstrapping salience scores {PLS_NUM_BOOTSTRAP} times...")
# Only `v_salience_scores` is used afterwards: it is saved to CSV (column 0 renamed to "salience_score")
# and its index provides the regions shown in the pie chart of all the regions.
u_salience_scores,v_salience_scores = pls.bootstrap_salience_scores(PLS_RANK, PLS_NUM_BOOTSTRAP)

In [None]:
# Display the entry `0` of the bootstrapped salience scores (the column saved as "salience_score" later on)
v_salience_scores[0]

In [None]:
# Pieces shared by the names of the files written from here on
regions_to_plot_selection_method_str = REGIONS_TO_PLOT_SELECTION_METHOD.replace(" ", "_")
pls_salience_threshold_str = str(PLS_SALIENCE_THRESHOLD).replace(".", "_")
pls_file_prefix = f"PLS_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}"

# Regions above the salience threshold, as a table with the columns "acronym" and "salience_score"
regions_above_threshold = pls.above_threshold(PLS_SALIENCE_THRESHOLD)
salient_regions = regions_above_threshold.reset_index().rename(columns={"index":"acronym", 0: "salience_score"})

# save the salient regions in a CSV
salient_regions_file = f"{pls_file_prefix}_above_{pls_salience_threshold_str}.csv"
BraiAn.save_csv(salient_regions, data_output_path, salient_regions_file.lower(), overwrite=True)

# save ALL the regions with their salience score
all_salience_scores = v_salience_scores.rename(columns={0:"salience_score"})
pls_filename = f"{pls_file_prefix}_salience_scores.csv"
BraiAn.save_csv(all_salience_scores, data_output_path, pls_filename.lower(), overwrite=True)

In [None]:
import numpy as np

def pls_(x, y):
    """Decompose the centred, column-normalised cross-product of ``y`` and ``x``.

    Row ``k`` of the intermediate matrix is ``y[:, k] @ x`` divided by the sum
    of ``y[:, k]``; when ``y`` is a 0/1 membership matrix this is the mean of
    the rows of ``x`` belonging to group ``k``. The mean over those rows is
    then subtracted from each of them and the result goes through a reduced
    SVD.

    Returns:
        The triple ``(u, s, v)`` exactly as returned by
        ``np.linalg.svd(..., full_matrices=False)``; ``v`` is therefore the
        transposed right factor (one component per row).
    """
    n_observations = y.shape[0]
    column_totals = np.ones((1, n_observations)) @ y
    inverse_totals = np.diagflat(column_totals ** (-1))
    group_profiles = inverse_totals @ y.transpose() @ x

    n_groups = group_profiles.shape[0]
    mean_profile = (np.ones((1, n_groups)) @ group_profiles) / n_groups
    centred = group_profiles - np.ones((n_groups, 1)) @ mean_profile

    left, singular, right_t = np.linalg.svd(centred, full_matrices=False)
    return left, singular, right_t

def procrustes(u, s, v, u0):
    """Rotate the factors ``u`` and ``v`` with a matrix derived from ``u0.T @ u``.

    The rotation is the product of the left factor of the SVD of ``u0.T @ u``
    with the transpose of the third value returned by ``np.linalg.svd``.

    Returns:
        ``(ur, vr)`` where ``ur`` is ``u @ diag(s)`` times the rotation and
        ``vr`` is the transpose of ``v.T`` times the rotation.
    """
    import numpy as np
    cross_product = u0.transpose() @ u
    left, _, right_t = np.linalg.svd(cross_product, full_matrices=False)
    # NOTE(review): np.linalg.svd already returns the right factor transposed (Vh), so this
    # multiplies by V rather than Vh — confirm this is the intended rotation.
    rotation = left @ right_t.transpose()
    rotated_u = u @ np.diagflat(s) @ rotation
    rotated_v = (v.transpose() @ rotation).transpose()
    return rotated_u, rotated_v

def bootstrap_test(x, y, v0, u0, n):
    """Estimate the spread of the rotated ``v`` factor over ``n`` bootstrap samples.

    For every sample, the rows of ``x`` and ``y`` are drawn together, with
    replacement, as many times as ``x`` has rows. A draw is repeated for as
    long as some column of the resampled ``y`` is entirely zero. Each sample
    goes through ``pls_`` and its factors are rotated towards ``u0`` with
    ``procrustes``.

    Args:
        x, y: arrays resampled along their first axis with the same indices.
        v0: only its shape is used, to size the buffer of rotated factors.
        u0: reference factor handed to ``procrustes``.
        n: number of bootstrap samples.

    Returns:
        The standard deviation, across the ``n`` samples, of the rotated ``v``
        (same shape as ``v0``).
    """
    import numpy as np
    n_observations = x.shape[0]
    rotated_factors = np.zeros((n,) + v0.shape)
    for draw in range(n):
        # sampling with replacement; redraw until no column of y is left empty
        every_column_sampled = False
        while not every_column_sampled:
            picks = np.random.randint(0, n_observations, n_observations)
            x_resampled = x[picks]
            y_resampled = y[picks]
            empty_columns = np.all(y_resampled == 0, axis=0)
            every_column_sampled = not np.any(empty_columns)
        u_boot, s_boot, v_boot = pls_(x_resampled, y_resampled)
        _, v_rotated = procrustes(u_boot, s_boot, v_boot, u0)
        rotated_factors[draw] = v_rotated
    return np.std(rotated_factors, axis=0)

# Re-run the PLS and the bootstrap with the functions defined above, on the same X and Y of `pls`.
# NOTE: this rebinds the globals `s` and `fig` used by other cells.
x = np.asarray(pls.X.values)
y = np.asarray(pls.Y.values)
u, s, v = pls_(x, y)
# NOTE(review): the result of this comparison is discarded (it is neither displayed nor asserted)
np.isclose(v.T, pls.v)
vs = bootstrap_test(x, y, v, u, PLS_NUM_BOOTSTRAP)
# ratio between each element of v and its bootstrap standard deviation, one column per region of X
vpd = pd.DataFrame((v/vs), columns=pls.X.columns)

# keep the regions whose ratio on the first row of `v` is, in absolute value, greater than 0.9
# NOTE(review): this hard-coded 0.9 (strict >) differs from the PLS_SALIENCE_THRESHOLD (>=)
# advertised in the title of the plot below — confirm which one is intended.
df = pd.DataFrame({"salience_score": vpd.T[vpd.T.iloc[:,0].abs() > 0.9][0]})
df.reset_index(inplace=True)
df = df.rename({"index": "acronym"}, axis=1)

# Plot PLS salience scores
fig = BraiAn.plot_salient_regions(df, brain_onthology,
                                    title=f"Salient regions (|score| >= {PLS_SALIENCE_THRESHOLD})",
                                    title_size=SALIENCE_TITLE_TEXT_SIZE, axis_size=SALIENCE_AXIS_TEXT_SIZE,
                                    use_acronyms=SALIENCE_USE_ACRONYMS, use_acronyms_in_mjd=SALIENCE_USE_ACRONYMS_IN_MJD,
                                    mjd_opacity=SALIENCE_MJD_BG_OPACITY,
                                    width=SALIENCE_WIDTH, barheight=SALIENCE_BARHEIGHT)
fig.show()

In [None]:
# Plot PLS salience scores of the regions above PLS_SALIENCE_THRESHOLD
fig = BraiAn.plot_salient_regions(salient_regions, brain_onthology,
                                    title=f"Salient regions (|score| >= {PLS_SALIENCE_THRESHOLD})",
                                    title_size=SALIENCE_TITLE_TEXT_SIZE, axis_size=SALIENCE_AXIS_TEXT_SIZE,
                                    use_acronyms=SALIENCE_USE_ACRONYMS, use_acronyms_in_mjd=SALIENCE_USE_ACRONYMS_IN_MJD,
                                    mjd_opacity=SALIENCE_MJD_BG_OPACITY,
                                    width=SALIENCE_WIDTH, barheight=SALIENCE_BARHEIGHT)

if SAVE_SALIENCE_SCORES_PLOT:
    # makedirs() rather than mkdir(): plots_output_path is a nested path, and mkdir() fails when
    # an intermediate directory is missing. exist_ok=True makes a separate existence check needless.
    os.makedirs(plots_output_path, exist_ok=True)
    plot_filename = f"PLS_{pls_salience_threshold_str}_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}{SAVED_PLOT_EXTENSION}".lower()
    plot_filepath = os.path.join(plots_output_path, plot_filename)
    if SAVED_PLOT_EXTENSION.lower() == ".html":
        # interactive plot; its "download plot" button is configured to export an SVG
        fig.write_html(plot_filepath, config=dict(toImageButtonOptions=dict(format="svg")))
    else:
        # static image, in the format given by the extension of the file
        fig.write_image(plot_filepath)

if SHOW_SALIENCE_SCORES_PLOT:
    fig.show()

In [None]:
# Export the data of both groups, restricted to the salient regions, as a comma-separated file
# (presumably laid out for GraphPad Prism — verify against BraiAn.as_prism_data)
prism_data = BraiAn.as_prism_data(brain_onthology, group_1, group_2)
# keep only the rows of the regions above the salience threshold
prism_data = prism_data.loc[salient_regions.acronym.array]
prism_file = f"prism_{comparison.dir}_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}_pls_above_{pls_salience_threshold_str}.csv"
# the levels of the index are swapped right before saving
BraiAn.save_csv(prism_data.swaplevel(), data_output_path, prism_file.lower(), sep=",", overwrite=True)

# Plotting

In [None]:
# Pie chart of ALL the regions that have a bootstrapped salience score
all_scored_regions = v_salience_scores.index.values
fig = BraiAn.plot_pie(all_scored_regions, brain_onthology,
                      use_acronyms=PIE_USE_ACRONYMS, hole=PIE_HOLE,
                      line_width=1, text_size=PIE_TEXT_SIZE)

if PIE_SAVE_PLOT:
    plot_filename = f"pls_all_regions_piechart_{comparison.dir}_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}{SAVED_PLOT_EXTENSION}".lower()
    plot_filepath = os.path.join(plots_output_path, plot_filename)
    if SAVED_PLOT_EXTENSION.lower() == ".html":
        # interactive plot; its "download plot" button is configured to export an SVG
        fig.write_html(plot_filepath, config=dict(toImageButtonOptions=dict(format="svg")))
    else:
        fig.write_image(plot_filepath)
if PIE_SHOW_PLOT:
    fig.show()

In [None]:
# Pie chart of the regions above the salience threshold only
salient_acronyms = salient_regions.acronym.array
fig = BraiAn.plot_pie(salient_acronyms, brain_onthology,
                      use_acronyms=PIE_USE_ACRONYMS, hole=PIE_HOLE,
                      line_width=1, text_size=PIE_TEXT_SIZE)

if PIE_SAVE_PLOT:
    plot_filename = f"pls_{pls_salience_threshold_str}_piechart_{comparison.dir}_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}{SAVED_PLOT_EXTENSION}".lower()
    plot_filepath = os.path.join(plots_output_path, plot_filename)
    if SAVED_PLOT_EXTENSION.lower() == ".html":
        # interactive plot; its "download plot" button is configured to export an SVG
        fig.write_html(plot_filepath, config=dict(toImageButtonOptions=dict(format="svg")))
    else:
        fig.write_image(plot_filepath)
if PIE_SHOW_PLOT:
    fig.show()

In [None]:
# Bar plot of the two groups over the salient regions.
# Each group is drawn with the default Plotly color found at the index given by its id.
group_colors = (DEFAULT_PLOTLY_COLORS[group.id] for group in (comparison.group1, comparison.group2))
fig = BraiAn.plot_groups(
    brain_onthology, group_1, group_2,
    selected_regions=salient_regions.acronym.array,
    plot_title=BAR_TITLE, title_size=BAR_TITLE_TEXT_SIZE,
    axis_size=BAR_AXIS_TEXT_SIZE, animal_size=BAR_ANIMAL_SIZE,
    use_acronyms=BAR_USE_ACRONYMS, colors=group_colors,
    width=BAR_WIDTH, barheight=BAR_HEIGHT, bargap=0.3, bargroupgap=0.0,
)

if BAR_SAVE_PLOT:
    plot_filename = f"pls_{pls_salience_threshold_str}_barplot_{comparison.dir}_{group_1.markers[0]}_{str(group_1.metric)}_{regions_to_plot_selection_method_str}{SAVED_PLOT_EXTENSION}".lower()
    plot_filepath = os.path.join(plots_output_path, plot_filename)
    if SAVED_PLOT_EXTENSION.lower() == ".html":
        # interactive plot; its "download plot" button is configured to export an SVG
        fig.write_html(plot_filepath, config=dict(toImageButtonOptions=dict(format="svg")))
    else:
        fig.write_image(plot_filepath)
if BAR_SHOW_PLOT:
    fig.show()

In [None]:
import importlib
# Reload every module already imported whose name starts with "BraiAn"
# (presumably so that edits to the package are picked up without restarting the kernel).
# The loop runs over a snapshot because reloading may change sys.modules.
__imported_modules = sys.modules.copy()
for module_name, module in __imported_modules.items():
    if module_name.startswith("BraiAn"):
        try:
            importlib.reload(module)
        except ModuleNotFoundError:
            # modules that cannot be found again are left as they are
            pass