In [None]:
import BraiAn
import pandas as pd
import os

## Load data

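Below, you have to specify:
- ```group_1_name```: A meaningful string for Group 1.
- ```group_2_name```: A meaningful string for Group 2.
- ```group_1_names```: A list of names of the folders corresponding to animals in **Group 1** (e.g., Control group). Note that the results must be stored in an individual folder for each animal.
- ```group_2_names```: A list of names of the folders corresponding to animals in **Group 2** (e.g., Stress group).
- ```experiment```: The name of the experiment folder; data are read from and written to `./data/experiments/<experiment>/`.
- ```output_folder```: The name of the sub-folder, specific to this comparison, in which the PLS results and the plots are saved.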

In [None]:
# Comparison "C-S": Control vs Stress (group names without a sex suffix).
# This cell and the two configuration cells after it assign the same four names,
# so whichever of them is executed last decides what the rest of the notebook analyses.
experiment, output_folder = "soumnya", "C-S"
group_1_name, group_2_name = "Control", "Stress"

In [None]:
# Comparison "CF-SF": Control (Females) vs Stress (Females).
# Assigns the same four names as the neighbouring configuration cells;
# the last of these cells to be executed wins.
experiment, output_folder = "soumnya", "CF-SF"
group_1_name, group_2_name = "Control (Females)", "Stress (Females)"

In [None]:
# Comparison "CM-SM": Control (Males) vs Stress (Males).
# Assigns the same four names as the two configuration cells before it, so on a
# top-to-bottom run this is the configuration that is actually used.
experiment, output_folder = "soumnya", "CM-SM"
group_1_name, group_2_name = "Control (Males)", "Stress (Males)"

In [None]:
# ####################################### SET PARAMETERS ####################################

# Folder the per-group CSV files are read from.
data_input_path = f"./data/experiments/{experiment}/BraiAn_norm_output/"
# Folders the PLS tables and the plots are written to (one sub-folder per comparison).
data_output_path = f"./data/experiments/{experiment}/BraiAn_PLS_output/{output_folder}/"
plots_output_path = f"./plots/{experiment}/{output_folder}/"


# ###########################################################################################


# exist_ok=True already makes the call a no-op for an existing directory, so no prior
# os.path.exists() check is needed. Dropping the check also removes the check-then-create
# race and makes a regular file sitting at the path fail here (FileExistsError) instead of
# later, when the first result is written.
for output_dir in (data_output_path, plots_output_path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read both groups from data_input_path; the CSV file name embeds the group name.
group_1, group_2 = (
    BraiAn.AnimalGroup.from_csv(name, data_input_path, f"results_cell_counts_{name}.csv")
    for name in (group_1_name, group_2_name)
)

# Stop early if the two groups cannot be compared with each other.
if not group_1.is_comparable(group_2):
    raise ImportError(
        "Group 1 and Group 2 are not comparable!\n"
        "Please check that you're reading two groups that normalized on the same brain regions and on the same marker"
    )

The data are stored in ```group_1.data``` and ```group_2.data```:

In [None]:
# Display the data loaded for group 1 (group_2.data holds the equivalent for group 2).
group_1.data

In [None]:
# Allen Mouse Brain ontology (StructureGraph id=1), downloaded as described at
# https://help.brain-map.org/display/api/Downloading+an+Ontology%27s+Structure+Graph
path_to_allen_json = "./data/AllenMouseBrainOntology.json"

# Branches of the ontology that are left out when the hierarchy is built.
branches_to_exclude = ["retina", "VS", "grv", "fiber tracts", "CB"]
AllenBrain = BraiAn.AllenBrainHierarchy(path_to_allen_json, branches_to_exclude)

# Only referenced by the disabled alternatives listed below.
level = 6

# Disabled alternatives for choosing the regions to analyse:
#
#   by structural level
#     AllenBrain.unselect_all()
#     AllenBrain.select_at_structural_level(level)
#     selected_regions = AllenBrain.get_selected_regions()
#     print(f"You selected {len(selected_regions)} regions at level {level}.")
#
#   by depth
#     AllenBrain.unselect_all()
#     AllenBrain.select_at_depth(level)
#     selected_regions = AllenBrain.get_selected_regions()
#     print(f"You selected {len(selected_regions)} regions at depth {level}.")
#
# Active choice: the regions listed in the Summary Structures CSV.
# (A preceding AllenBrain.unselect_all() call is disabled here as well.)
AllenBrain.select_from_csv("./data/AllenSummaryStructures.csv")
selected_regions = AllenBrain.get_selected_regions()
n_selected = len(selected_regions)
print(f"You selected {n_selected} Summary Structure regions.")

# Partial Least Squares  

The analysis done below is taken from the tutorial written by [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074).  
Run the 2 cells below to get started.

In [None]:
# PLS settings.
# `rank` is passed to the bootstrap of the salience scores further down.
rank = 1
# Call group_1.get_normalization_methods() to list the methods available for group 1.
normalization = "RelativeDensity"

# Build the PLS object from both groups, restricted to the selected regions.
pls = BraiAn.PLS(group_1, group_2, selected_regions, normalization)

# Display the matrix X.
pls.X

In [None]:
# Export a transposed copy of X for the cross correlation analysis;
# pls.X itself is left untouched because we work on a deep copy.
ds = pls.X.copy(deep=True).T
ds.index.name = "BrainRegion"

# File name: <marker>_<normalization>.csv, both lower-cased.
cross_correlation_csv = os.path.join(
    data_output_path,
    f"{group_1.marker.lower()}_{normalization.lower()}.csv",
)
ds.to_csv(cross_correlation_csv, sep=";", decimal=".", index=True)

In [None]:
# Show the matrix Y
# One indicator column per distinct value in pls.y, relabelled with the group names.
# NOTE(review): the rename assumes pls.y codes Group 2 as 0 and Group 1 as 1 — confirm against BraiAn.PLS.
pd.get_dummies(pls.y).rename(columns={0: group_2_name, 1: group_1_name})

The two matrices printed above (X and Y) illustrate the data on which the PLS is done.  
- ```X:``` The rows in this matrix are the mice. The columns in the matrix are the regions selected for analysis. The values in the matrix are the **normalized values of marked cells in that region relative to the whole brain.**
The normalization method is one of:
  + Density
  + Percentage (on the total number of detected marked cells outside of excluded regions)
  + RelativeDensity
- ```Y:``` The rows in this matrix are the mice. The columns in the matrix are the 2 groups. **A value in this matrix is 1 if the mouse belongs to the specified group, and 0 otherwise**.

In brief, PLS analyzes the relationship (correlation) between the columns of ```X``` and ```Y```. In our specific case, there will be 2 important outputs:
- **Salience scores**: Each brain region has a salience score. A high salience score means that the brain region explains much of the correlation between ```X``` and ```Y```.  
- **Singular values**: These are the singular values of the correlation matrix $R = Y^TX$, i.e. the square roots of the eigenvalues of $R^TR$.

## Random permutations to see whether we can differentiate signal from noise. 
Here, we randomly shuffle the group to which a mouse belongs, and calculate the singular values of the permuted dataset.  
From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> The set of all the (permuted) singular values provides a sampling distribution of the singular values under the null hypothesis and, therefore can be used as a null hypothesis test.

*Note: running the cell below will take a few minutes.*

In [None]:
# Number of random permutations used to build the null distribution of the singular values.
num_permutations = 5000
print(f"Randomly permuting singular values {num_permutations} times...")
# NOTE(review): no random seed is set anywhere in this notebook, so the permuted values
# presumably differ between runs — confirm whether BraiAn seeds its generator internally.
s,singular_values = pls.randomly_permute_singular_values(num_permutations)
# Plot distribution of singular values
# NOTE(review): the plot uses pls.s[0] while the p-value cell uses s[0] returned above —
# presumably the same observed singular value; confirm.
fig = BraiAn.plot_permutation(pls.s[0], singular_values, num_permutations)
fig.show()

In [None]:
# Permutation p-value: share of permutations whose first singular value is strictly
# larger than s[0] (presumably the observed first singular value — confirm).
# NOTE(review): with a strict ">" and no +1 correction this estimate can be exactly 0;
# permutation tests commonly report (count + 1) / (num_permutations + 1) — confirm intent.
exceed_count = (singular_values[:, 0] > s[0]).sum()
p = exceed_count / num_permutations
print("p-value =", p)

## Bootstrap to identify stable salience scores

Here, we use [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) (= sampling of the mice in the dataset, with replacement) to get an estimate of which salience scores are stable.

From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> When a vector of saliences is considered generalizable and is kept for further analysis, we need to identify its elements that are stable through resampling. In practice, the stability of an element is evaluated by dividing it by its standard error. [...] To estimate the standard errors, we create bootstrap samples which are obtained by sampling with replacement the observations in ```X``` and ```Y``` (Efron and Tibshirani, 1986). A salience standard error is then estimated as the standard error of the saliences from a large number of these bootstrap samples (say 1000 or 10000). **The ratios are akin to a Z-score, therefore when they are larger than 2 the corresponding saliences are considered significantly stable.**

*Note: Running the cell below will take a few minutes.*

In [None]:
# Number of bootstrap samples used to estimate the stability of the salience scores.
num_bootstrap = 5000
print(f"Bootstrapping salience scores {num_bootstrap} times...")
# `rank` comes from the PLS settings cell. Only v_salience_scores is used afterwards
# in this notebook (it is saved to CSV further down).
# NOTE(review): no random seed is set in this notebook — presumably the result varies between runs; confirm.
u_salience_scores,v_salience_scores = pls.bootstrap_salience_scores(rank, num_bootstrap)
print("Done!")

In [None]:
# Plot PLS salience scores
# The figure is written under plots_output_path with the file name built below.
pls_salience_threshold = 1.2 # Only brain regions with a salience higher than pls_salience_threshold are shown. 2 is the significance threshold.
file_title = f"PLS_{group_1.marker}_{normalization}.png"
# salient_regions is post-processed and saved by the next cell; tp is not used again in this notebook.
tp, salient_regions = pls.plot_salience_scores(pls_salience_threshold, plots_output_path, file_title,
                              fig_width=1000, fig_height=2000)

In [None]:
# Turn the salient regions into a two-column table (region, |salience|),
# sorted by ascending absolute salience, then save and display it.
# NOTE(review): salient_regions is overwritten with the reshaped table, so this cell is
# meant to run once after the plotting cell; re-running it alone reshapes the table again.
salience_table = salient_regions.reset_index()
salience_table.columns = ["region", "salience"]
salience_table["salience"] = salience_table["salience"].abs()
salient_regions = salience_table.sort_values(by="salience")

salient_regions_csv = os.path.join(data_output_path, "salient_regions.csv")
salient_regions.to_csv(salient_regions_csv, sep=";", index=False)
salient_regions

In [None]:
# Label the single score column (named 0) before saving.
v_salience_scores = v_salience_scores.rename(columns={0: "salience score"})

# Save the scores next to the other PLS outputs, replacing any previous file.
pls_filename = f"PLS_{group_1.marker}_{normalization}_salience_scores.csv"
BraiAn.save_csv(v_salience_scores, data_output_path, pls_filename, overwrite=True)

# Plotting

In [None]:
# Cut-offs handed to the chord plots below: p_cutoff for the p-values,
# r_cutoff for the correlation coefficients.
p_cutoff, r_cutoff = 0.05, 0.9

In [None]:
# Regions to plot, derived from the PLS result and the salience threshold.
regions_to_plot = BraiAn.regions_to_plot(pls, pls_salience_threshold)

# Bar plot of both groups restricted to those regions.
fig = BraiAn.plot_groups(
    normalization, AllenBrain, group_1, group_2,
    selected_regions=regions_to_plot,
    use_acronyms=False,
    height=5000,
)
fig.show()

# Save the figure alongside the other plots of this comparison.
file_title = f"barplot_{group_1.marker}_{normalization}_{group_1.name}_vs_{group_2.name}.png"
barplot_path = os.path.join(plots_output_path, file_title)
fig.write_image(barplot_path)

In [None]:
# Cross-correlation result of each group, in the order (group_1, group_2).
# Every entry is the (r, p) pair returned by group.cross_correlation().
groups_cross_correlations = []
for group in (group_1, group_2):
    # min_animals=None because it doesn't matter. PLS already removes every region with NaNs.
    # The pair is bound to dedicated names: a for-loop's names outlive the loop, and using
    # `p` here would overwrite the permutation-test p-value `p` computed earlier.
    corr_r, corr_p = group.cross_correlation(normalization, regions_to_plot, min_animals=None)
    groups_cross_correlations.append((corr_r, corr_p))

In [None]:
# Draw one chord plot per group from its cross-correlation result.
# zip() yields (group, (r, p)) pairs, so the inner pair needs its own parenthesised
# target; unpacking into three flat names raises
# "ValueError: not enough values to unpack (expected 3, got 2)".
# Dedicated names (instead of r, p) keep the permutation-test p-value `p` intact.
# NOTE(review): the title reads "p >= {p_cutoff}"; if draw_chord_plot keeps the
# correlations whose p-value is below the cut-off, it should read "p <=" — confirm.
for group, (corr_r, corr_p) in zip((group_1, group_2), groups_cross_correlations):
    fig = BraiAn.draw_chord_plot(corr_r, corr_p, r_cutoff, p_cutoff, AllenBrain,
                                ideograms_a=50,
                                title=f"{group.name} connectomics graph from Pearson correlation (|r| >= {r_cutoff}, p >= {p_cutoff})",
                                size=1200,
                                no_background=False,
                                annotation1="This is the first annotation",
                                annotation2="This is the second annotation",
                                annotation3="This is the third annotation")
    fig.show()