In [None]:
import BraiAn

We also need a few other Python libraries to read and manipulate the data and to make plots:

In [None]:
import pandas as pd
import numpy as np
import os

# import matplotlib.pyplot as plt
import plotly.graph_objects as go

## Load data

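Below, you have to specify:
- ```animals_root```: Path to the folder that contains the animal folders. It is necessary to store the results in individual folders for each animal.
- ```group_1_name```: A meaningful string for Group 1 (e.g., Control group).
- ```group_2_name```: A meaningful string for Group 2 (e.g., Stress group).
- ```data_input_path```: Path to the folder that contains the normalized results, one file per group named ```results_cell_counts_<group name>.csv```.
- ```data_output_path```: Path to the folder where the PLS results are saved. It is created if it does not exist.
- ```plots_output_path```: Path to the folder where the plots are saved. It is created if it does not exist.

You do not have to set the following two variables by hand; they are read from the input CSVs in the cell after the parameters:
- ```group_1_names```: The names of the animals in **Group 1**.
- ```group_2_names```: The names of the animals in **Group 2**.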

In [None]:
# ------------------------------- User-editable parameters -------------------------------

# Folder that contains the animal folders, and the labels of the two groups.
animals_root = './data/QuPath_output/'
group_1_name, group_2_name = 'Control', 'Stress'

# Folder the per-group result CSVs are read from, and folders results/plots are written to.
data_input_path = './data/python_norm_output/'
data_output_path = './data/python_PLS_output/'
plots_output_path = './plots/python_output/'

# -----------------------------------------------------------------------------------------

# Make sure both output folders exist before any later cell writes into them.
for output_dir in (data_output_path, plots_output_path):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

In [None]:
def _read_group_results(group_name):
    """Read the tab-separated results CSV of one group from `data_input_path`.

    The file is parsed with a two-row column header and a two-column row index.
    """
    csv_path = os.path.join(data_input_path, f'results_cell_counts_{group_name}.csv')
    return pd.read_csv(csv_path, sep='\t', header=[0, 1], index_col=[0, 1])


group_1_results = _read_group_results(group_1_name)
group_2_results = _read_group_results(group_2_name)

# The second level of the row index provides the names collected for each group.
group_1_names = {row_key[1] for row_key in group_1_results.index}
group_2_names = {row_key[1] for row_key in group_2_results.index}

# The first level of the column header provides the tracer(s) present in each file.
group_1_tracers = list({col_key[0] for col_key in group_1_results.columns})
group_2_tracers = list({col_key[0] for col_key in group_2_results.columns})

# Both files must describe exactly one tracer, and it must be the same one.
assert len(group_1_tracers) == len(group_2_tracers) == 1, "The CSVs in input should have data for one tracer only."
assert group_1_tracers[0] == group_2_tracers[0], (
    "The data in the CSV of group 1 and 2 should refer to the same tracer. "
    f"Instead group 1 CSV presents data for the tracer '{group_1_tracers[0]}', while group 2 CSV for '{group_2_tracers[0]}'."
)
tracer = group_1_tracers[0]

The data are stored in ```group_1_results``` and ```group_2_results```:

In [None]:
# Display the results table of group 1 (the last expression of a cell is rendered as its output).
group_1_results

In [None]:
# Build the brain-region hierarchy from the Allen ontology JSON.
# from https://help.brain-map.org/display/api/Downloading+an+Ontology%27s+Structure+Graph
# StructureGraph id=1
path_to_allen_json = "./data/AllenMouseBrainOntology.json"

# Branches handed to AllenBrainHierarchy as the ones to exclude.
branches_to_exclude = ['retina','VS','grv','fiber tracts']
AllenBrain = BraiAn.AllenBrainHierarchy(path_to_allen_json, branches_to_exclude)

# Three selections are made in turn. `selected_regions` is overwritten each time, so the
# first two only report how many regions they yield; the last one is what later cells use.
level = 6

# 1) Selection at structural level `level`.
AllenBrain.select_at_structural_level(level)
selected_regions = AllenBrain.get_selected_regions()
print(f'You selected {len(selected_regions)} regions at level {level}.')

# 2) Selection at depth `level`, after clearing the previous selection.
AllenBrain.unselect_all()
AllenBrain.select_at_depth(level)
selected_regions = AllenBrain.get_selected_regions()
print(f'You selected {len(selected_regions)} regions at depth {level}.')

# 3) Selection read from the Summary Structures CSV, after clearing the previous selection.
#    This is the value of `selected_regions` passed to the PLS below.
AllenBrain.unselect_all()
AllenBrain.select_from_csv("./data/AllenSummaryStructures.csv")
selected_regions = AllenBrain.get_selected_regions()
print(f'You selected {len(selected_regions)} Summary Structure regions.')

# Partial Least Squares  

The analysis done below is taken from the tutorial written by [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074).  
Run the 2 cells below to get started.

In [None]:
# PLS
# `normalization` is passed to BraiAn.PLS and is also the second-level column key
# read from the results tables in the bar-plot cell further down.
normalization = 'Density' # 'Density', 'Percentage' or 'RelativeDensity'
# `rank` is passed to the bootstrap of the salience scores further down.
rank = 1

# Create a PLS object
# TODO: see what happens if analyze() does not include the NaN rows
cfosPLS = BraiAn.PLS(group_1_results, group_2_results, group_1_names, group_2_names, selected_regions, tracer, normalization)

# Show the matrix X
cfosPLS.X

In [None]:
# Show the matrix Y
# One-hot encode the group labels stored in cfosPLS.y and name the resulting columns.
# NOTE(review): the rename assumes label 0 means group 2 and label 1 means group 1 —
# confirm against how BraiAn.PLS builds `y`.
pd.get_dummies(cfosPLS.y).rename(columns={0: group_2_name, 1: group_1_name})

The two matrices printed above (X and Y) illustrate the data on which the PLS is done.  
- ```X:``` The rows in this matrix are the mice. The columns in the matrix are the regions selected for analysis. The values in the matrix are the **normalized value of marked cells in that region relative to the whole brain.** 
The normalization methods are either:
  + Density
  + Percentage (on the total number of detected marked cells outside of excluded regions)
  + RelativeDensity
- ```Y:``` The rows in this matrix are the mice. The columns in the matrix are the 2 groups. **A value in this matrix is 1 if the mouse belongs to the specified group, and 0 otherwise**.

In brief, PLS analyzes the relationship (correlation) between the columns of ```X``` and ```Y```. In our specific case, there will be 2 important outputs:
- **Salience scores**: Each brain region has a salience score. A high salience score means that the brain region explains much of the correlation between ```X``` and ```Y```.  
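- **Singular values**: These are the singular values of the correlation matrix $R = Y^TX$, i.e. the diagonal entries of $\Delta$ in its singular value decomposition $R = U \Delta V^T$ (the square roots of the eigenvalues of $R^TR$).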

## Random permutations to see whether we can differentiate signal from noise. 
Here, we randomly shuffle the group to which a mouse belongs, and calculate the singular values of the permuted dataset.  
From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> The set of all the (permuted) singular values provides a sampling distribution of the singular values under the null hypothesis and, therefore can be used as a null hypothesis test.

*Note: running the cell below will take a few minutes.*

In [None]:
# Number of random permutations requested from the PLS object.
num_permutations = 5000
print(f'Randomly permuting singular values {num_permutations} times ...')
# `s[0]` and `singular_values[:, 0]` are compared in the p-value cell below.
s, singular_values = cfosPLS.randomly_permute_singular_values(num_permutations)
print('Done!\n')

In [None]:
# Plot the distribution of the permuted first singular values (sampling distribution
# under H0) together with the first singular value `s[0]` of the experiment, i.e. the
# two quantities compared in the p-value cell below.
fig_permutations = go.Figure()

fig_permutations.add_trace(go.Histogram(
    x=singular_values[:, 0],
    nbinsx=10,
    name=f'Sampling distribution<br>under H0 ({num_permutations} permutations)',
))

# Vertical red line at the experimental value; yref='paper' makes it span the full plot height.
fig_permutations.add_shape(
    type='line',
    x0=s[0], x1=s[0],
    y0=0, y1=1, yref='paper',
    line=dict(color='red', width=2),
)
fig_permutations.add_annotation(
    x=s[0], y=1, yref='paper',
    text='Experiment',
    showarrow=False,
    yanchor='bottom',
)

fig_permutations.update_layout(
    xaxis_title='First singular value',
    yaxis_title='Frequency',
    showlegend=True,
    width=1000, height=400,
)

fig_permutations.show()

In [None]:
# Calculate p-value = Probability(experiment | H0)
# It is the fraction of permuted first singular values that are strictly larger than the
# first singular value `s[0]` of the experiment. The fraction is taken over the permutations
# actually stored in `singular_values`, so it stays correct even if `num_permutations` was
# edited without re-running the permutation cell.
p = np.mean(singular_values[:, 0] > s[0])
print('p-value = '+str(p))

## Bootstrap to identify stable salience scores

Here, we use [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) (= sampling of the mice in the dataset, with replacement) to get an estimate of which salience scores are stable.

From [Krishnan et al.](https://www.sciencedirect.com/science/article/pii/S1053811910010074):  
> When a vector of saliences is considered generalizable and is kept for further analysis, we need to identify its elements that are stable through resampling. In practice, the stability of an element is evaluated by dividing it by its standard error. [...] To estimate the standard errors, we create bootstrap samples which are obtained by sampling with replacement the observations in $X$ and $Y$ (Efron and Tibshirani, 1986). A salience standard error is then estimated as the standard error of the saliences from a large number of these bootstrap samples (say 1000 or 10000). **The ratios are akin to a Z-score, therefore when they are larger than 2 the corresponding saliences are considered significantly stable.**

*Note: Running the cell below will take a few minutes.*

In [None]:
# Number of bootstrap samples requested from the PLS object.
num_bootstrap = 5000
print(f'Bootstrapping salience scores {num_bootstrap} times...')
# `rank` was set in the cell that created the PLS object.
# `v_salience_scores` is the result that gets saved to disk further down.
u_salience_scores,v_salience_scores = cfosPLS.bootstrap_salience_scores(rank, num_bootstrap)
print('Done!')

In [None]:
# Plot PLS salience scores
# The plot call receives the output folder and file name built here, plus the Allen
# region dictionary. Its second return value, `salient_regions`, is turned into a table
# and saved as CSV in the next cell.
plot_threshold = 1.2 # Only brain regions with a salience higher than plot_threshold are shown. 2 is the significance threshold.
file_title = f'PLS_{tracer}_{normalization}.png'
tp, salient_regions = cfosPLS.plot_salience_scores(plot_threshold, plots_output_path, file_title, AllenBrain.brain_region_dict,
                              fig_width=1000, fig_height=2000)

In [None]:
# Build a two-column table (region, absolute salience) sorted by salience, and save it.
# The table gets its own name instead of overwriting `salient_regions`: the original
# rebinding made the cell fail when run twice, because a second reset_index() adds a
# third column and the two-name column assignment no longer fits.
salient_regions_table = salient_regions.reset_index()
salient_regions_table.columns = ['region', 'salience']
salient_regions_table['salience'] = salient_regions_table['salience'].abs()
salient_regions_table = salient_regions_table.sort_values(by='salience')
salient_regions_table.to_csv(os.path.join(data_output_path, 'salient_regions.csv'), sep=';', index=False)
salient_regions_table

In [None]:
def save_results(results_df, output_path, filename):
    """Write `results_df` as a tab-separated CSV file to `output_path/filename`.

    The output folder is created, including any missing parent folders, when it
    does not exist yet. An existing file with the same name is overwritten.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Table to save (any object with a compatible ``to_csv`` method works).
    output_path : str
        Folder in which the file is written.
    filename : str
        Name of the CSV file inside `output_path`.

    Returns
    -------
    bool
        Always ``True``.
    """
    if not(os.path.exists(output_path)):
        # makedirs, unlike mkdir, also creates missing parent folders, and
        # exist_ok avoids a failure if the folder appears in the meantime.
        os.makedirs(output_path, exist_ok=True)
        print('\nCreated a new results_python folder '+output_path+' \n')
    else:
        print('\n! A results_python folder already existed in root. I am overwriting previous results!\n')

    results_df.to_csv(os.path.join(output_path, filename), sep='\t', mode='w')

    print('Results are saved in '+output_path)
    print('\nDone!')
    return True

# Save the bootstrapped salience scores (column 0 renamed to 'salience score') as a
# tab-separated file in the PLS output folder.
pls_filename = f'PLS_{tracer}_{normalization}_salience_scores.csv'
save_results(v_salience_scores.rename(columns={0:'salience score'}), data_output_path, pls_filename)

# Plot normalized cell counts per region

In [None]:
# Horizontal bar plot of the per-region group means (with SEM error bars) of the chosen
# `normalization`, plus one marker per animal drawn next to the bar of its group.
# All regions used in the PLS (the columns of cfosPLS.X) are plotted.
#
# In this case we wanted to normalize it based on the density, rather than the Percentage.
# NOTE: the x-axis title below is a fixed string and was not adapted to the normalization.

y_axis_label = 'region_names' # change this to 'acronym' to have acronyms on the y-axis


def _per_region_stats(group_results):
    """Return (per-animal values, per-region mean, per-region SEM) for one group.

    Values are read from the (tracer, normalization) column of the group's results
    and grouped by the first level of the row index.
    """
    per_animal = pd.DataFrame(group_results[(tracer, normalization)].rename('cell counts'))
    by_region = per_animal.reset_index().groupby('level_0')
    return per_animal, by_region.mean(numeric_only=True), by_region.sem(numeric_only=True)


def _animal_points(group_df, offset):
    """Return (x, y) of the per-animal markers of one group.

    y is the position of the animal's region in `regs_to_plot`, shifted by `offset`.
    """
    animals = group_df.loc[regs_to_plot].reset_index()
    y_positions = animals['level_0'].map(region_position).to_numpy() + offset
    return animals['cell counts'], y_positions


# Calculate mean values
group_1_df, group_1_avg, group_1_sem = _per_region_stats(group_1_results)
group_2_df, group_2_avg, group_2_sem = _per_region_stats(group_2_results)

# Determine which regions to plot
regs_to_plot = cfosPLS.X.columns.to_list()

# Bar i belongs to regs_to_plot[i]. Markers and tick labels use the same explicit mapping,
# so they stay aligned with the bars whatever row order the per-animal tables come back in
# (the original relied on pd.factorize order of appearance, and took the labels from group 2 only).
region_position = {region: position for position, region in enumerate(regs_to_plot)}

if y_axis_label == 'region_names':
    ticklabels = [AllenBrain.brain_region_dict[reg] for reg in regs_to_plot]
else:
    ticklabels = regs_to_plot

# (label, per-animal values, mean, SEM, marker colour, vertical marker offset) for each group
plot_groups = (
    (group_1_name, group_1_df, group_1_avg, group_1_sem, 'rgb(0,255,0)', -0.2),
    (group_2_name, group_2_df, group_2_avg, group_2_sem, 'rgb(255,0,0)', +0.2),
)

fig = go.Figure()

# Barplot (only x is given, so the bars are horizontal and sit at y = 0, 1, 2, ...)
for group_label, _, group_avg, group_sem, _, _ in plot_groups:
    fig.add_trace(go.Bar(
        x = group_avg.loc[regs_to_plot]['cell counts'],
        name = f'{group_label} mean',
        error_x = dict(
            type='data',
            array=group_sem.loc[regs_to_plot]['cell counts']
        )
    ))

# The colorway colours the two bar traces in the order they were added.
fig.update_layout(barmode='group', colorway=[group_color for *_, group_color, _ in plot_groups])

# Scatterplot (animals)
for group_label, group_df, _, _, group_color, marker_offset in plot_groups:
    points_x, points_y = _animal_points(group_df, marker_offset)
    fig.add_trace(go.Scatter(
        mode = 'markers',
        y = points_y,
        x = points_x,
        name = f'{group_label} animals',
        opacity=0.5,
        marker=dict(
            color=group_color,
            size=5,
            line=dict(
                color='rgb(0,0,0)',
                width=1
            )
        )
    ))

# Figure title (empty for an unknown normalization)
titles_by_normalization = {
    'RelativeDensity': f'[#{tracer} / area] / [{tracer} (brain) / area (brain)].',
    'Density': f'[#{tracer} / area]',
    'Percentage': f'[#{tracer} / brain]',
}
title = titles_by_normalization.get(normalization, '')

# Update layout
fig.update_layout(
    title = title,
    yaxis = dict(
        tickmode = 'array',
        tickvals = np.arange(0,len(regs_to_plot)),
        ticktext = ticklabels
    ),
    xaxis=dict(
        title = f'{tracer} density (relative to brain)'
    ),
    width=900, height=5000,
    hovermode="x unified",
    yaxis_range = [-1,len(regs_to_plot)+1]
)

fig.show()

# Save figure as PNG
file_title = 'barplot_' + tracer + '_' + normalization + 'CvS.png'
fig.write_image(os.path.join(plots_output_path, file_title))