# Import required packages

In [1]:
import imc_import
import utils
import pop_id

# pip install sc3s
import sc3s

from utils import adlog
import scanpy as sc
import anndata as ad

from pathlib import Path
import os

# Matplotlib and seaborn for plotting
import matplotlib
from matplotlib import rcParams
from matplotlib import colors
from matplotlib import cm
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, Normalize
import seaborn as sb

# For access to good colour maps
import colorcet as cc

# Default size for matplotlib figures - increase these numbers for larger figures
plt.rcParams.update({'figure.figsize': (5, 5)})

# Scanpy settings: verbosity level, then figure defaults
# (increase the DPI values for better resolution figures)
sc.settings.verbosity = 3
sc.set_figure_params(figsize=(5, 5), dpi=100, dpi_save=200)

In [3]:
from SpatialBiologyToolkit import population_identification as pop_id

# 1. Read AnnData

Load the AnnData from the first notebook

In [None]:
# Read the AnnData file saved by the first notebook
fresh_import_file = 'adata_freshimport.h5ad'
adata = ad.read_h5ad(fresh_import_file)

In case of crash, reload the latest backup of the AnnData here

In [None]:
# Reload the latest backup copy of the AnnData (only needed after a crash)
backup_file = 'adata_temp.h5ad'
adata = ad.read_h5ad(backup_file)

# 2. Batch correction
This will batch correct using BBKNN. There are other options for batch correction, such as Harmony, but I found this the easiest to implement. Read more here: https://bodenmillergroup.github.io/IMCDataAnalysis/batch-effects.html

#### Run PCA and BBKNN

<font color='blue'>**batch_correction_obs** -This defines which .obs should be used to identify the different batches, in the example here it is 'Case'</font>

In [None]:
# IPython help syntax: the trailing '?' displays the documentation for this function
pop_id.batch_neighbors?

In [None]:
# Batch correct with BBKNN, using the 'Case' .obs to identify the different batches
batch_settings = {'correction_method': 'bbknn', 'batch_correction_obs': 'Case'}
pop_id.batch_neighbors(adata, **batch_settings)

# 3. UMAPs

#### Calculate UMAP

In [None]:
# Log the start of the UMAP calculation, run it, then log that it finished.
# The log messages are plain strings: they contain no placeholders, so no f-string is needed.
# NOTE(review): save=True presumably makes adlog write a backup of the AnnData - confirm in utils.adlog
adlog(adata, 'Starting calculating UMAP', sc)
sc.tl.umap(adata)
adlog(adata, 'Finished calculating UMAP', sc, save=True)

#### Plot UMAPs
<font color='blue'>You can add extra .obs to **UMAP_groups** to colour the UMAPs by, e.g. treatment</font>

In [None]:
# Folder that receives the UMAP images (created if it does not exist yet)
figure_dir = Path('Figures') / 'UMAPs'
figure_dir.mkdir(parents=True, exist_ok=True)

#Define a list of .obs to colour the UMAP by
UMAP_groups = adata.uns['categorical_obs'].tolist() + ['ROI']

# One entry per output image: file name -> the plot options specific to that image
#   - Categorical_UMAPS.png: one UMAP per .obs in the list above
#   - Marker_UMAPS.png: one UMAP for each of the individual markers
umap_plots = {
    'Categorical_UMAPS.png': dict(color=UMAP_groups, ncols=1),
    'Marker_UMAPS.png': dict(color=adata.var_names.tolist(), color_map='plasma', ncols=4),
}

# Draw each set of UMAPs and save it into the figure folder
for file_name, plot_options in umap_plots.items():
    fig = sc.pl.umap(adata, size=10, return_fig=True, **plot_options)
    fig.savefig(figure_dir / file_name, bbox_inches='tight', dpi=300)

# 4. Initial population identification using Leiden clustering
This is essentially the same graph-based clustering approach used by PhenoGraph, and gives more or less the same results

#### Leiden clustering

<font color='blue'>**resolution** - Change this to alter the size of the clusters - a smaller resolution results in bigger clusters (therefore fewer clusters overall, but each with more cells). </font>

Feel free to re-run this with several different resolutions, to see which looks best!

In [None]:
# IPython help syntax: the trailing '?' displays the documentation for this function
pop_id.leiden?

In [None]:
# IPython help syntax: the trailing '?' displays the documentation for this function
pop_id.population_summary?

In [None]:
# Run the Leiden clustering first, so that its result exists in adata.obs
pop_id.leiden(adata,
              resolution=0.3)

# Pick the .obs to summarise AFTER clustering: this is the last .obs to be added.
# Looking it up before pop_id.leiden runs would select whichever column was last beforehand.
# NOTE(review): assumes pop_id.leiden adds its result as the final .obs column - confirm;
# alternatively put the actual name of the leiden group here (e.g. 'leiden_0.3')
obs_to_show = adata.obs.columns.tolist()[-1]

# Summarise the populations in that .obs, split by 'Case'
pop_id.population_summary(adata,
                          categorical_obs='Case',
                          groupby_obs=obs_to_show)

#### <font color='orange'>OPTIONAL - Sub clustering</font>

Once you've done the initial clustering, which could be with a fairly low resolution, you can then do subclustering on specific clusters from previous Leiden analyses

<font color='blue'>**resolution** - As above. Change this to alter the size of the subclusters of the previous analysis </font>


In [None]:
# Sub-cluster only selected groups from a previous Leiden analysis
parent_leiden_obs = 'leiden_0.3'  # Specify the existing .obs to restrict the subclustering to
groups_to_split = ['1']  # Specify the subgroups to use from the above .obs

pop_id.leiden(
    adata,
    resolution=0.6,
    restrict_to_existing_leiden=parent_leiden_obs,
    existing_leiden_groups=groups_to_split,
)

# Summarise using the last column currently in .obs
newest_obs = adata.obs.columns.tolist()[-1]
pop_id.population_summary(adata, categorical_obs='Case', groupby_obs=newest_obs)

#### <font color='orange'>OPTIONAL - Consensus Clustering</font>

This is another clustering option that runs a lot faster than Leiden, and that can be run several times to get the consensus of several iterations

In [None]:
# IPython help syntax: the trailing '?' displays the documentation for this function
pop_id.consensus?

In [None]:
# Settings for the consensus clustering
cluster_numbers = [10]  # Number of different clusters to try and find
consensus_runs = 50  # Number of runs to find the consensus of

pop_id.consensus(adata, n_clusters=cluster_numbers, n_runs=consensus_runs)

In [None]:
# Summarise the populations in the 'sc3s_10' .obs, split by 'Case'
consensus_obs = 'sc3s_10'
pop_id.population_summary(adata, categorical_obs='Case', groupby_obs=consensus_obs)

#### <font color='orange'>OPTIONAL - Prune populations</font>
This reduces the number of populations in an .obs (usually a Leiden clustering) to at most a given number (**max_leiden**) by merging populations based upon their closeness in the dendrogram clustering

In [None]:
# IPython help syntax: the trailing '?' displays the documentation for this function
pop_id.prune_leiden_using_dendrogram?

In [None]:
# Settings for merging populations using the dendrogram
prune_settings = {
    'leiden_obs': 'leiden_1',  # .obs leiden you want to reduce numbers of
    'new_obs': 'leiden_merged',  # .obs where the new population will be saved
    'max_leiden': 6,
}
pop_id.prune_leiden_using_dendrogram(adata, **prune_settings)

# Summarise using the last column currently in .obs
newest_obs = adata.obs.columns.tolist()[-1]
pop_id.population_summary(adata, categorical_obs='Case', groupby_obs=newest_obs)

#### <font color='red'>OPTIONAL - Subclustering with different markers</font>

**<font color='red'>This is old code I haven't updated</font>**


This is an optional extra that will see if you can subcluster using a different set of markers, as the subclustering options above use the same markers. The only drawback is that it creates a new AnnData object which you'll have to analyse separately. If you're unsure what any of this means, then don't use it.

In [None]:
# Markers to use in the sub-analysis; this example focuses on the myeloid markers
myeloid = [
    'Iba1',
    'CD14',
    'CD16',
    'CD44',
    'CD11c',
    'CD206',
    'CX3CR1',
    'HLA-DR',
    'CD163',
    'CD68',
]

In [None]:
# Settings for building a brand new adata object containing just the myeloid cells
subclustering_settings = {
    'population_obs': 'leiden_0.35',  # The Leiden .obs used to identify the myeloid populations in the original adata
    'populations': ['4'],  # List of populations from the above population_obs
    'marker_list': myeloid,  # List of markers to use
    'clustering_resolutions': [0.1, 0.25, 0.4, 0.7],  # Clustering resolutions to try
    'umap_categories': ['ROI', 'Case'],
    'batch_correct': 'bbknn',  # Batch correction method; put None if you don't want to correct
    'batch_correct_obs': 'Case',  # Which .obs to use for batch correction
    'close_plots': True,
}

adata_myeloid = pop_id.adata_subclustering(adata, **subclustering_settings)

This next function will transfer the population labels from one adata and .obs to another. This relies upon all cells having a unique ID in the 'Master_Index' column of .obs

In [None]:
# Copy population labels from the myeloid adata back into the main adata,
# matching cells through the 'Master_Index' column of .obs
pop_id.transfer_populations(
    adata_source=adata_myeloid,
    adata_source_populations_obs='leiden_0.1',
    adata_target=adata,
    adata_target_populations_obs='leiden_0.35',
    common_cell_index='Master_Index',
    pop_prefix='NEW',
)

# 5. Plot final clustering results

In [None]:
# Summarise using the last column currently in .obs, split by 'Case'
final_obs = adata.obs.columns.tolist()[-1]
pop_id.population_summary(adata, categorical_obs='Case', groupby_obs=final_obs)

# 6. Labelling populations

## 6A. Create labelling file

In [None]:
# Create the remapping (labelling) file for the chosen Leiden .obs
obs_to_label = 'leiden_1'
pop_id.create_remapping(adata, obs_to_label)

## 6B. Edit in Excel

<font color='red'>**The remapping file is created in this directory, fill in the table in Excel for how each of the populations should be remapped.**</font>

By default three new populations are added (population, population_broad, hierarchy), but you can call them whatever you like and add more columns/new groups if you wish

## 6C. Read in labelling file and remap

In [None]:
# Read the edited labelling file back in and remap the chosen Leiden .obs
obs_to_remap = 'leiden_1'
pop_id.read_remapping(adata, obs_to_remap)

# 7. Colourmap

The following function will allow you to view and select the colours for your new populations

<font color='red'>**WARNING** - This can be unstable on some machines, so make sure your AnnData is saved</font>


In [None]:
# View and select the colours for the populations in the 'population_broad' .obs
population_obs_to_colour = 'population_broad'
pop_id.recolour_population(adata, population_obs_to_colour, save=True)