# Integration

**[Scanorama, Hie et al., 2019](https://www.nature.com/articles/s41587-019-0113-3)**  
[GitHub](https://github.com/brianhie/scanorama)  
[Scanpy tutorial: integration with Scanorama](https://scanpy-tutorials.readthedocs.io/en/latest/spatial/integration-scanorama.html)  
[Scanpy external API reference: `scanorama_integrate`](https://scanpy.readthedocs.io/en/stable/generated/scanpy.external.pp.scanorama_integrate.html)

See this [fix](https://github.com/theislab/single-cell-tutorial/issues/60) for running [scran pooling normalization](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0947-7) (`computeSumFactors`) in the current Python environment. 

In [1]:
import scanpy as sc
import scanorama

import numpy as np
import pandas as pd

from IPython.display import display, HTML
from matplotlib.pyplot import rc_context

import os

In [2]:
# Working directory: the relative paths used further down are resolved against this project root
WORK_DIR = '/research/peer/fdeckert/FD20200109SPLENO'
os.chdir(WORK_DIR)

In [3]:
# rpy2: set R_HOME to the R installation inside the project's conda environment
# (done here, before the cell that imports rpy2)
os.environ.update(R_HOME='/home/fdeckert/bin/miniconda3/envs/p.3.8.12-FD20200109SPLENO/lib/R')

In [4]:
# Plotting: read the colour definitions from the project's R script
import rpy2.robjects as robjects

color_load = robjects.r.source('plotting_global.R')
palettes = color_load[0]  # element 0 of the source() result, i.e. the value the script evaluates to

# Nested dict: palette name -> {entry name -> first element stored under that entry}
color = {}
for i in range(len(palettes)):
    palette = palettes[i]
    color[palettes.names[i]] = {key: palette.rx2(key)[0] for key in palette.names}

# Default figure size for all scanpy plots
sc.set_figure_params(figsize=(5, 5))

# Parameters

In [5]:
# Scanorama settings (forwarded to scanorama.integrate_scanpy)
dimred = 100  # number of dimensions of the integrated embedding
knn = 20      # nearest neighbours used when matching cells between datasets
sigma = 15    # smoothing parameter of the Gaussian kernel used for the correction
alpha = 0.1   # minimum alignment score required between two datasets

# Scanorama

In [6]:
# Read the stored object and pull out the gene list kept under uns['hvg_int_8000']
adata = sc.read_h5ad('data/object/so_sct.h5ad')
hvg = [gene for gene in adata.uns['hvg_int_8000']]

# Continue with the contents of .raw as a stand-alone AnnData
adata = adata.raw.to_adata()

In [7]:
def set_color(categories):
    """Order categorical ``adata.obs`` columns and attach their colours.

    Operates on the notebook-level ``adata`` and ``color`` objects. For every
    name in ``categories`` that is also a column of ``adata.obs``:

    * the column is converted to a categorical and categories that no cell
      carries are dropped,
    * the remaining categories are put in the order in which they appear in
      ``color[name]``,
    * the matching colours are written to ``adata.uns[name + '_colors']``.

    Parameters
    ----------
    categories : iterable of str
        Candidate column names; names that are not columns of ``adata.obs``
        are skipped.

    Raises
    ------
    ValueError
        If a column contains a value for which ``color`` defines no colour.
    """
    for category in categories:

        if category not in adata.obs.columns:
            continue

        palette = color[category]

        # Drop categories without cells: reorder_categories requires the new
        # order to contain exactly the existing categories.
        values = adata.obs[category].astype('category').cat.remove_unused_categories()

        # Set lookup instead of scanning every cell once per palette key
        present = set(values.cat.categories)
        keys = [key for key in palette if key in present]

        missing = present.difference(keys)
        if missing:
            raise ValueError(
                "No colour defined for value(s) " + ", ".join(sorted(map(str, missing)))
                + " of adata.obs['" + str(category) + "']"
            )

        adata.obs[category] = values.cat.reorder_categories(keys)
        adata.uns[category + '_colors'] = np.array([palette[key] for key in keys], dtype=object)
        
# Apply the colour scheme to every annotation that has a palette
set_color(list(color))

In [8]:
# Total-count normalisation followed by log1p, applied to the data taken from .raw
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Restrict to the integration gene list. The explicit copy makes the subset a
# stand-alone AnnData instead of a view, so the later writes to .obs and .obsm
# do not go through anndata's implicit view-to-copy conversion (and its warning).
adata = adata[:, hvg].copy()

In [9]:
# Batch label: treatment and replicate concatenated into one string
adata.obs['integrate'] = adata.obs['treatment'].astype(str) + adata.obs['sample_rep'].astype(str)

# One independent AnnData per batch, each passed through sc.pp.scale on its own;
# batches are kept in order of first appearance of their label.
adata_sub = []
for sample_group in adata.obs['integrate'].unique():
    adata_tmp = adata[adata.obs['integrate'] == sample_group].copy()
    sc.pp.scale(adata_tmp)
    adata_sub.append(adata_tmp)

  adata.obs['integrate'] = adata.obs['treatment'].astype(str)+adata.obs['sample_rep'].astype(str)


In [10]:
# Run Scanorama; the embedding is read back from .obsm['X_scanorama'] of every batch
scanorama.integrate_scanpy(adata_sub, dimred=dimred, knn=knn, sigma=sigma, alpha=alpha, verbose=True)

# Concatenate scanorama output (rows follow the batch order of adata_sub)
X_scanorama = np.concatenate([ad.obsm['X_scanorama'] for ad in adata_sub])
obs_names = np.concatenate([ad.obs_names for ad in adata_sub])

# Look up, for every cell of adata, its row in the concatenated embedding.
# The batch-wise concatenation is only in the same order as adata when the
# cells of each batch are contiguous in adata, so the rows are reordered by
# cell name instead of relying on an unchecked comparison.
# get_indexer itself raises if obs_names contains duplicates.
row_of_cell = pd.Index(obs_names).get_indexer(adata.obs_names)
if len(obs_names) != adata.n_obs or (row_of_cell < 0).any():
    raise ValueError('Cells of the Scanorama batches do not match the cells of adata')

# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama[row_of_cell]

Found 8000 genes among all datasets
Processing datasets (2, 3)
Processing datasets (1, 3)
Processing datasets (0, 1)
Processing datasets (0, 2)
Processing datasets (0, 3)
Processing datasets (1, 2)


# Check cluster composition at high resolution

In [None]:
# Neighbourhood graph on the Scanorama embedding, then clustering and UMAP
sc.pp.neighbors(adata, use_rep='X_scanorama', n_pcs=dimred, n_neighbors=10, metric='euclidean')
sc.tl.leiden(adata, resolution=2)
sc.tl.louvain(adata, resolution=2)
sc.tl.umap(adata, min_dist=0.3, spread=1.0)

# UMAP panels: clusterings first, then sample annotations and QC metrics
umap_colors = [
    'louvain', 'leiden', 'tissue',
    'treatment', 'label_fine_haemosphere', 'sample_rep',
    'cc_phase_class', 'pMt_RNA', 'pHb_RNA',
    'pRb_RNA', 'nCount_RNA', 'nFeature_RNA',
]
sc.pl.umap(adata, color=umap_colors, ncols=3, wspace=0.5)

In [None]:
# Number of cells per replicate (rows) and Leiden cluster (columns), NaCl samples only.
# pd.crosstab replaces groupby().value_counts().to_frame().pivot(values=0): that chain
# needs the counts column to be named 0, which pandas >= 2.0 (where it is called
# 'count') no longer provides.
df = adata[adata.obs['treatment']=='NaCl'].copy().obs
df = pd.crosstab(df['sample_rep'], df['leiden'])
display(HTML(df.to_html()))

In [None]:
# Number of cells per replicate (rows) and Leiden cluster (columns), CpG samples only.
# pd.crosstab replaces groupby().value_counts().to_frame().pivot(values=0): that chain
# needs the counts column to be named 0, which pandas >= 2.0 (where it is called
# 'count') no longer provides.
df = adata[adata.obs['treatment']=='CpG'].copy().obs
df = pd.crosstab(df['sample_rep'], df['leiden'])
display(HTML(df.to_html()))

In [None]:
# Distribution of the QC metrics within each Leiden cluster
qc_metrics = ['nCount_RNA', 'nFeature_RNA', 'pMt_RNA', 'pRb_RNA']
with rc_context({'figure.figsize': (7.5, 3)}):
    sc.pl.violin(adata, qc_metrics, groupby='leiden', multi_panel=True, stripplot=False, inner='box')

# Dimensionality reduction and clustering (n_neighbors=30)

In [None]:
# Neighbourhood graph on the Scanorama embedding, then clustering and UMAP
sc.pp.neighbors(adata, use_rep='X_scanorama', n_pcs=dimred, n_neighbors=30, metric='euclidean')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata, min_dist=0.3, spread=1.0)

# UMAP panels: clusterings first, then sample annotations and QC metrics
umap_colors = [
    'louvain', 'leiden', 'tissue',
    'treatment', 'label_fine_haemosphere', 'sample_rep',
    'cc_phase_class', 'pHb_RNA', 'pRb_RNA',
]
sc.pl.umap(adata, color=umap_colors, ncols=3, wspace=0.5)