In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
import numpy as np
import scanpy.api as sc
from anndata import read_h5ad
from anndata import AnnData
import scipy as sp
import scipy.stats
from gprofiler import GProfiler
from gprofiler_plotting import plot_enrich
import pickle
from sklearn.neighbors import KNeighborsClassifier
from adjustText import adjust_text
# Other specific functions 
from itertools import product
from statsmodels.stats.multitest import multipletests
import martin_helper as mh
# R related packages 
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
# Notebook-wide configuration: R bridge (rpy2), IPython extensions, version report.
# Only let rpy2 callback messages of level ERROR and above through, which hides R warnings.
# Note: comment the next line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Activate the pandas2ri converter so rpy2 outputs are converted to pandas dataframes.
pandas2ri.activate()
# Activate the anndata2ri converter as well.
anndata2ri.activate()
# Load the rpy2 IPython extension (no R magics are used in the cells shown here).
%load_ext rpy2.ipython
# Autoreload mode 2: re-import modules before each cell runs, so edits to local
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Print scanpy's version report so the software environment is recorded in the output.
sc.logging.print_versions()

## Load data

In [None]:
# Root folder of the local data copy; it is passed to mh.load_normalized_data when loading.
# NOTE(review): both paths below are machine-specific absolute paths -- adjust before running elsewhere.
# Alternative location: maca_local = '/data/maca_local'
maca_local = '/data3/martin/maca_local'
# Folder for the results of this analysis (not referenced by the cells shown in this section).
output_folder = '/home/martin/maca/maca-data-analysis/results/cluster_tissue_celltype'

In [None]:
# Load the data through the project helper module (imported as mh).
# Presumably this returns one combined, normalized AnnData that is log1p-transformed
# because log1p=True -- TODO confirm against martin_helper.
adata_combine = mh.load_normalized_data(maca_local, log1p=True)

In [None]:
# Split the cells by the 'b_method' column of .obs: 'facs' versus 'droplet'.
temp_facs = adata_combine[adata_combine.obs['b_method']=='facs',]
temp_droplet = adata_combine[adata_combine.obs['b_method']=='droplet',]

### Clustering all tissues using Angela's recipe

In [None]:
def cluster_angela(input_adata, resolution_val=.7):
    """Run the fixed clustering recipe on a copy of ``input_adata``.

    The steps, in order, are: highly-variable-gene flagging (no subsetting),
    scaling without zero-centering (values capped at 10), a 50-component PCA
    restricted to the highly variable genes, a 15-neighbour graph on the first
    20 PCs, Louvain and Leiden clustering at the same resolution, and UMAP.

    Parameters
    ----------
    input_adata
        AnnData-like object exposing ``.copy()``; all processing is applied
        to the copy.
    resolution_val
        Resolution passed to both ``sc.tl.louvain`` and ``sc.tl.leiden``.

    Returns
    -------
    The processed copy.
    """
    clustered = input_adata.copy()

    # Gene selection thresholds; subset=False keeps every gene in the object.
    hvg_settings = dict(
        subset=False,
        min_disp=0.5,
        max_disp=None,
        min_mean=0.0125,
        max_mean=10,
        n_bins=20,
        n_top_genes=None,
    )
    sc.pp.highly_variable_genes(clustered, **hvg_settings)

    sc.pp.scale(clustered, zero_center=False, max_value=10)
    sc.pp.pca(clustered, n_comps=50, svd_solver='arpack',
              use_highly_variable=True)
    sc.pp.neighbors(clustered, n_neighbors=15, n_pcs=20)

    # Louvain first, then Leiden, both at the requested resolution.
    for run_clustering in (sc.tl.louvain, sc.tl.leiden):
        run_clustering(clustered, resolution=resolution_val)

    sc.tl.umap(clustered)
    return clustered

In [None]:
# Generate the list of analyses to run: one 'method.mouse_id.tissue' key for every
# method/mouse/tissue combination that has more than 100 cells.
#
# Sets of strings iterate in an arbitrary order that can change between kernel
# sessions, so every set is sorted here to make the list order (and therefore the
# printed output and downstream processing order) reproducible. key=str keeps the
# sort from failing if a column contains non-string entries such as NaN.
analysis_list_mt = ['facs.%s'%x for x in sorted(set(temp_facs.obs['tissue']), key=str)] + \
                        ['droplet.%s'%x for x in sorted(set(temp_droplet.obs['tissue']), key=str)]
# A sorted age list: strip the last character of each age label, sort numerically,
# then format back as '<int>m'.
age_list = sorted(int(x[:-1]) for x in set(adata_combine.obs['age']))
age_list = ['%dm'%x for x in age_list]
# Aggregate all mouse_id
mouse_id_list = sorted(set(adata_combine.obs['mouse.id']), key=str)
analysis_list = []
for analyte_mt in analysis_list_mt:
    # The method prefix never contains a dot, so split on the first dot only;
    # a tissue name containing '.' then no longer raises a ValueError here.
    method,tissue = analyte_mt.split('.', 1)
    for mouse_id in mouse_id_list:
        ind_select = (adata_combine.obs['tissue'] == tissue) & \
                        (adata_combine.obs['b_method'] == method) &\
                        (adata_combine.obs['mouse.id'] == mouse_id)
        # Count the selected cells once and reuse the value.
        n_cell = ind_select.sum()
        # Keep only combinations with more than 100 cells.
        if n_cell>100:
            analyte = '%s.%s.%s'%(method,mouse_id,tissue)
            analysis_list.append(analyte)
            print(analyte, n_cell)

The procedure below was run three times, once for each `resolution_val` in `[0.3, 0.5, 0.7]`; the settings for the other runs are kept as commented-out lines.

In [None]:
# Cluster every entry of analysis_list separately.
# dic_data maps each analyte key ('method.mouse_id.tissue') to the AnnData
# returned by cluster_angela for that subset of cells.
dic_data = {}
for analyte in analysis_list:
    print(analyte)
    # Recover the three key components; this requires that none of them contains a '.'.
    method,mouse_id,tissue = analyte.split('.')
    # Boolean mask over cells matching this tissue, method and mouse.
    ind_select = (adata_combine.obs['tissue'] == tissue) & \
                    (adata_combine.obs['b_method'] == method) &\
                    (adata_combine.obs['mouse.id'] == mouse_id)
    temp_adata = adata_combine[ind_select]
    # Active run uses resolution 0.7; the two commented lines are the 0.5 and 0.3 runs.
    dic_data[analyte] = cluster_angela(temp_adata, resolution_val=0.7)
#     dic_data[analyte] = cluster_angela(temp_adata, resolution_val=0.5)
#     dic_data[analyte] = cluster_angela(temp_adata, resolution_val=0.3)
    # UMAP coloured by the 'cell_ontology_class', 'louvain' and 'leiden' columns.
    sc.pl.umap(dic_data[analyte], color=['cell_ontology_class', 'louvain', 'leiden'])

In [None]:
# Persist all clustering results (the whole dic_data dict) in a single pickle file.
# The file-name suffix (07 / 05 / 03) appears to encode the clustering resolution
# (0.7 / 0.5 / 0.3); keep it in sync with the resolution_val used when clustering.
# NOTE(review): temp_folder is a machine-specific absolute path, differs from
# output_folder, and must already exist -- open(..., 'wb') does not create directories.
temp_folder = '/data3/martin/maca_local/result/cluster_tissue_celltype'
with open(temp_folder+'/cluster_mouse_tissue_07.pickle', 'wb') as handle:
# with open(temp_folder+'/cluster_mouse_tissue_05.pickle', 'wb') as handle:
# with open(temp_folder+'/cluster_mouse_tissue_03.pickle', 'wb') as handle:
    pickle.dump(dic_data, handle) 