In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
import numpy as np
import scanpy.api as sc
from anndata import read_h5ad
from anndata import AnnData
import scipy as sp
import scipy.stats
from gprofiler import GProfiler
import pickle
# Other specific functions 
from itertools import product
from statsmodels.stats.multitest import multipletests
import util
# R related packages 
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri



In [2]:
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
# autoreload
%load_ext autoreload
%autoreload 2
# logging
sc.logging.print_versions()

scanpy==1.4.3 anndata==0.6.20 umap==0.3.8 numpy==1.16.4 scipy==1.2.1 pandas==0.25.0 scikit-learn==0.21.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [3]:
%%R
library(MAST)

## Load data

In [4]:
# Data path
data_path = '/data3/martin/tms_gene_data'
output_folder = data_path + '/DE_result'

In [5]:
# Load the data 
adata_combine = util.load_normalized_data(data_path)

In [6]:
temp_facs = adata_combine[adata_combine.obs['b_method']=='facs',]
temp_droplet = adata_combine[adata_combine.obs['b_method']=='droplet',]

## Generate a list of tissue-cell types for DE testing

In [7]:
cell_type_list = list(set(temp_facs.obs['cell_ontology_class']))
tissue_list = list(set(temp_facs.obs['tissue']))
min_cell_number = 25
analysis_list = []
analysis_info = {}
# for cell_type in cell_type_list:
for tissue,cell_type in product(tissue_list, cell_type_list):
    analyte = '%s.%s'%(tissue,cell_type)
    ind_select = (temp_facs.obs['cell_ontology_class'] == cell_type) & \
                    (temp_facs.obs['tissue'] == tissue)
    n_young = (temp_facs.obs['age'][ind_select].isin(['1m', '3m'])).sum()
    n_old = (temp_facs.obs['age'][ind_select].isin(['18m', '21m',
                                                   '24m', '30m'])).sum()
    analysis_info[analyte] = {}
    analysis_info[analyte]['n_young'] = n_young
    analysis_info[analyte]['n_old'] = n_old
    if (n_young>min_cell_number) & (n_old>min_cell_number) & (cell_type!='nan'):
        print('%s, n_young=%d, n_old=%d'%(analyte, n_young, n_old))
        analysis_list.append(analyte)

Pancreas.pancreatic ductal cell, n_young=163, n_old=150
Pancreas.pancreatic D cell, n_young=123, n_old=68
Pancreas.pancreatic B cell, n_young=522, n_old=820
Pancreas.leukocyte, n_young=54, n_old=66
Pancreas.pancreatic A cell, n_young=364, n_old=157
Pancreas.endothelial cell, n_young=86, n_old=116
Pancreas.pancreatic acinar cell, n_young=191, n_old=385
Brain_Myeloid.macrophage, n_young=44, n_old=243
Brain_Myeloid.microglial cell, n_young=4488, n_old=8642
Skin.basal cell of epidermis, n_young=678, n_old=994
Skin.epidermal cell, n_young=260, n_old=232
Skin.bulge keratinocyte, n_young=1370, n_old=1234
Marrow.precursor B cell, n_young=489, n_old=367
Marrow.late pro-B cell, n_young=276, n_old=135
Marrow.CD4-positive, alpha-beta T cell, n_young=29, n_old=251
Marrow.macrophage, n_young=182, n_old=255
Marrow.basophil, n_young=29, n_old=42
Marrow.mature alpha-beta T cell, n_young=99, n_old=231
Marrow.early pro-B cell, n_young=29, n_old=43
Marrow.granulocyte, n_young=742, n_old=2150
Marrow.NK cel

### DE using R package MAST 

In [8]:
## DE testing
gene_name_list = np.array(temp_facs.var_names)
DE_result_MAST = {}
for i_analyte,analyte in enumerate(analysis_list):
    print(analyte, '%d/%d'%(i_analyte, len(analysis_list)))
    tissue,cell_type = analyte.split('.')
    ind_select = (temp_facs.obs['cell_ontology_class'] == cell_type) & \
                    (temp_facs.obs['tissue'] == tissue)
    adata_temp = temp_facs[ind_select,]
    # reformatting
    adata_temp.X = np.array(adata_temp.X.todense())
    adata_temp.obs['condition'] = [int(x[:-1]) for x in adata_temp.obs['age']] 
    adata_temp.obs = adata_temp.obs[['condition', 'sex']]
    if len(set(adata_temp.obs['sex'])) <2:
        covariate = ''
    else:
        covariate = '+sex'
#     # toy example
#     covariate = ''
#     np.random.seed(0)
#     ind_select = np.random.permutation(adata_temp.shape[0])[0:100]
#     ind_select = np.sort(ind_select)
#     adata_temp = adata_temp[ind_select, 0:3]
#     adata_temp.X[:,0] = (adata_temp.obs['sex'] == 'male')*3
#     adata_temp.X[:,1] = (adata_temp.obs['condition'])*3
    # DE using MAST 
    R_cmd = util.call_MAST_age()
    get_ipython().run_cell_magic(u'R', u'-i adata_temp -i covariate -o de_res', R_cmd)
    de_res.columns = ['gene', 'raw-p', 'coef', 'bh-p']
    de_res.index = de_res['gene']
    DE_result_MAST[analyte] = pd.DataFrame(index = gene_name_list)
    DE_result_MAST[analyte] = DE_result_MAST[analyte].join(de_res)
    # fc between yound and old
    X = adata_temp.X
    y = (adata_temp.obs['condition']>10)
    DE_result_MAST[analyte]['fc'] = X[y,:].mean(axis=0) - X[~y,:].mean(axis=0)
#     break

Pancreas.pancreatic ductal cell 0/131
Pancreas.pancreatic D cell 1/131
Pancreas.pancreatic B cell 2/131
Pancreas.leukocyte 3/131
Pancreas.pancreatic A cell 4/131
Pancreas.endothelial cell 5/131
Pancreas.pancreatic acinar cell 6/131
Brain_Myeloid.macrophage 7/131
Brain_Myeloid.microglial cell 8/131
Skin.basal cell of epidermis 9/131
Skin.epidermal cell 10/131
Skin.bulge keratinocyte 11/131
Marrow.precursor B cell 12/131
Marrow.late pro-B cell 13/131
Marrow.CD4-positive, alpha-beta T cell 14/131
Marrow.macrophage 15/131
Marrow.basophil 16/131
Marrow.mature alpha-beta T cell 17/131
Marrow.early pro-B cell 18/131
Marrow.granulocyte 19/131
Marrow.NK cell 20/131
Marrow.naive B cell 21/131
Marrow.promonocyte 22/131
Marrow.granulocyte monocyte progenitor cell 23/131
Marrow.immature B cell 24/131
Marrow.hematopoietic stem cell 25/131
Marrow.granulocytopoietic cell 26/131
Brain_Non-Myeloid.oligodendrocyte precursor cell 27/131
Brain_Non-Myeloid.oligodendrocyte 28/131
Brain_Non-Myeloid.brain peri

### Save DE results

In [9]:
with open(output_folder+'/DE_tissue_cell_FACS.pickle', 'wb') as handle:
    pickle.dump(DE_result_MAST, handle)
    pickle.dump(analysis_list, handle)
    pickle.dump(analysis_info, handle)

### Validation

In [11]:
# Load DE result
with open(output_folder+'_old/DE.pickle', 'rb') as handle:
    DE_result_MAST_temp = pickle.load(handle)
    analysis_list_temp = pickle.load(handle)
for analyte in analysis_list:
    if analyte in analysis_list_temp:
        bh_p = DE_result_MAST[analyte]['bh-p']
        bh_p_temp = DE_result_MAST_temp[analyte]['bh-p']
        print('%s, New:%d, Old:%d, Overlap:%d'%(analyte, np.sum(bh_p<0.01), 
                                            np.sum(bh_p_temp<0.01), 
                                            np.sum((bh_p<0.01) & (bh_p_temp<0.01))))

Trachea.endothelial cell, New:2437, Old:2312, Overlap:2177
Trachea.fibroblast, New:9847, Old:8395, Overlap:8242
Trachea.basal epithelial cell of tracheobronchial tree, New:1539, Old:1895, Overlap:1169
Trachea.macrophage, New:2225, Old:1221, Overlap:1159
Lung.non-classical monocyte, New:83, Old:86, Overlap:65
Lung.lymphatic endothelial cell, New:54, Old:75, Overlap:54
Lung.classical monocyte, New:121, Old:107, Overlap:72
Lung.fibroblast of lung, New:717, Old:478, Overlap:384
Lung.dendritic cell, New:1710, Old:1900, Overlap:1188
Lung.mature natural killer T cell, New:1513, Old:3384, Overlap:1385
Lung.capillary endothelial cell, New:5690, Old:6824, Overlap:5079
Pancreas.pancreatic acinar cell, New:1396, Old:244, Overlap:223
Pancreas.endothelial cell, New:120, Old:97, Overlap:83
Pancreas.pancreatic D cell, New:1332, Old:715, Overlap:650
Pancreas.pancreatic A cell, New:5651, Old:2568, Overlap:2482
Pancreas.leukocyte, New:482, Old:48, Overlap:48
Pancreas.pancreatic ductal cell, New:3781, Old