In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
import numpy as np
import scanpy.api as sc
from anndata import read_h5ad
from anndata import AnnData
import scipy as sp
import scipy.stats
from gprofiler import GProfiler
import pickle
# Other specific functions 
from itertools import product
from statsmodels.stats.multitest import multipletests
import util
# R related packages 
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri

In [6]:
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
# autoreload
%load_ext autoreload
%autoreload 2
# logging
sc.logging.print_versions()

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
scanpy==1.4.3 anndata==0.6.20 umap==0.3.8 numpy==1.16.4 scipy==1.2.1 pandas==0.25.0 scikit-learn==0.21.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [7]:
%%R
library(MAST)

## Load data

In [8]:
# Data path
data_path = '/data3/martin/tms_gene_data'
output_folder = data_path + '/DE_result'

In [10]:
# Load the data 
adata_combine = util.load_normalized_data(data_path)

In [11]:
temp_facs = adata_combine[adata_combine.obs['b_method']=='facs',]
temp_droplet = adata_combine[adata_combine.obs['b_method']=='droplet',]

## Generate a list of tissues for DE testing

In [13]:
tissue_list = list(set(temp_facs.obs['tissue']))
min_cell_number = 1
analysis_list = []
analysis_info = {}
# for cell_type in cell_type_list:
for tissue in tissue_list:
    analyte = tissue
    ind_select = (temp_facs.obs['tissue'] == tissue)
    n_young = (temp_facs.obs['age'][ind_select].isin(['1m', '3m'])).sum()
    n_old = (temp_facs.obs['age'][ind_select].isin(['18m', '21m',
                                                   '24m', '30m'])).sum()
    analysis_info[analyte] = {}
    analysis_info[analyte]['n_young'] = n_young
    analysis_info[analyte]['n_old'] = n_old
    if (n_young>min_cell_number) & (n_old>min_cell_number):
        print('%s, n_young=%d, n_old=%d'%(analyte, n_young, n_old))
        analysis_list.append(analyte)

Tongue, n_young=1418, n_old=2317
Thymus, n_young=1359, n_old=2716
Pancreas, n_young=1588, n_old=1796
Brain_Non-Myeloid, n_young=3650, n_old=4879
Trachea, n_young=1354, n_old=1806
Kidney, n_young=561, n_old=1620
Bladder, n_young=1383, n_old=1790
Heart, n_young=4464, n_old=5405
Spleen, n_young=1702, n_old=2132
BAT, n_young=713, n_old=1510
Diaphragm, n_young=903, n_old=955
Aorta, n_young=557, n_old=781
Skin, n_young=2346, n_old=2514
Brain_Myeloid, n_young=4532, n_old=9044
Liver, n_young=731, n_old=2128
SCAT, n_young=1721, n_old=2034
Lung, n_young=1743, n_old=4332
GAT, n_young=1464, n_old=1942
Mammary_Gland, n_young=2414, n_old=642
Limb_Muscle, n_young=1102, n_old=2753
Marrow, n_young=5069, n_old=9449
MAT, n_young=1187, n_old=1827
Large_Intestine, n_young=3987, n_old=4324


### DE using R package MAST 

In [15]:
## DE testing
gene_name_list = np.array(temp_facs.var_names)
DE_result_MAST = {}
for i_analyte,analyte in enumerate(analysis_list):
    print(analyte, '%d/%d'%(i_analyte, len(analysis_list)))
    tissue = analyte
#     tissue,cell_type = analyte.split('.')
    ind_select = (temp_facs.obs['tissue'] == tissue)
    adata_temp = temp_facs[ind_select,]
    # reformatting
    adata_temp.X = np.array(adata_temp.X.todense())
    adata_temp.obs['condition'] = [int(x[:-1]) for x in adata_temp.obs['age']] 
    adata_temp.obs = adata_temp.obs[['condition', 'sex']]
    if len(set(adata_temp.obs['sex'])) <2:
        covariate = ''
    else:
        covariate = '+sex'
    # toy example
#     covariate = ''
#     np.random.seed(0)
#     ind_select = np.random.permutation(adata_temp.shape[0])[0:100]
#     ind_select = np.sort(ind_select)
#     adata_temp = adata_temp[:, 0:200]
#     adata_temp.X[:,0] = (adata_temp.obs['sex'] == 'male')*3
#     adata_temp.X[:,1] = (adata_temp.obs['condition'])*3
    # DE using MAST 
    R_cmd = util.call_MAST_age()
    get_ipython().run_cell_magic(u'R', u'-i adata_temp -i covariate -o de_res', R_cmd)
    de_res.columns = ['gene', 'raw-p', 'coef', 'bh-p']
    de_res.index = de_res['gene']
    DE_result_MAST[analyte] = pd.DataFrame(index = gene_name_list)
    DE_result_MAST[analyte] = DE_result_MAST[analyte].join(de_res)
    # fc between yound and old
    X = adata_temp.X
    y = (adata_temp.obs['condition']>10)
    DE_result_MAST[analyte]['fc'] = X[y,:].mean(axis=0) - X[~y,:].mean(axis=0)
#     break

Tongue 0/23
Thymus 1/23
Pancreas 2/23
Brain_Non-Myeloid 3/23
Trachea 4/23
Kidney 5/23
Bladder 6/23
Heart 7/23
Spleen 8/23
BAT 9/23
Diaphragm 10/23
Aorta 11/23
Skin 12/23
Brain_Myeloid 13/23
Liver 14/23
SCAT 15/23
Lung 16/23
GAT 17/23
Mammary_Gland 18/23
Limb_Muscle 19/23
Marrow 20/23
MAT 21/23
Large_Intestine 22/23


### Save DE results

In [16]:
with open(output_folder+'/DE_tissue_FACS.pickle', 'wb') as handle:
    pickle.dump(DE_result_MAST, handle)
    pickle.dump(analysis_list, handle)
    pickle.dump(analysis_info, handle)

### Validation

In [12]:
for analyte in DE_result_MAST.keys():
    print(analyte, np.sum(DE_result_MAST[analyte]['bh-p']<0.00001))


Fat 12428
Liver 8147
Limb_Muscle 8066
Trachea 9606
Pancreas 10145
Marrow 10720
Tongue 8795
Lung 5588
Spleen 2500
Heart 8132
Skin 8647
Bladder 11658
Kidney 1001
Thymus 4762
Brain_Myeloid 9601
Mammary_Gland 5029
Brain_Non-Myeloid 12855
Large_Intestine 10159
