In [1]:
import numpy as np
import pandas as pd
import scanpy.api as sc
import anndata
from anndata import AnnData
import sys
sys.path.insert(1, '../')
import util
from itertools import product

# autoreload
%load_ext autoreload
%autoreload 2

# 
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [2]:
# Data path: root folder of the data and the output locations derived from it
data_path = '/n/groups/price/martin/tms_gene_data'
# Summary tables and tissue / tissue-celltype lists
output_folder = data_path + '/DGE_result'
# Per-tissue RDS files of the normalized FACS / droplet data
output_data_folder = data_path + '/rds_by_tissue.1e4'
# Per-tissue h5ad files of the raw FACS / droplet data
output_raw_adata_folder = data_path + '/raw_adata_by_tissue'
# Per-tissue RDS files of the normalized bulk data. The bulk export cell reads
# this name, so it must be defined here for a fresh-kernel "Run All" to work.
# The value matches the paths printed in that cell's saved output
# (.../rds_by_tissue.tpm/bulk.normalized.<tissue>.rds).
output_data_folder_tmp = data_path + '/rds_by_tissue.tpm'

### bulk data

In [8]:
temp_data = util.load_normalized_data_bulk(data_path)

Trying to set attribute `.obs` of view, copying.


In [9]:
# Count bulk samples per tissue (total, age_num < 10, age_num > 10), then write
# the table to 'bulk.tissue_info' and the tissue names to 'bulk.tissue_list'.
tissue_list = sorted(set(temp_data.obs['tissue']))
df_info = pd.DataFrame(index=tissue_list, columns=['n_sample', 'n_sample_young', 'n_sample_old'], data=0)
for tissue in tissue_list:
    in_tissue = (temp_data.obs['tissue'] == tissue)
    age_in_tissue = temp_data.obs['age_num'][in_tissue]
    df_info.loc[tissue, 'n_sample'] = in_tissue.sum()
    df_info.loc[tissue, 'n_sample_young'] = (age_in_tissue<10).sum()
    df_info.loc[tissue, 'n_sample_old'] = (age_in_tissue>10).sum()

# Space-separated summary table
df_info.to_csv(output_folder + '/bulk.tissue_info', sep=' ')
# One tissue name per line, in the same order as the table
with open(output_folder + '/bulk.tissue_list', 'w') as f:
    f.writelines('%s\n'%tissue_name for tissue_name in df_info.index)

In [10]:
# Export the normalized bulk data, one RDS file per tissue.
# `temp` and `fil` are passed to R by name through the `-i` flags of the R cell
# magic, so those two variable names must stay as they are.
for tissue in tissue_list:
    temp = temp_data[temp_data.obs['tissue'] == tissue]
    fil = output_data_folder_tmp+'/bulk.normalized.%s.rds'%tissue
    get_ipython().run_cell_magic(u'R', u'-i temp -i fil', "print(fil)\nsaveRDS(temp, fil)")

[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Bone.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Brain.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Brown_Fat.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Gonadal_Fat.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Heart.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Kidney.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Limb_Muscle.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Liver.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Lung.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Marrow.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.tpm/bulk.normalized.Mesenteric_Fat.rds"
[1] "/n/groups/pri

In [17]:
# NOTE: this whole cell is disabled -- every line below is commented out.
# When enabled, it builds a 4-gene toy dataset from the 'Bone' bulk samples and
# saves it as 'toy.rds' in output_data_folder. The four columns of X become:
#   gene 0: male indicator + noise
#   gene 1: 2 * age_num + noise
#   gene 2: male indicator + 2 * age_num + noise
#   gene 3: noise only
# where noise = 0.1 * standard normal draws, with the NumPy seed fixed to 0.
# # write normalized bulk data 
# tissue = 'Bone'
# ind_select = (temp_data.obs['tissue'] == tissue)
# temp = temp_data[ind_select]

# np.random.seed(0)
# temp = temp[:, 0:4]
# temp.X[:,0] = (temp.obs['sex'] == 'male')*1 + np.random.randn(temp.shape[0]) * 0.1
# temp.X[:,1] = temp.obs['age_num']*2 + np.random.randn(temp.shape[0]) * 0.1
# temp.X[:,2] = (temp.obs['sex'] == 'male')*1 + temp.obs['age_num']*2 + np.random.randn(temp.shape[0]) * 0.1
# temp.X[:,3] = np.random.randn(temp.shape[0]) * 0.1

# fil = output_data_folder+'/toy.rds'
# get_ipython().run_cell_magic(u'R', u'-i temp -i fil', "print(fil)\nsaveRDS(temp, fil)")
# #     break

Trying to set attribute `.X` of view, copying.


[1] "/n/groups/price/martin/tms_gene_data/adata_by_tissue/toy.rds"


### FACS and droplet data

In [4]:
def _count_by_age(obs, ind_select):
    """Return (n_cell, n_young, n_old) for the rows of `obs` flagged by `ind_select`.

    Young means age_num < 10 and old means age_num > 10.
    """
    age_selected = obs['age_num'][ind_select]
    return ind_select.sum(), (age_selected<10).sum(), (age_selected>10).sum()


def _write_info(df_info, info_path, list_path):
    """Sort `df_info` by index, write it space-separated to `info_path`, and
    write its index (one entry per line) to `list_path`. Returns the sorted frame."""
    df_sorted = df_info.loc[sorted(df_info.index)]
    df_sorted.to_csv(info_path, sep=' ')
    with open(list_path, 'w') as f:
        f.writelines('%s\n'%entry for entry in df_sorted.index)
    return df_sorted


# For every dataset, record which tissues and which tissue.cell_type pairs have
# more than `min_cell_number` cells in BOTH the young and the old group.
dname_list = ['facs', 'facs_old', 'droplet', 'droplet_old']
for dname in dname_list:
    temp_data = util.load_normalized_data(data_path, data_name=dname,
                                          flag_size_factor = False,
                                          flag_log1p = False)
    obs = temp_data.obs

    # Minimum cell count per age group depends on the dataset
    if 'facs' in dname:
        min_cell_number = 25
    elif 'droplet' in dname:
        min_cell_number = 100

    # tissue level
    tissue_list = sorted(set(obs['tissue']))
    df_info = pd.DataFrame(columns=['n_cell', 'n_cell_young', 'n_cell_old'])
    for tissue in tissue_list:
        n_cell, n_young, n_old = _count_by_age(obs, obs['tissue'] == tissue)
        if (n_young>min_cell_number) and (n_old>min_cell_number):
            df_info.loc[tissue] = [n_cell, n_young, n_old]
    df_info = _write_info(df_info,
                          output_folder + '/%s.tissue_info'%dname,
                          output_folder + '/%s.tissue_list'%dname)

    # tissue_celltype level: every (tissue, cell type) pair, keyed 'tissue.cell_type'
    cell_type_list = sorted(set(obs['cell_ontology_class']))
    df_info = pd.DataFrame(columns=['n_cell', 'n_cell_young', 'n_cell_old'])
    for tissue,cell_type in product(tissue_list, cell_type_list):
        analyte = '%s.%s'%(tissue,cell_type)
        in_analyte = (obs['cell_ontology_class'] == cell_type) & (obs['tissue'] == tissue)
        n_cell, n_young, n_old = _count_by_age(obs, in_analyte)
        # Cell types labelled with the string 'nan' are never kept
        if (n_young>min_cell_number) and (n_old>min_cell_number) and (cell_type!='nan'):
            df_info.loc[analyte] = [n_cell, n_young, n_old]
    df_info = _write_info(df_info,
                          output_folder + '/%s.tc_info'%dname,
                          output_folder + '/%s.tc_list'%dname)

Trying to set attribute `.obs` of view, copying.


In [3]:
# Export the normalized data, one RDS file per tissue.
# Only 'facs' and 'droplet' are exported here; 'facs_old' and 'droplet_old' are not.
dname_list = ['facs', 'droplet']
for dname in dname_list:
    # Size-factor normalization to 1e4 counts per cell followed by log1p,
    # as requested through the loader's flags
    temp_data = util.load_normalized_data(data_path, data_name=dname,
                                          flag_size_factor=True,
                                          total_ct_per_cell=1e4, 
                                          flag_log1p=True)
    # Number of genes with a positive value in each cell
    temp_data.obs['n_genes'] = (temp_data.X>0).sum(axis=1)

    tissue_list = sorted(set(temp_data.obs['tissue']))

    # `temp` and `fil` are handed to R by name via `-i`, so keep these names
    for tissue in tissue_list:
        temp = temp_data[temp_data.obs['tissue'] == tissue]
        fil = output_data_folder+'/%s.normalized.%s.rds'%(dname, tissue)
        get_ipython().run_cell_magic(u'R', u'-i temp -i fil', "print(fil)\nsaveRDS(temp, fil)")


Trying to set attribute `.obs` of view, copying.


[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Aorta.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.BAT.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Bladder.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Brain_Myeloid.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Brain_Non-Myeloid.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Diaphragm.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.GAT.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Heart.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Kidney.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Large_Intestine.rds"
[1] "/n/groups/price/martin/tms_gene_data/rds_by_tissue.1e4/facs.normalized.Limb_Muscle.rds"
[1] 

In [26]:
# Store the raw (no size-factor normalization, no log1p) data by tissue as h5ad
dname_list = ['facs', 'droplet']
for dname in dname_list:
    temp_data = util.load_normalized_data(data_path, data_name=dname,
                                          flag_size_factor = False,
                                          flag_log1p=False)

    # One output file per tissue, visited in sorted order
    tissue_list = sorted(set(temp_data.obs['tissue']))
    for tissue in tissue_list:
        temp = temp_data[temp_data.obs['tissue'] == tissue]
        fil = output_raw_adata_folder+'/%s.raw.%s.h5ad'%(dname, tissue)
        temp.write(fil)

Trying to set attribute `.obs` of view, copying.
