In [4]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import warnings
import functools
import seaborn as sns
import scipy.stats
import anndata

# Session setup: scanpy verbosity, warning filter, working directory, version log.
sc.settings.verbosity = 3
warnings.filterwarnings('ignore')  # suppress every warning for the rest of the session
# All relative file names used later are resolved against this directory.
# (expanduser() only rewrites a leading '~', so this absolute path passes through as-is.)
os.chdir(
    os.path.expanduser('/home/jovyan/Prostate_analysis/Prostate_Spatial/spatial_h5ad/')
)
sc.logging.print_versions()

scanpy==1.4.5.post2 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.2 scipy==1.4.1 pandas==0.25.1 scikit-learn==0.22.2 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


In [7]:
# Input .h5ad files, one per tissue section, given relative to the working directory.
# The list order is the order in which the samples are loaded and concatenated below.
files = [
    'P2.1_Normal.h5ad',
    'P2.4_GS 3+4.h5ad',
    'P3.1_Inflamed.h5ad',
    'P4.2_Normal.h5ad',
    'P2.3_Inflamed.h5ad',
    'P1.1_Normal.h5ad',
    'P4.3_Normal.h5ad',
    'P4.1_Normal.h5ad',
    'P3.3_GS 3+4.h5ad',
    'P1.3_GS 3+3.h5ad',
    'P3.2_Normal.h5ad',
    'P1.2_GS 3+3.h5ad',
]

In [8]:
# Load every sample, attach per-spot QC columns, then merge all samples into one AnnData.
adata_list = []
for sample_file in files:
    # import data
    sample_adata = sc.read_h5ad(sample_file)
    # Calculate QC metrics (inplace=True: scanpy stores them on the object itself)
    sc.pp.calculate_qc_metrics(sample_adata, inplace=True)
    # Boolean mask over genes whose name starts with 'MT-'
    mito_genes = sample_adata.var_names.str.startswith('MT-')
    # Row sums are 1-D for a dense X but an (n_obs, 1) np.matrix for a scipy-sparse X;
    # flatten them so the .obs assignments below work for either storage layout.
    counts_per_spot = np.asarray(sample_adata.X.sum(axis=1)).ravel()
    mito_per_spot = np.asarray(sample_adata[:, mito_genes].X.sum(axis=1)).ravel()
    # NOTE: despite the column name this is a fraction (mito counts / total counts),
    # not a percentage; no factor of 100 is applied.
    sample_adata.obs['percent_mito'] = mito_per_spot / counts_per_spot
    sample_adata.obs['n_counts'] = counts_per_spot
    # add sample name to obs names so they stay distinguishable after merging
    sample_adata.obs_names = [
        str(sample) + '_' + str(barcode)
        for sample, barcode in zip(sample_adata.obs['sample'], sample_adata.obs_names)
    ]
    sample_adata.obs.index.name = 'spot_id'
    adata_list.append(sample_adata)

# index_unique=None keeps the (already sample-prefixed) obs names exactly as set above.
adata = adata_list[0].concatenate(adata_list[1:], index_unique=None)
adata

AnnData object with n_obs × n_vars = 5910 × 11365 
    obs: 'barcode', 'batch', 'group', 'imagecol', 'imagerow', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'n_counts', 'n_genes_by_counts', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'pct_counts_in_top_50_genes', 'percent_mito', 'sample', 'sum_gene', 'sum_umi', 'total_counts'
    var: 'symbol-0', 'gene_ids-0', 'n_cells_by_counts-0', 'mean_counts-0', 'log1p_mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'log1p_total_counts-0', 'symbol-1', 'gene_ids-1', 'n_cells_by_counts-1', 'mean_counts-1', 'log1p_mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'log1p_total_counts-1', 'symbol-2', 'gene_ids-2', 'n_cells_by_counts-2', 'mean_counts-2', 'log1p_mean_counts-2', 'pct_dropout_by_counts-2', 'total_counts-2', 'log1p_total_counts-2', 'symbol-3', 'gene_ids-3', 'n_cells_by_counts-3', 'mean_counts-3', 'log1p_mean_counts-3', 'pct_dropout_by_counts-3', 'total_counts-3', '

In [12]:
# Copy the var index (the gene names) into an explicit, human-readable 'SYMBOL' column.
adata.var['SYMBOL'] = adata.var_names
# Display the gene annotation table, now including the new column.
adata.var

Unnamed: 0,symbol-0,gene_ids-0,n_cells_by_counts-0,mean_counts-0,log1p_mean_counts-0,pct_dropout_by_counts-0,total_counts-0,log1p_total_counts-0,symbol-1,gene_ids-1,...,log1p_total_counts-10,symbol-11,gene_ids-11,n_cells_by_counts-11,mean_counts-11,log1p_mean_counts-11,pct_dropout_by_counts-11,total_counts-11,log1p_total_counts-11,SYMBOL
SMIM20,SMIM20,ENSG00000250317,80,0.186472,0.170984,85.374771,102,4.634729,SMIM20,ENSG00000250317,...,2.639057,SMIM20,ENSG00000250317,92,0.327586,0.283362,77.339901,133,4.897840,SMIM20
WDR1,WDR1,ENSG00000071127,427,2.085923,1.126851,21.937843,1141,7.040536,WDR1,ENSG00000071127,...,4.394449,WDR1,ENSG00000071127,256,1.586207,0.950192,36.945813,644,6.469250,WDR1
KIF27,KIF27,ENSG00000165115,5,0.012797,0.012716,99.085923,7,2.079442,KIF27,ENSG00000165115,...,1.098612,KIF27,ENSG00000165115,9,0.022167,0.021925,97.783251,9,2.302585,KIF27
DYNLRB1,DYNLRB1,ENSG00000125971,495,3.349177,1.469987,9.506399,1832,7.513709,DYNLRB1,ENSG00000125971,...,4.779123,DYNLRB1,ENSG00000125971,353,3.825123,1.573836,13.054187,1553,7.348588,DYNLRB1
MCL1,MCL1,ENSG00000143384,442,2.261426,1.182165,19.195612,1237,7.121253,MCL1,ENSG00000143384,...,5.068904,MCL1,ENSG00000143384,340,3.477833,1.499139,16.256158,1412,7.253470,MCL1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MOSPD2,MOSPD2,ENSG00000130150,3,0.007313,0.007286,99.451554,4,1.609438,MOSPD2,ENSG00000130150,...,1.791759,MOSPD2,ENSG00000130150,11,0.036946,0.036280,97.290640,15,2.772589,MOSPD2
FAM83D,FAM83D,ENSG00000101447,2,0.003656,0.003650,99.634369,2,1.098612,FAM83D,ENSG00000101447,...,0.693147,FAM83D,ENSG00000101447,4,0.009852,0.009804,99.014778,4,1.609438,FAM83D
SPAG17,SPAG17,ENSG00000155761,1,0.005484,0.005469,99.817185,3,1.386294,SPAG17,ENSG00000155761,...,0.693147,SPAG17,ENSG00000155761,0,0.000000,0.000000,100.000000,0,0.000000,SPAG17
GDA,GDA,ENSG00000119125,1,0.007313,0.007286,99.817185,4,1.609438,GDA,ENSG00000119125,...,0.693147,GDA,ENSG00000119125,1,0.002463,0.002460,99.753695,1,0.693147,GDA


In [13]:
# Persist the combined object as a gzip-compressed .h5ad in the working directory.
adata.write_h5ad(
    './combined_st.h5ad',
    compression='gzip',
)

... storing 'barcode' as categorical
... storing 'group' as categorical
... storing 'sample' as categorical
