In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Import the top-level package rather than the deprecated `scanpy.api`
# namespace; the `sc` alias and the `sc.pp` / `sc.logging` calls used in this
# notebook are unchanged.
import scanpy as sc
# from anndata import read_loom
from anndata import read_h5ad
# Record the package versions this notebook was run with.
sc.logging.print_versions()

scanpy==1.4.3 anndata==0.6.20 umap==0.3.8 numpy==1.16.4 scipy==1.2.1 pandas==0.25.0 scikit-learn==0.21.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [3]:
# Data path
# Directory holding the input .h5ad file and receiving the filtered output.
# Set the TMS_DATA_PATH environment variable to run on another machine; when
# it is unset, the original location is used.
import os
data_path = os.environ.get('TMS_DATA_PATH', '/data3/martin/tms_gene_data')

In [5]:
# Raw data
# Build the input file location under `data_path`, then read it into `adata`.
raw_h5ad_file = f"{data_path}/maca-droplet-raw-data-annotated.h5ad"
adata = read_h5ad(raw_h5ad_file)

In [6]:
# Mean depth
# Sum the count matrix along axis 1 (one total per row of adata.X), then
# report the average of those totals.
per_cell_total = adata.X.sum(axis=1)
print('Mean sequencing depth', per_cell_total.mean())

Mean sequencing depth 6109.0303


## QC filtering

In [7]:
# Filter cells and genes
# QC thresholds are named once and interpolated into the log messages below,
# so the printed description always matches the filter that is applied.
min_cells_per_gene = 5
min_genes_per_cell = 500
min_counts_per_cell = 3000

print(adata)
print(f'\n# remove genes expressed in less than {min_cells_per_gene} cells')
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)
print(f'# remove cell expressing less than {min_genes_per_cell} genes')
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
print(adata)
print('\n# remove cells whose \n\t1. Tissue annotation is nan'
      + '\n\t2. Age annotation is nan'
      + f'\n\t3. n_counts is smaller than {min_counts_per_cell}')
# Per-cell total counts, computed after the gene filter above; `.A1` flattens
# the matrix returned by the row sum into a 1-D array.
adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
# Missing annotations are detected by comparing against the literal string
# 'nan'. NOTE(review): this assumes missing tissue/age values are stored as
# that string rather than as real NaN — confirm against the input file.
ind_select = (adata.obs['tissue']!='nan') & (adata.obs['age']!='nan') & \
            (adata.obs['n_counts']>=min_counts_per_cell)
adata = adata[ind_select,]
print(adata)

AnnData object with n_obs × n_vars = 418908 × 23433 
    obs: 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'

# remove genes expressed in less than 5 cells
# remove cell expressing less than 500 genes
AnnData object with n_obs × n_vars = 345454 × 19861 
    obs: 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'
    var: 'n_cells'

# remove cells whose 
	1. Tissue annotation is nan
	2. n_counts is smaller than 1000
View of AnnData object with n_obs × n_vars = 235325 × 19861 
    obs: 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'n_counts'
    var: 'n_cells'


## Write the data

In [8]:
# Save the filtered object next to the raw input under `data_path`.
filtered_h5ad_file = f'{data_path}/droplet_filtered.h5ad'
adata.write(filtered_h5ad_file)