In [1]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix

The ALM dataset can be downloaded from
https://celltypes.brain-map.org/api/v2/well_known_file_download/694413179 

Download it and extract it into the `ALM` subfolder; all data files used below are expected to be found there.

In [2]:
# Load the exon count matrix. The CSV is transposed on load, so the rows of
# expr_df are the file's column headers and the columns of expr_df are the
# file's first-column values (matched against 'gene_entrez_id' below).
filename = './ALM/mouse_ALM_2018-06-14_exon-matrix.csv'
expr_df = pd.read_csv(filename, header=0, index_col=0, delimiter=',').transpose()
expr = expr_df.values

# Find gene names
filename = './ALM/mouse_ALM_2018-06-14_genes-rows.csv'
genes_df = pd.read_csv(filename, header=0, index_col=0, delimiter=',')
gene_symbol = genes_df.index.values
gene_ids = genes_df['gene_entrez_id'].values

# Build the id -> symbol lookup once instead of scanning gene_ids for every
# matrix column (which is quadratic in the number of genes). setdefault keeps
# the FIRST symbol seen for an id, matching the previous
# np.where(...)[0][0] first-match behaviour when ids are duplicated.
# An id that is absent from the genes table now raises KeyError
# (previously IndexError).
symbol_by_entrez_id = {}
for entrez_id, symbol in zip(gene_ids, gene_symbol):
    symbol_by_entrez_id.setdefault(entrez_id, symbol)
gene_names = np.array([symbol_by_entrez_id[gene_id] for gene_id in expr_df.columns])

# Load the sample metadata, keep only the relevant fields, and index the
# table by sample id (the 'sample_name' column renamed to 'sample_id').
filename = './ALM/mouse_ALM_2018-06-14_samples-columns.csv'
obs = (
    pd.read_csv(filename, header=0, index_col=0, delimiter=',', encoding='iso-8859-1')
    .reset_index()
    [['sample_name','seq_name','class','subclass','cluster']]
    .rename(columns={'sample_name':'sample_id'})
    .set_index('sample_id')
)
obs.head()

Unnamed: 0_level_0,seq_name,class,subclass,cluster
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F1S4_161216_001_A01,SM-D9CZQ_S96_E1-50,Glutamatergic,L5 PT,L5 PT ALM Slco2a1
F1S4_180124_314_A01,SM-GE8ZM_S081_E1-50,Glutamatergic,L5 IT,L5 IT ALM Npw
F1S4_180124_315_A01,SM-GE8ZM_S089_E1-50,GABAergic,Lamp5,Lamp5 Fam19a1 Pax6
F1S4_180124_315_B01,SM-GE8ZM_S090_E1-50,GABAergic,Sncg,Sncg Slc17a8
F1S4_180124_315_C01,SM-GE8ZM_S091_E1-50,GABAergic,Sncg,Sncg Slc17a8


In [3]:
# Assemble the AnnData container: sparse expression matrix as X, gene names
# as the var index (named 'genes'), and the sample metadata table as obs.
# NOTE(review): obs is attached by position, so this assumes the rows of
# `obs` are in the same order as the rows of `expr` - confirm.
counts_sparse = csr_matrix(expr)
adata = ad.AnnData(X=counts_sparse)

adata.var_names = gene_names
adata.var.index.name = 'genes'

adata.obs = obs
# Expose the 'subclass' annotation under the additional name 'cell_type'.
adata.obs['cell_type'] = adata.obs['subclass']

# Optional: uncomment to store the unprocessed object for efficient read/write.
# adata.write('./ALM/ALM.h5ad')

In [4]:
# Library-size normalisation. With inplace=False the result is returned as a
# dict and adata.X itself is left untouched; the normalised matrix (key 'X')
# is stored as the 'log1pcpm' layer.
# NOTE(review): target_sum=1e5 scales each cell to 100k total counts, while
# the layer name says "cpm" (per million) - confirm which one is intended.
normalized = sc.pp.normalize_total(adata, target_sum=1e5, inplace=False)
adata.layers['log1pcpm'] = normalized['X']

# Apply log1p to layers['log1pcpm'] in place.
sc.pp.log1p(adata, layer='log1pcpm')



In [5]:
# Flag highly variable genes using the log-normalised layer. With
# inplace=True the call annotates adata in place, introducing the
# "highly_variable" column in adata.var.
hvg_options = dict(
    layer='log1pcpm',
    flavor='cell_ranger',
    n_top_genes=10000,
    inplace=True,
)
sc.pp.highly_variable_genes(adata, **hvg_options)

  disp_grouped = df.groupby('mean_bin')['dispersions']


In [6]:
# Build a fresh AnnData object for export. No genes or cells are dropped here:
# X, obs, layers and uns are carried over from adata, and var is reduced to
# the single 'highly_variable' column so downstream code can subset on it.
hvg_flags = adata.var[['highly_variable']]
adata_hvg = ad.AnnData(
    X=adata.X,
    obs=adata.obs,
    var=hvg_flags,
    layers=adata.layers,
    uns=adata.uns,
)

# Write the result next to the raw downloads.
adata_hvg.write('./ALM/ALM_filtered_cells.h5ad')