This notebook goes through Allen datasets and pulls out specific cell types or subtypes.

In [166]:
import loompy as lp
import pandas as pd
import numpy as np
import scipy
from scipy import sparse
import matplotlib.pyplot as plt
import time
import os.path


In [29]:
allen_membership = pd.read_csv('~/count_data/allen_metadata/cluster.membership.csv',\
                               skiprows = 1, names=['barcode','cluster_id'])
allen_annot = pd.read_csv('~/count_data/allen_metadata/cluster.annotation.csv')


In [30]:
allen_membership

Unnamed: 0,barcode,cluster_id
0,AAACCCAAGCTTCATG-1L8TX_181211_01_G12,42
1,AAACCCAAGTGAGGTC-1L8TX_181211_01_G12,41
2,AAACCCACACCAGCCA-1L8TX_181211_01_G12,42
3,AAACCCAGTGAACGGT-1L8TX_181211_01_G12,41
4,AAACCCAGTGGCATCC-1L8TX_181211_01_G12,20
...,...,...
94165,TTTGTTGTCAGCATTG-12L8TX_190430_01_G08,40
94166,TTTGTTGTCATTGCGA-12L8TX_190430_01_G08,41
94167,TTTGTTGTCCCAACTC-12L8TX_190430_01_G08,43
94168,TTTGTTGTCTATGCCC-12L8TX_190430_01_G08,42


In [31]:
allen_annot

Unnamed: 0,cluster_id,cluster_label,subclass_label,class_label,cluster_color,size
0,1,Lamp5 Pax6,Lamp5,GABAergic,#DDACC9,81
1,2,Lamp5 Egln3_2_1,Lamp5,GABAergic,#DD8091,179
2,3,Lamp5 Pdlim5_1,Lamp5,GABAergic,#FF829E,247
3,4,Lamp5 Pdlim5_2,Lamp5,GABAergic,#FF7290,537
4,5,Lamp5 Slc35d3_1,Lamp5,GABAergic,#FFA388,1275
...,...,...,...,...,...,...
142,143,L6b Shisa6_low_1,Low Quality,Low Quality,#5A7A65,84
143,144,L6b Shisa6_low_2,Low Quality,Low Quality,#1F7C70,95
144,145,Oligo Opalin_neuon,doublet,Low Quality,#41775C,16
145,146,Endo Slc38a5_5,doublet,Low Quality,#4C6863,36


In [33]:
annot_bcs = list(allen_membership['barcode'])
annot_cid = np.asarray(allen_membership['cluster_id'])
annot_exp = np.asarray([x[-3:] for x in annot_bcs])
annot_bcs = np.asarray([x[:16] for x in annot_bcs])
annot_n = len(annot_bcs)

In [120]:
celltype_names = ['GABAergic','Glutamatergic','L5 IT','L6 IT','L6 CT']
celltype_abbr = ['gaba','glu','l5it','l6it','l6ct']
n_celltypes = len(celltype_names)

In [124]:
celltype_clusters = [np.where(allen_annot['class_label'] == x)[0] for x in celltype_names[:2]] + \
                    [np.where(allen_annot['subclass_label'] == x)[0] for x in celltype_names[2:]]

In [148]:
celltype_clusters

[array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]),
 array([39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
        56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]),
 array([40, 41, 42, 43]),
 array([44, 45, 46, 47]),
 array([58, 59, 60, 61, 62, 63])]

In [155]:
annot_cid_filter = np.zeros((n_celltypes,len(annot_bcs)),dtype=bool)
for cl in range(n_celltypes):
    annot_cid_filter[cl,:] = np.isin(annot_cid,celltype_clusters[cl])

In [162]:
datasets = ['A08','B01','B08','C01']
spliced_layer = 'spliced'
unspliced_layer = 'unspliced'
gene_attr = 'gene_name'
cell_attr = 'barcode'

outdir = '/home/ggorin/count_data/loom_allen_celltype_kb/test/'
for d in datasets:
    filename = '/home/ggorin/count_data/allen_{}/counts_filtered/adata.loom'.format(d)
    with lp.connect(filename) as ds:
        S = ds.layers[spliced_layer][:]
        U = ds.layers[unspliced_layer][:]
        gene_names = ds.ra[gene_attr]
        bcs = ds.ca[cell_attr]
        for celltype in range(n_celltypes):
            S_ = np.copy(S)
            U_ = np.copy(U)
            celltype_exp_bcs = annot_bcs[annot_cid_filter[celltype]]
            cf = np.asarray([True if (x in celltype_exp_bcs) else False for x in bcs])
            S_ = S[:,cf]
            U_ = U[:,cf]
            bc_filt = bcs[cf]
            print('allen_{}_{}: {:.0f} barcodes.'.format(d,celltype_abbr[celltype],cf.sum()))
            
            lp.create(outdir+'/allen_{}_{}.loom'.format(d,celltype_abbr[celltype]),\
                      layers={'':scipy.sparse.csr_matrix(S_.shape),
                              spliced_layer:S_,
                              unspliced_layer:U_},\
                              row_attrs={gene_attr:gene_names},\
                              col_attrs={cell_attr:bc_filt})

allen_A08_gaba: 741 barcodes.
allen_A08_glu: 5093 barcodes.
allen_A08_l5it: 3140 barcodes.
allen_A08_l6it: 199 barcodes.
allen_A08_l6ct: 1305 barcodes.
allen_B01_gaba: 796 barcodes.
allen_B01_glu: 6555 barcodes.
allen_B01_l5it: 4293 barcodes.
allen_B01_l6it: 229 barcodes.
allen_B01_l6ct: 1367 barcodes.
allen_B08_gaba: 871 barcodes.
allen_B08_glu: 5466 barcodes.
allen_B08_l5it: 3243 barcodes.
allen_B08_l6it: 266 barcodes.
allen_B08_l6ct: 1398 barcodes.
allen_C01_gaba: 778 barcodes.
allen_C01_glu: 6771 barcodes.
allen_C01_l5it: 4393 barcodes.
allen_C01_l6it: 254 barcodes.
allen_C01_l6ct: 1375 barcodes.


In [170]:
datasets = ['A01','D01','E01','F01','G12','H12','F08','G08']


outdir = '/home/ggorin/count_data/loom_allen_celltype_kb/test/'
for d in datasets:
    filename = '/home/ggorin/count_data/allen_{}/counts_filtered/adata.loom'.format(d)
    print(filename)
    while (not os.path.isfile(filename) ) or (os.path.isdir('/home/ggorin/count_data/allen_{}/tmp/'.format(d))):
        print('DNE...' ,end='\t')
        time.sleep(900)

    
    with lp.connect(filename) as ds:
        S = ds.layers[spliced_layer][:]
        U = ds.layers[unspliced_layer][:]
        gene_names = ds.ra[gene_attr]
        bcs = ds.ca[cell_attr]
        for celltype in range(n_celltypes):
            S_ = np.copy(S)
            U_ = np.copy(U)
            celltype_exp_bcs = annot_bcs[annot_cid_filter[celltype]]
            cf = np.asarray([True if (x in celltype_exp_bcs) else False for x in bcs])
            S_ = S[:,cf]
            U_ = U[:,cf]
            bc_filt = bcs[cf]
            print('allen_{}_{}: {:.0f} barcodes.'.format(d,celltype_abbr[celltype],cf.sum()))
            
            lp.create(outdir+'/allen_{}_{}.loom'.format(d,celltype_abbr[celltype]),\
                      layers={'':scipy.sparse.csr_matrix(S_.shape),
                              spliced_layer:S_,
                              unspliced_layer:U_},\
                              row_attrs={gene_attr:gene_names},\
                              col_attrs={cell_attr:bc_filt})

/home/ggorin/count_data/allen_A01/counts_filtered/adata.loom
DNE...	DNE...	DNE...	DNE...	

KeyboardInterrupt: 

In [172]:
os.access('/home/ggorin/count_data/allen_{}/counts_filtered/adata.loom'.format(d), os.R_OK)

True

In [176]:
os.path.isdir('/home/ggorin/count_data/allen_{}/tmp/'.format(d))

True