# CELLEX: expression specificity (ESmu) for the PsychENCODE DER-22 single-cell data

In [1]:
import pandas as pd
import cellex

In [2]:
# Location of the PsychENCODE raw UMI count table, relative to the working directory.
data_dir = 'data/CELLEX'
file = f'{data_dir}/PsychENCODE/DER-22_Single_cell_expression_raw_UMI.tsv'

In [3]:
%%time
data = pd.read_csv(file, 
                   index_col=0,
                   sep='\t')

CPU times: user 3min 3s, sys: 7.14 s, total: 3min 11s
Wall time: 3min 9s


In [4]:
# Preview the loaded count matrix: rows are gene symbols, columns are cells.
# Column headers are cluster labels; repeated labels carry a ".N" suffix
# (e.g. Astro, Astro.1, Astro.2).
display(data)

Unnamed: 0,Ex3e,Ex2,In1b,Oligo,Ex1,Astro,Ex8,Astro.1,Astro.2,In4b,...,Microglia.307,Microglia.308,Microglia.309,Microglia.310,Microglia.311,Microglia.312,Microglia.313,Microglia.314,Microglia.315,Microglia.316
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ZYX,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZZEF1,0,0,0,1,0,1,2,0,0,0,...,0,0,0,0,0,0,0,1,0,2


In [5]:
# Build the per-cell annotation table: one row per column of `data`, indexed by
# the cell identifier, with a single `cluster` column.
#
# The column headers are cluster labels that pandas made unique on load by
# appending ".1", ".2", ... to repeats. Strip only that trailing numeric
# suffix, so a cluster label that itself contains a dot is not truncated
# (splitting on the first "." would cut it short).
cell_ids = data.columns
metadata = (
    pd.DataFrame({
        'cell': cell_ids,
        'cluster': cell_ids.str.replace(r'\.\d+$', '', regex=True),
    })
    .set_index('cell')
)
display(metadata)

Unnamed: 0_level_0,cluster
cell,Unnamed: 1_level_1
Ex3e,Ex3e
Ex2,Ex2
In1b,In1b
Oligo,Oligo
Ex1,Ex1
...,...
Microglia.312,Microglia
Microglia.313,Microglia
Microglia.314,Microglia
Microglia.315,Microglia


In [6]:
# List the distinct cluster labels. Cells labelled 'NA' should be excluded
# from the analysis.
metadata['cluster'].unique()

array(['Ex3e', 'Ex2', 'In1b', 'Oligo', 'Ex1', 'Astro', 'Ex8', 'In4b',
       'Ex6b', 'OPC', 'Endo', 'In6b', 'In8', 'Ex5b', 'Microglia', 'In1c',
       'In3', 'Ex4', 'Ex9', 'NA', 'Ex6a', 'In6a', 'In4a', 'In7', 'In1a',
       'Per'], dtype=object)

In [7]:
# Collect the identifiers of every cell whose cluster label is 'NA'.
is_na_cluster = metadata['cluster'].eq('NA')
nan_cells = metadata.loc[is_na_cluster].index
print(nan_cells.shape)  # number of NA cells

(32,)


In [8]:
# Drop the NA-cluster cells (columns) from the count matrix, leaving `data` intact.
data_dropped = data.drop(columns=nan_cells)
print(data.shape, data_dropped.shape)  # shape before and after the drop

(17176, 27412) (17176, 27380)


In [9]:
# Keep only the annotation rows whose cluster label is not 'NA'.
has_cluster = metadata['cluster'].ne('NA')
metadata_dropped = metadata.loc[has_cluster]
print(metadata.shape, metadata_dropped.shape)  # shape before and after the drop

(27412, 1) (27380, 1)


In [10]:
%%time 
# Build the CELLEX ESObject from the NA-filtered count matrix and the matching
# per-cell cluster annotation. With verbose=True the constructor logs its
# preprocessing steps (input check, remove_non_expressed, normalization, ANOVA
# gene filter). The cell magic above must remain the first line of the cell.
eso = cellex.ESObject(data=data_dropped, 
                      annotation=metadata_dropped,
                      verbose=True)

Preprocessing - checking input ... input parsed in 0 min 0 sec
Preprocessing - running remove_non_expressed ... excluded 0 / 17176 genes in 0 min 12 sec
Preprocessing - normalizing data ... data normalized in 3 min 15 sec
Preprocessing - running ANOVA ... excluded 2586 / 17176 genes in 0 min 17 sec
CPU times: user 25.5 s, sys: 3min 20s, total: 3min 46s
Wall time: 3min 46s


In [11]:
%%time
# Run the expression-specificity computation. The log reports the DET, EP, GES
# and NSI metrics followed by the ESmu and ESsd summaries; the results are then
# available under eso.results (e.g. eso.results['esmu']).
eso.compute(verbose=True)

Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 1 min 33 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 0 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 3 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 1 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'nsi.esw_s', 'esmu', 'essd'].
CPU times: user 11.6 s, sys: 1min 26s, total: 1min 37s
Wall time: 1min 38s


In [12]:
# Inspect the ESmu table (genes as rows, clusters as columns); at this point
# the index still holds gene symbols.
eso.results['esmu']

Unnamed: 0_level_0,Astro,Endo,Ex1,Ex2,Ex3e,Ex4,Ex5b,Ex6a,Ex6b,Ex8,...,In4a,In4b,In6a,In6b,In7,In8,Microglia,OPC,Oligo,Per
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG-AS1,0.000000,0.000000,0.609659,0.561729,0.000000,0.177479,0.431299,0.000000,0.260826,0.494279,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1CF,0.000000,0.000000,0.291992,0.661875,0.000000,0.000000,0.509084,0.000000,0.801410,0.000000,...,0.866776,0.000000,0.000000,0.000000,0.000000,0.000000,0.149420,0.000000,0.000000,0.000000
A2M,0.000000,0.917038,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.793155,0.000000,0.000000,0.850124
A2ML1,0.556876,0.000000,0.148710,0.136587,0.000000,0.000000,0.010709,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.063774,0.671036,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A2ML1-AS1,0.832807,0.078205,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033866,0.000000,...,0.000000,0.000000,0.000000,0.178115,0.000000,0.000000,0.000000,0.104012,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.359144,0.000000,0.000000,0.192741,0.000000,0.000000,0.182129,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.119960,0.000000
ZYG11B,0.000000,0.000000,0.099794,0.091860,0.257444,0.106301,0.253256,0.000000,0.000000,0.000000,...,0.000000,0.190929,0.050073,0.214501,0.000000,0.076052,0.000000,0.000000,0.000000,0.000000
ZYX,0.000000,0.000000,0.283904,0.380187,0.000000,0.000000,0.798553,0.487914,0.743990,0.263336,...,0.000000,0.000000,0.000000,0.000000,0.144191,0.000000,0.000000,0.000000,0.000000,0.000000
ZZEF1,0.000000,0.000000,0.022213,0.307151,0.000000,0.000000,0.149563,0.000000,0.000000,0.052446,...,0.000000,0.050181,0.000000,0.071881,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
# Convert the ESmu index from human gene symbols to Ensembl gene IDs, dropping
# genes that cannot be mapped.
#
# The mapping modifies eso.results["esmu"] in place, so running this cell a
# second time would hand it Ensembl IDs instead of symbols (presumably leaving
# every gene unmapped and dropped). Skip the call when the index has already
# been converted, which keeps the cell safe to re-run.
esmu = eso.results["esmu"]
if esmu.index.str.startswith("ENSG").all():
    print("Index already holds Ensembl gene IDs; skipping mapping.")
else:
    cellex.utils.mapping.human_symbol_to_human_ens(esmu, drop_unmapped=True, verbose=True)

Mapping: human gene symbols --> human ensembl gene id's ...
0.08 pct of genes are unmapped ...
Removed 11 unmapped genes ...


In [14]:
# Re-inspect the ESmu table: the index should now hold Ensembl gene IDs
# because the symbol mapping was applied to this table in place.
eso.results['esmu']

Unnamed: 0_level_0,Astro,Endo,Ex1,Ex2,Ex3e,Ex4,Ex5b,Ex6a,Ex6b,Ex8,...,In4a,In4b,In6a,In6b,In7,In8,Microglia,OPC,Oligo,Per
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000268895,0.000000,0.000000,0.609659,0.561729,0.000000,0.177479,0.431299,0.000000,0.260826,0.494279,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000148584,0.000000,0.000000,0.291992,0.661875,0.000000,0.000000,0.509084,0.000000,0.801410,0.000000,...,0.866776,0.000000,0.000000,0.000000,0.000000,0.000000,0.149420,0.000000,0.000000,0.000000
ENSG00000175899,0.000000,0.917038,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.793155,0.000000,0.000000,0.850124
ENSG00000166535,0.556876,0.000000,0.148710,0.136587,0.000000,0.000000,0.010709,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.063774,0.671036,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000256661,0.832807,0.078205,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033866,0.000000,...,0.000000,0.000000,0.000000,0.178115,0.000000,0.000000,0.000000,0.104012,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000070476,0.359144,0.000000,0.000000,0.192741,0.000000,0.000000,0.182129,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.119960,0.000000
ENSG00000162378,0.000000,0.000000,0.099794,0.091860,0.257444,0.106301,0.253256,0.000000,0.000000,0.000000,...,0.000000,0.190929,0.050073,0.214501,0.000000,0.076052,0.000000,0.000000,0.000000,0.000000
ENSG00000159840,0.000000,0.000000,0.283904,0.380187,0.000000,0.000000,0.798553,0.487914,0.743990,0.263336,...,0.000000,0.000000,0.000000,0.000000,0.144191,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000074755,0.000000,0.000000,0.022213,0.307151,0.000000,0.000000,0.149563,0.000000,0.000000,0.052446,...,0.000000,0.050181,0.000000,0.071881,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Write the Ensembl-indexed ESmu table to a CSV file in the working directory.
esmu_path = "PsychENCODE_DER-22.esmu.csv.gz"
eso.results["esmu"].to_csv(esmu_path)