In [1]:
import loompy # needed for importing data for this tutorial
import numpy as np # needed for formatting data for this tutorial
import pandas as pd # needed for formatting data for this tutorial
import os

import cellex

In [2]:
import git

# Find the git checkout that contains the current working directory (walking up
# through parent directories) so all data paths can be built from its root.
repo = git.Repo('.', search_parent_directories=True)
path_repo_root = repo.working_tree_dir

In [3]:
### Constants
# Input (expression data) and output (CELLEX results) directories, both located
# under the repository root.
input_dir = os.path.join(path_repo_root, "tmp-data/expression")
output_dir = os.path.join(path_repo_root, "tmp-data/cellex")

# Fail fast: output_dir must already exist and be a directory. Checking this up
# front avoids discovering a bad path only when results are written, after the
# long-running computation has finished.
assert os.path.isdir(output_dir), "output_dir must exist and be a directory: {}".format(output_dir)

In [4]:
### Dataset location and loom attribute names
# Loom file holding the expression matrix and its annotations.
pathData = os.path.join(input_dir, "mousebrain-l5_all.loom")

# Column attributes of the loom file that are read into the metadata tables.
nameId = "CellID"         # cell identifier (duplicates are dropped after loading)
nameAnno = "ClusterName"  # cell-type annotation of each cell
nameClass = "Class"       # class annotation of each cell

In [5]:
# Read the expression matrix and the annotations while the loom file is open;
# everything after the `with` block works purely on the in-memory frames.
with loompy.connect(pathData) as ds:
    rows = ds.row_attrs["Accession"]
    cols = ds.col_attrs[nameId]
    cell_types = ds.col_attrs[nameAnno]
    # genes (rows) x cells (columns)
    data = pd.DataFrame(ds[:, :], index=rows, columns=cols)
    # one row per cell: cell id -> cell type
    metadata = pd.DataFrame(data={"cell_type": cell_types}, index=cols)
    # one row per cell, indexed by cell type: cell type -> class
    metadata_class = pd.DataFrame(data={"cell_class": ds.col_attrs[nameClass]}, index=cell_types)

n_cells_total = data.shape[1]

# The dataset contains repeated CellID's: keep only the first occurrence of each
# cell in the expression data and in the per-cell annotation.
is_first_cell = ~data.columns.duplicated()
data = data.loc[:, is_first_cell]
metadata = metadata.loc[~metadata.index.duplicated(keep='first')]

# Collapse the class table to a single row per cell type.
is_first_type = ~metadata_class.index.duplicated(keep='first')
metadata_class = metadata_class.loc[is_first_type]

n_types = np.unique(metadata["cell_type"].values).size
n_cells_remaining = data.shape[1]

print("Removed duplicates from data and metadata. {} / {} cells remaining.".format(n_cells_remaining, n_cells_total))
print("Metadata contains {} unique cell-type annotations.".format(n_types))

Removed duplicates from data and metadata. 160678 / 160796 cells remaining.
Metadata contains 265 unique cell-type annotations.


In [6]:
# Preview the expression matrix: rows are indexed by the loom "Accession" row
# attribute, columns by cell ID.
data.head()

Unnamed: 0,10X82_2_TCTCTCACCAGTTA-,10X82_2_TATTATCTACCAGA-,10X82_2_TATCCCAGATGGCA-,10X82_2_ATTACGTATGAATG-,10X82_2_ATACGTCAATAAGG-,10X82_2_TACAGTCTTCGGTC-,10X81_2_CGTAACATTCGACA-,10X81_3_TGATGAGATACACA-,10X82_2_GCCAGGTAGGACAC-,10X81_2_AGATCAGTCCGTAT-,...,10X53_7_TGTCTGAGAGGC-,10X43_2_CAGTTGCTTGGA-,10X43_2_AGTCGATCGTGA-,10X43_2_GAAGCTTCGTAG-,10X43_2_CCATTGGGCAAG-,10X43_2_TACAACAGTCGT-,10X43_2_ATGATGGGTTAC-,10X43_2_GCAGCTTAGAGA-,10X53_7_GCGATGGGAGGT-,10X43_2_TTAATGGGGCAA-
ENSMUSG00000024647,21.0,28.0,28.0,48.0,31.0,29.0,66.0,13.0,22.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000041544,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000029503,14.0,11.0,7.0,13.0,11.0,7.0,16.0,7.0,14.0,9.0,...,0.0,3.0,2.0,6.0,8.0,3.0,1.0,2.0,2.0,2.0
ENSMUSG00000039942,2.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
ENSMUSG00000059187,8.0,7.0,9.0,20.0,4.0,11.0,20.0,4.0,16.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Preview the per-cell annotation: the index is the cell ID and "cell_type"
# holds the value of the loom "ClusterName" column attribute.
metadata.head()

Unnamed: 0,cell_type
10X82_2_TCTCTCACCAGTTA-,ENT9
10X82_2_TATTATCTACCAGA-,ENT9
10X82_2_TATCCCAGATGGCA-,ENT9
10X82_2_ATTACGTATGAATG-,ENT9
10X82_2_ATACGTCAATAAGG-,ENT9


In [8]:
# Build the CELLEX object from the expression matrix and the per-cell cell-type
# annotation.
# NOTE: in the recorded run this step logged removal of non-expressed genes,
# normalization and an ANOVA-based gene filter, and took roughly 13.5 minutes.
eso = cellex.ESObject(data=data, annotation=metadata, verbose=True)

Preprocessing - running remove_non_expressed ... excluded 4612 / 27998 genes in 0 min 31 sec
Preprocessing - normalizing data ... data normalized in 7 min 23 sec
Preprocessing - running ANOVA ... excluded 3055 / 23386 genes in 5 min 37 sec


In [9]:
# Run the CELLEX computation on the preprocessed data.
# NOTE: the recorded run reported results for DET, EP, GES and NSI (each with
# esw, esw_null, pvals and esw_s) plus the 'esmu' and 'essd' tables, and took
# about 11 minutes in total.
eso.compute(verbose=True)

Computing DET ... 
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 1 min 54 sec
Computing EP ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 0 min 2 sec
Computing GES ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 3 min 22 sec
Computing NSI ...
    esw ...
    empirical p-values ...
    esw_s ...
    finished in 5 min 48 sec
Computing ESmu ...
    finished in 0 min 0 sec
Computing ESsd ...
    finished in 0 min 0 sec
Computed ['det.esw', 'det.esw_null', 'det.pvals', 'det.esw_s', 'ep.esw', 'ep.esw_null', 'ep.pvals', 'ep.esw_s', 'ges.esw', 'ges.esw_null', 'ges.pvals', 'ges.esw_s', 'nsi.esw', 'nsi.esw_null', 'nsi.pvals', 'nsi.esw_s', 'esmu', 'essd'].


In [10]:
# Write the results to output_dir as gzipped CSV files, using the
# "mousebrain_cellex.mouse" file prefix.
# NOTE: with keys=None the recorded run saved only the 'esmu' and 'essd' tables.
eso.save_as_csv(file_prefix="mousebrain_cellex.mouse", path=output_dir, keys=None, verbose=True)

Saving results as csv to disk ...
  Saved: /nfsdata/projects/timshel/sc-genetics/timshel-bmicelltypes2019/tmp-data/cellex/mousebrain_cellex.mouse.esmu.csv.gz
  Saved: /nfsdata/projects/timshel/sc-genetics/timshel-bmicelltypes2019/tmp-data/cellex/mousebrain_cellex.mouse.essd.csv.gz
Finished saving results to /nfsdata/projects/timshel/sc-genetics/timshel-bmicelltypes2019/tmp-data/cellex


In [11]:
# Map the gene IDs of the 'esmu' table from mouse to human Ensembl gene IDs;
# drop_unmapped=True presumably discards genes without a mapping (going by the
# parameter name).
# NOTE(review): the return value is not captured, so this cell presumably relies
# on eso.results["esmu"] being modified in place — confirm against the CELLEX
# documentation.
# FIXME: the recorded run failed here with FileNotFoundError for the ortholog
# map 'CELLEX/cellex/utils/mapping/maps/hsapiens_mmusculus_unique_orthologs.GRCh37.ens_v91.txt.gz'.
# The reported path is relative, so the lookup presumably depends on the
# working directory or on how CELLEX was installed — verify before re-running.
cellex.utils.mapping.ens_mouse_to_ens_human(eso.results["esmu"], drop_unmapped=True, verbose=True)

Mapping: mouse ensembl gene id's --> human ensembl gene id's ...


FileNotFoundError: [Errno 2] No such file or directory: 'CELLEX/cellex/utils/mapping/maps/hsapiens_mmusculus_unique_orthologs.GRCh37.ens_v91.txt.gz'

In [None]:
# Write the results again under the "mousebrain_cellex.human" file prefix,
# intended to hold the tables after the mouse-to-human gene ID mapping.
# NOTE(review): this cell has no execution count, i.e. it was not run in the
# recorded session. It is only meaningful once the mapping cell before it
# succeeds; otherwise the files would presumably still contain mouse gene IDs.
eso.save_as_csv(file_prefix="mousebrain_cellex.human", path=output_dir, keys=None, verbose=True)