In [1]:
import os
import random
import re
import subprocess
from collections import Counter
from pathlib import Path

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse

import mgitools.os_helpers as os_helpers

In [2]:
# !pip install git+https://github.com/estorrs/mgitools


In [3]:
# Name of the label column handed to every workflow as the cell-type key.
CELL_TYPE_KEY = 'cell_type'
N_PER_CELL_TYPE = 500

# Input data location and scratch directory.
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/11302020_harmonized/teir_1/'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

# One results directory per benchmark flavour.
RESULTS_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1'
RESULTS_CROSS_DISEASE_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1_cross_disease'
RESULTS_CROSS_DTYPE_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1_cross_datatype'

# Create the results directories up front; existing directories are left as they are.
for results_dir in (RESULTS_DIR, RESULTS_CROSS_DISEASE_DIR, RESULTS_CROSS_DTYPE_DIR):
    Path(results_dir).mkdir(parents=True, exist_ok=True)

Read in the train/val data

In [4]:
# Build adata_map[dtype][disease][split] = path for the .h5ad files found under DATA_DIR,
# where dtype is the name of the file's parent directory and split is 'train' or 'val'.
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]: {} for fp in fps}
for fp in fps:
    *_, dtype, filename = fp.split('/')
    # Strip the trailing _train/_val suffix and the extension to obtain the disease name.
    disease = re.sub(r'^(.*)((_train)|(_val)).h5ad$', r'\1', filename)
    # A name still containing '.h5ad' did not match the pattern, so no entry is created for it.
    if '.h5ad' not in disease and disease not in adata_map[dtype]:
        adata_map[dtype][disease] = {}
    for split in ('train', 'val'):
        if f'{split}.h5ad' in fp:
            adata_map[dtype][disease][split] = fp
# List every (data type, disease) combination that was found.
for dtype, d in adata_map.items():
    for disease in d:
        print(dtype, disease)

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq pbmc
scRNAseq pdac
snATACseq brca_gene_activity
snATACseq brca_motif
snATACseq brca_peaks
snATACseq ccrcc_gene_activity
snATACseq ccrcc_motif
snATACseq ccrcc_peaks
snATACseq gbm_gene_activity
snATACseq gbm_motif
snATACseq gbm_peaks
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


In [5]:
def run_workflow_for_datasets(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` once per (data type, disease) entry of `adata_map`.

    For each entry the train and val .h5ad files are loaded, the directory
    <output_dir>/<dtype>/<disease> is created if needed, and the pair is handed
    to `run_workflow`, which writes the predictions into that directory.
    """
    for dtype, diseases in adata_map.items():
        for disease, paths in diseases.items():
            # results for this dataset live in their own directory
            dataset_dir = os.path.join(output_dir, dtype, disease)
            Path(dataset_dir).mkdir(parents=True, exist_ok=True)

            train = sc.read_h5ad(paths['train'])
            val = sc.read_h5ad(paths['val'])

            print(dtype, disease, train.shape, val.shape)
            run_workflow(workflow, workflow_identifier, train, val, dataset_dir)
            
def run_workflow_for_cross_disease(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` for every (train disease, val disease) pair within each data type.

    Same-disease pairs are included. Results for a pair go to
    <output_dir>/<dtype>/<disease1>_train_<disease2>_val via `run_workflow`.
    """
    for dtype, diseases in adata_map.items():
        for train_disease, train_paths in diseases.items():
            for val_disease, val_paths in diseases.items():
                pair_name = f'{train_disease}_train_{val_disease}_val'
                # results for this pair live in their own directory
                pair_dir = os.path.join(output_dir, dtype, pair_name)
                Path(pair_dir).mkdir(parents=True, exist_ok=True)

                train = sc.read_h5ad(train_paths['train'])
                val = sc.read_h5ad(val_paths['val'])

                print(dtype, pair_name, train.shape, val.shape)
                run_workflow(workflow, workflow_identifier, train, val, pair_dir)
                    
                    
def run_workflow_for_cross_datatype(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` for every combination of (train data type, train disease)
    and (val data type, val disease), including same-type and same-disease pairs.

    Results for a combination go to
    <output_dir>/<dtype1>_<dtype2>/<dtype1>_<disease1>_train_<dtype2>_<disease2>_val
    via `run_workflow`.
    """
    for train_dtype, train_diseases in adata_map.items():
        for val_dtype, val_diseases in adata_map.items():
            dtype_pair = f'{train_dtype}_{val_dtype}'
            for train_disease, train_paths in train_diseases.items():
                for val_disease, val_paths in val_diseases.items():
                    run_name = f'{train_dtype}_{train_disease}_train_{val_dtype}_{val_disease}_val'
                    # results for this combination live in their own directory
                    run_dir = os.path.join(output_dir, dtype_pair, run_name)
                    Path(run_dir).mkdir(parents=True, exist_ok=True)

                    train = sc.read_h5ad(train_paths['train'])
                    val = sc.read_h5ad(val_paths['val'])

                    print(dtype_pair, run_name, train.shape, val.shape)
                    run_workflow(workflow, workflow_identifier, train, val, run_dir)

def run_workflow(workflow, workflow_identifier, train, val, output_dir):
    """
    Execute a single workflow and write its predictions to disk.

    `workflow` is called with the train adata, the val adata and CELL_TYPE_KEY,
    and must return a dataframe with cell_id, groundtruth, predicted, and
    probability columns. When `workflow_identifier` is 'pollock' a fourth
    argument is passed: the path where the module should be saved.

    The dataframe is written to <output_dir>/<workflow_identifier>.tsv.
    Any exception is caught and reported on stdout so that one failing
    dataset does not abort a whole benchmarking run.
    """
    tsv_path = os.path.join(output_dir, f'{workflow_identifier}.tsv')
    try:
        workflow_args = [train, val, CELL_TYPE_KEY]
        if workflow_identifier == 'pollock':
            # pollock additionally needs to know where to save its module
            workflow_args.append(os.path.join(output_dir, f'{workflow_identifier}_module'))

        predictions = workflow(*workflow_args)
        predictions.to_csv(tsv_path, sep='\t', index=False, header=True)
    except Exception as err:
        print('failed ' + tsv_path)
        print(err)

##### MARS

In [6]:
def mars_preprocess(adata):
    """
    Preprocess `adata` with scanpy: per-cell normalisation to 1e4 counts,
    scaling (zero-centred, values clipped at 10), then a 50-component PCA.

    The scanpy calls are made without `copy=True` and nothing is returned.
    """
    target_counts = 1e4
    clip_value = 10
    n_components = 50

    sc.pp.normalize_per_cell(adata, counts_per_cell_after=target_counts)
    sc.pp.scale(adata, max_value=clip_value, zero_center=True)
    sc.pp.pca(adata, n_comps=n_components)
    
    
def run_mars_workflow(train, val, cell_type_key):
    """
    Transfer cell-type labels from `train` onto `val` and tabulate the result.

    Both datasets are restricted to the features they share, then
    `sc.tl.ingest` maps `train.obs[cell_type_key]` onto `val`.

    Parameters
    ----------
    train, val
        AnnData objects with labels in `.obs[cell_type_key]`. The work is
        done on copies, so the caller's objects are left untouched.
    cell_type_key
        Name of the `.obs` column holding the cell-type labels.

    Returns
    -------
    pandas.DataFrame
        Columns `cell_id`, `groundtruth`, `predicted` and `probability`
        (`probability` is NaN for every row).

    NOTE(review): `sc.tl.ingest` expects neighbors to have been computed on
    the reference (`train`); this function does not compute them itself --
    confirm that callers preprocess the data first.
    NOTE(review): despite its name this function uses scanpy's ingest, not
    the MARS model.
    """
    # Keep only the shared features. Copy explicitly so that the `.obs`
    # writes and ingest below act on real objects rather than views.
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names].copy()
    val = val[:, var_names].copy()

    train.obs['dataset'] = 'train'
    val.obs['dataset'] = 'val'

    # ingest writes the transferred labels into val.obs[cell_type_key],
    # so the true labels have to be saved before it runs.
    groundtruth = val.obs[cell_type_key].to_list()

    sc.tl.ingest(val, train, obs=cell_type_key)

    df = pd.DataFrame.from_dict({
        'cell_id': val.obs.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': val.obs[cell_type_key].to_list(),
        'probability': [np.nan] * val.shape[0]
    })

    return df

###### Testing stuff (scratch cells run on the scRNAseq PBMC dataset)

In [7]:
from args_parser import get_parser
from model.mars import MARS
from model.experiment_dataset import ExperimentDataset


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [8]:
# Build the MARS parameter namespace from its parser. parse_known_args is used so that
# unrecognised command-line entries end up in `unknown` instead of raising
# (assumes get_parser() returns an argparse.ArgumentParser -- TODO confirm).
params, unknown = get_parser().parse_known_args()

In [9]:
anndata.__version__

'0.7.5'

In [10]:
# !pip install anndata==0.7.5
# !pip install anndata==0.6.22.post1

In [11]:
# Load the scRNAseq PBMC train/val pair used by the scratch cells below.
pbmc_paths = adata_map['scRNAseq']['pbmc']
train = sc.read_h5ad(pbmc_paths['train'])
val = sc.read_h5ad(pbmc_paths['val'])

In [12]:
# Make sure both matrices are scipy sparse (CSR); later cells call `.X.toarray()`.
# `scipy.sparse.issparse` replaces the string check on the type name, and
# `scipy.sparse` is imported in the notebook's import cell (it was missing before,
# so the dense branch raised NameError).
if not scipy.sparse.issparse(train.X):
    train.X = scipy.sparse.csr_matrix(train.X)
if not scipy.sparse.issparse(val.X):
    val.X = scipy.sparse.csr_matrix(val.X)


In [13]:
# Restrict both datasets to the features they have in common.
var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

# Tag each cell with its origin so the combined object can be split again later.
# (train/val are views at this point, hence the "copying" warnings in the output.)
train.obs['dataset'] = 'train'
val.obs['dataset'] = 'val'

# Combine train and val into one AnnData so both get identical preprocessing.
adata = anndata.concat((train, val))

# Normalise each cell to 1e4 counts, then scale each feature (zero-centred).
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.scale(adata, zero_center=True)

# Neighbors are computed on the scaled matrix itself (use_rep='X'), before the PCA is run.
# NOTE(review): confirm this ordering is intended rather than PCA followed by neighbors.
sc.pp.neighbors(adata, n_neighbors=30, use_rep='X')
sc.pp.pca(adata, n_comps=50)

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
  if adata.isview:
  if adata.isview:  # we shouldn't need this here...
Compilation is falling back to object mode WITH looplifting enabled because Function "fuzzy_simplicial_set" failed type inference due to: Untyped global name 'nearest_neighbors': cannot determine Numba type of <class 'function'>

File "../miniconda3/envs/mars/lib/python3.7/site-packages/umap/umap_.py", line 467:
def fuzzy_simplicial_set(
    <source elided>
    if knn_indices is None or knn_dists is None:
        knn_indices, knn_dists, _ = nearest_neighbors(
        ^

  @numba.jit()

File "../miniconda3/envs/mars/lib/python3.7/site-packages/umap/umap_.py", line 350:
@numba.jit()
def fuzzy_simplicial_set(
^

  self.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/

In [14]:
# Split the jointly preprocessed data back into its train and val parts.
dataset_labels = adata.obs['dataset']
train = adata[dataset_labels == 'train', :]
val = adata[dataset_labels == 'val', :]

# Reorder the cells of each part by cell type.
train_order = train.obs.sort_values('cell_type').index
val_order = val.obs.sort_values('cell_type').index
train = train[train_order]
val = val[val_order]

In [15]:
# Integer encoding of the cell types seen in train (alphabetical order) and its inverse.
cell_types = sorted(set(train.obs['cell_type']))
class_to_int = dict(zip(cell_types, range(len(cell_types))))
int_to_class = dict(enumerate(cell_types))

In [16]:
# Encode the train labels as integers and wrap the dense train matrix for MARS.
train_label_ids = [class_to_int[label] for label in train.obs['cell_type']]
y_train = np.array(train_label_ids, dtype=np.int64)
annotated = ExperimentDataset(
    train.X.toarray(),
    train.obs_names,
    train.var_names,
    'train',
    y_train,
)

== Dataset: Found 1607 items 
== Dataset: Found 7 classes


In [17]:
# Encode the val labels with the train-derived mapping and wrap the dense val matrix.
# A val cell type that is absent from train raises KeyError here.
val_label_ids = [class_to_int[label] for label in val.obs['cell_type']]
y_val = np.array(val_label_ids, dtype=np.int64)
unannotated = ExperimentDataset(
    val.X.toarray(),
    val.obs_names,
    val.var_names,
    'val',
    y_val,
)

== Dataset: Found 888 items 
== Dataset: Found 7 classes


In [18]:
# Same val matrix as `unannotated`, but built without the label array.
pretrain_data = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val')

In [19]:

# Number of clusters = number of distinct values in `unannotated.y`
# (presumably the y_val ground-truth ids passed in above -- TODO confirm).
n_clusters = len(np.unique(unannotated.y))
n_clusters

7

In [20]:
# Override the parsed device setting so the model runs on the CPU.
params.device = 'cpu'

In [21]:
# Instantiate MARS with one labelled dataset (`annotated`, wrapped in a list), the dataset
# to label (`unannotated`) and the label-free `pretrain_data`.
# hid_dim_1 / hid_dim_2 presumably set the hidden-layer sizes -- TODO confirm against model.mars.
mars = MARS(n_clusters, params, [annotated], unannotated, pretrain_data, hid_dim_1=1000, hid_dim_2=100)

<class 'list'>


In [22]:
# Train MARS in evaluation mode. With save_all_embeddings=True the first return value (`a`)
# covers both the annotated (train) and the unannotated (val) cells; `scores` holds the
# evaluation metrics and `landmarks` the landmarks returned by training.
a, landmarks, scores = mars.train(evaluation_mode=True, save_all_embeddings=True) # evaluation mode

Pretraining..

=== Epoch: 30 ===
Train acc: 0.9968886375427246
['CTAGGATGATCGTG-1', 'TATGGGTGCTAGCA-1', 'GTAACGTGATCGGT-1', 'TATGTCACGGAACG-1', 'ACGCACCTGTTAGC-1']
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([2, 2, 2, 2, 2])


In [26]:
len(landmarks[-1])

7

In [27]:
scores

{'precision': 0.5574168980375315,
 'accuracy': 0.7274774774774775,
 'recall': 0.508487892171794,
 'f1_score': 0.5100364326275104,
 'nmi': 0.4724061645165623,
 'adj_mi': 0.38489017982364193,
 'adj_rand': 0.4689671164538997}

In [24]:
a.obs

Unnamed: 0,truth_labels,MARS_labels,experiment
TACATAGAACGCAT-1-train,2,,train
ACTTAAGATTACTC-1-train,3,,train
TACGGAACGCGTTA-1-train,0,,train
GCTCAAGAACCATG-1-train,0,,train
CACTTTGACTCTAT-1-train,2,,train
...,...,...,...
TAAGAGGACTTGTT-1-val,6,1,val
TTCCCACTTGAGGG-1-val,6,2,val
CGACCACTGCCAAT-1-val,6,1,val
GGTGGAGACAGATC-1-val,6,1,val


In [None]:
scores

In [None]:
a.obs

In [23]:
# Keep only the val cells from the MARS output and drop the '-val' tag
# from the cell ids so they can be matched against `val.obs` again.
is_val = a.obs['experiment'] == 'val'
preds = a[is_val].copy()
preds.obs.index = [cell_id.replace('-val', '') for cell_id in preds.obs.index]
preds.obs

Unnamed: 0,truth_labels,MARS_labels,experiment
CTAGGATGATCGTG-1,0,2,val
TATGGGTGCTAGCA-1,0,2,val
GTAACGTGATCGGT-1,0,2,val
TATGTCACGGAACG-1,0,2,val
ACGCACCTGTTAGC-1,0,2,val
...,...,...,...
TAAGAGGACTTGTT-1,6,1,val
TTCCCACTTGAGGG-1,6,2,val
CGACCACTGCCAAT-1,6,1,val
GGTGGAGACAGATC-1,6,1,val


In [None]:
# Attach readable labels: the true cell type looked up in `val`, and the
# MARS cluster id translated through the integer -> class-name mapping.
groundtruth_names = [val.obs.loc[cell_id, 'cell_type'] for cell_id in preds.obs.index]
preds.obs['groundtruth'] = groundtruth_names
predicted_names = [int_to_class[cluster] for cluster in preds.obs['MARS_labels']]
preds.obs['predicted'] = predicted_names
preds.obs

In [None]:
# just map to the best group since there is a bug with MARS encodings
cluster_map = {}
for k in int_to_class.keys():
    filtered = preds[preds.obs['truth_labels']==k]
    counts = Counter(filtered.obs['MARS_labels'])
    cluster_map[k] = counts.most_common()[0][0]
    print(k, counts.most_common())
r_cluster_map = {v:k for k, v in cluster_map.items()}
cluster_map

In [None]:
# Translate each MARS cluster id into the true-class id it was matched with.
# `cluster_map` goes truth -> cluster, so the lookup from a MARS label must use the
# inverse map `r_cluster_map` (cluster -> truth). Clusters that are not the dominant
# cluster of any class have no counterpart and are left as NaN.
preds.obs['adjusted_predictions'] = [r_cluster_map.get(c, np.nan) for c in preds.obs['MARS_labels']]
preds.obs

In [None]:
# Build a neighbor graph on the val predictions, embed it with UMAP and plot the embedding.
sc.pp.neighbors(preds)
sc.tl.umap(preds)
sc.pl.umap(preds)

In [None]:
# Same UMAP, coloured by the integer and named versions of the true and MARS-assigned labels.
sc.pl.umap(preds, color=['truth_labels', 'MARS_labels', 'groundtruth', 'predicted'])

In [None]:
# Distinct train label ids in order of first appearance.
collected = list(dict.fromkeys(y_train))
collected

In [None]:
# Distinct val label ids in order of first appearance.
collected = list(dict.fromkeys(y_val))
collected

In [None]:
int_to_class

In [None]:
# Leftover from tab-completion: the dangling `annotated.` was a SyntaxError.
# Display the object instead.
annotated