In [31]:
from pathlib import Path
from collections import Counter
import os
import re
import random
import subprocess

import anndata
import scanpy as sc
import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

In [32]:
# !pip install git+https://github.com/estorrs/mgitools
# !pip install tensorflow==2.1.0

In [33]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
%autoreload 2

In [35]:
# !pip install -e /home/estorrs/pollock/
import pollock
from pollock.models.model import PollockDataset, PollockModel, load_from_directory, predict_from_anndata

In [36]:
# !conda install -y scanpy

In [37]:
# !pip install git+https://github.com/estorrs/mgitools

In [68]:
CELL_TYPE_KEY = 'cell_type'
N_PER_CELL_TYPE = 200
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/'
RESULTS_DIR = '/home/estorrs/pollock/benchmarking/results/10272020_teir1'
RESULTS_CROSS_DISEASE_DIR = '/home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_disease'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
Path(RESULTS_CROSS_DISEASE_DIR).mkdir(parents=True, exist_ok=True)

##### create training and validation datasets

only run if you haven't created these datasets yet

In [23]:
def cap_list(ls, n=100, split=.8, oversample=True):
    """
    Grabs items from a pool.
    
    if split * pool size is greater than n, then just randomly sample 80% of the pool
    otherwise sample 80% of the pool, then oversample so you end up with a final size of n
    """
    # just return list if it is of length 1
    if len(ls) <= 1: return ls
    cap = int(len(ls) * split)
    if cap > n:
        return random.sample(ls, n)

    if oversample:
        pool = random.sample(ls, cap)
        ## oversample to
        return random.choices(pool, k=n)

    return random.sample(ls, cap)

def balancedish_training_generator(adata, cell_type_key, n_per_cell_type, oversample=True, split=.8):
    """
    Return balanced train and validation sets
    """
    cell_type_to_idxs = {}
    for cell_id, cell_type in zip(adata.obs.index, adata.obs[cell_type_key]):
        if cell_type not in cell_type_to_idxs:
            cell_type_to_idxs[cell_type] = [cell_id]
        else:
            cell_type_to_idxs[cell_type].append(cell_id)

    cell_type_to_idxs = {k:cap_list(ls, n_per_cell_type, oversample=oversample, split=split)
                         for k, ls in cell_type_to_idxs.items()}

    train_ids = np.asarray([x for ls in cell_type_to_idxs.values() for x in ls])
    train_idxs = np.arange(adata.shape[0])[np.isin(np.asarray(adata.obs.index), train_ids)]
    val_idxs = np.delete(np.arange(adata.shape[0]), train_idxs)

    train_adata = adata[train_idxs, :]
    val_adata = adata[val_idxs, :]

    return train_adata, val_adata

# def create_train_val_datasets(adata, cell_type_key, oversample=True):
#     counts = Counter(adata.obs[cell_type_key])
#     min_count = counts.most_common()[-1][1]
#     n_per_cell_type = max(min_count, )
#     train_adata, val_adata = balancedish_training_generator(adata, cell_type_key,
#                                                             n_per_cell_type, oversample=oversample)
#     return train_adata, val_adata

In [24]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
fp_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    if '/_train.h5ad' not in fp and '/_val.h5ad' not in fp:
        dtype = fp.split('/')[-2]
        disease = fp.split('/')[-1].replace('.h5ad', '')
        fp_map[dtype][disease] = fp
fp_map

{'scRNAseq': {'brca': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/brca.h5ad',
  'cesc': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/cesc.h5ad',
  'hnscc': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/hnscc.h5ad',
  'melanoma': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/melanoma.h5ad',
  'pbmc': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/pbmc.h5ad',
  'pdac': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/scRNAseq/pdac.h5ad'},
 'snATACseq': {'brca': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/snATACseq/brca.h5ad',
  'ccrcc': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/snATACseq/ccrcc.h5ad',
  'gbm': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_2/snATACseq/gbm.h5ad'},
 'snRNAseq': {'brca': '/home/estorrs/pollock/benchmarking/data/102320

In [25]:
for dtype, d in fp_map.items():
    for disease, fp in d.items():
        print(dtype, disease)
        adata = sc.read_h5ad(fp)
        # check for cell type key
        if CELL_TYPE_KEY not in adata.obs: raise RuntimeError(f'{CELL_TYPE_KEY} not in {fp}')
        
        train_adata, val_adata = balancedish_training_generator(adata, CELL_TYPE_KEY, N_PER_CELL_TYPE)
        # resample validation data to make dataset smaller while keeping rare cell types
        val_adata, _ = balancedish_training_generator(val_adata, CELL_TYPE_KEY, 1000, oversample=False,
                                                     split=1.)
        train_adata.write_h5ad(fp.replace('.h5ad', '_train.h5ad'))
        val_adata.write_h5ad(fp.replace('.h5ad', '_val.h5ad'))
        

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq pbmc
scRNAseq pdac
snATACseq brca
snATACseq ccrcc
snATACseq gbm
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


##### load in training and validation datasets

In [71]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    dtype = fp.split('/')[-2]
    disease = re.sub(r'^(.*)((_train)|(_val)).h5ad$', r'\1', fp.split('/')[-1])
    if disease not in adata_map[dtype] and '.h5ad' not in disease: adata_map[dtype][disease] = {}
    if 'train.h5ad' in fp:
        adata_map[dtype][disease]['train'] = fp
    if 'val.h5ad' in fp:
        adata_map[dtype][disease]['val'] = fp
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease)

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq pbmc
scRNAseq pdac
snATACseq brca
snATACseq ccrcc
snATACseq gbm
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


In [72]:
adata_map['snATACseq'].pop('ccrcc')

{'train': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/snATACseq/ccrcc_train.h5ad',
 'val': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/snATACseq/ccrcc_val.h5ad'}

### run workflows

In [85]:
def run_workflow_for_datasets(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
#         if dtype != 'snATACseq':
        for disease, m in d.items():
            # make dir if doesnt exist yet
            directory = os.path.join(output_dir, dtype, disease)
            Path(directory).mkdir(parents=True, exist_ok=True)
            train, val = sc.read_h5ad(m['train']), sc.read_h5ad(m['val'])

            print(dtype, disease, train.shape, val.shape)
            run_workflow(workflow, workflow_identifier,
                train, val, directory)
            
def run_workflow_for_cross_disease(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
        for disease1, m1 in d.items():
            for disease2, m2 in d.items():
                if disease1 != disease2:
                    # make dir if doesnt exist yet
                    directory = os.path.join(output_dir, dtype, f'{disease1}_train_{disease2}_val')
                    Path(directory).mkdir(parents=True, exist_ok=True)
                    train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

                    print(dtype, f'{disease1}_train_{disease2}_val', train.shape, val.shape)
                    run_workflow(workflow, workflow_identifier,
                        train, val, directory)
                    
                    
# def run_workflow_for_cross_dataset(adata_map, workflow, workflow_identifier, output_dir):
#     for dtype, d in adata_map.items():
#         for disease1, m1 in d.items():
#             for disease2, m2 in d.items():
#                 if disease1 != disease2:
#                     # make dir if doesnt exist yet
#                     directory = os.path.join(output_dir, dtype, f'{disease1}_train_{disease2}_val')
#                     Path(directory).mkdir(parents=True, exist_ok=True)
#                     train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

#                     print(dtype, f'{disease1}_train_{disease2}_val', train.shape, val.shape)
#                     run_workflow(workflow, workflow_identifier,
#                         train, val, directory)

def run_workflow(workflow, workflow_identifier, train, val, output_dir):
    """
    Run the workflow defined by the workflow function.
    
    workflow function takes a train adata and a val adata as inputs,
    and returns dataframe with cell_id, groundtruth, predicted, and probability columns
    """
    # if it is pollock it needs to know where to save the module
    if workflow_identifier == 'pollock':
        df = workflow(train, val, CELL_TYPE_KEY, os.path.join(output_dir, f'{workflow_identifier}_module'))
    else:
        df = workflow(train, val, CELL_TYPE_KEY)
    df.to_csv(os.path.join(output_dir, f'{workflow_identifier}.tsv'), sep='\t', index=False, header=True)

##### pollock

In [86]:
def run_pollock_workflow(train, val, cell_type_key, module_fp):
    train.obs['is_validation'] = [False] * train.shape[0]
    val.obs['is_validation'] = [True] * val.shape[0]
    combined = train.concatenate(val)
    
    pds = PollockDataset(combined.copy(), cell_type_key=cell_type_key,
                     dataset_type='training', validation_key='is_validation')
    
    pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)
    
    pm.fit(pds, epochs=20)
    
    pm.save(pds, module_fp)

    preds = predict_from_anndata(val.copy(), module_fp, adata_batch_size=10000)
    
    df = pd.DataFrame.from_dict({
        'cell_id': preds.index.to_list(),
        'groundtruth': val.obs.loc[preds.index][cell_type_key].to_list(),
        'predicted': preds['predicted_cell_type'],
        'probability': preds['cell_type_probability']
    })

    return df

In [30]:
run_workflow_for_datasets(adata_map, run_pollock_workflow, 'pollock', RESULTS_DIR)

scRNAseq brca (2600, 27131) (11253, 27131)


2020-10-29 16:02:50,078 normalizing the expression counts for model training
2020-10-29 16:03:07,475 input dataset shape: (13853, 27131)
2020-10-29 16:03:07,478 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg']
2020-10-29 16:03:07,480 possible cell types: [('Endothelial', 1200), ('CD4 T cell', 1200), ('NK', 1200), ('Fibroblast', 1200), ('CD8 T cell', 1200), ('Macrophage', 1200), ('Malignant', 1200), ('Plasma', 1200), ('Treg', 1200), ('B cell', 1200), ('Mast', 746), ('Dendritic', 592), ('Erythrocyte', 515)]
2020-10-29 16:03:14,346 training dataset shape: (6102, 27131)
2020-10-29 16:03:14,348 validation dataset shape: (7751, 27131)
2020-10-29 16:03:34,674 epoch: 1, train loss: 36.64873504638672, val loss: 38.795326232910156
2020-10-29 16:03:51,631 epoch: 2, train loss: 35.816097259521484, val loss: 37.67295837402344
2020-10-29 16:04:08,309 epoch: 3, train loss: 35

scRNAseq cesc (1939, 22928) (8451, 22928)


2020-10-29 16:09:36,397 normalizing the expression counts for model training
2020-10-29 16:09:42,886 input dataset shape: (10390, 22928)
2020-10-29 16:09:42,887 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-10-29 16:09:42,889 possible cell types: [('Fibroblast', 1200), ('Monocyte', 1200), ('CD8 T cell', 1200), ('Epithelial', 1200), ('NK', 1200), ('Malignant', 1200), ('Plasma', 1135), ('CD4 T cell', 1118), ('Endothelial', 732), ('Mast', 138), ('Erythrocyte', 67)]
2020-10-29 16:09:45,684 training dataset shape: (4663, 22928)
2020-10-29 16:09:45,685 validation dataset shape: (5727, 22928)
2020-10-29 16:09:56,775 epoch: 1, train loss: 37.39118576049805, val loss: 33.17431640625
2020-10-29 16:10:06,671 epoch: 2, train loss: 36.19399642944336, val loss: 32.364967346191406
2020-10-29 16:10:16,554 epoch: 3, train loss: 35.4684944152832, val loss: 31.438339233398438
2020-10-29 16:

scRNAseq hnscc (2200, 26929) (10288, 26929)


2020-10-29 16:13:43,597 normalizing the expression counts for model training
2020-10-29 16:13:52,696 input dataset shape: (12488, 26929)
2020-10-29 16:13:52,700 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-10-29 16:13:52,702 possible cell types: [('Treg', 1200), ('Plasma', 1200), ('Mast', 1200), ('B cell', 1200), ('CD4 T cell', 1200), ('Malignant', 1200), ('CD8 T cell', 1200), ('Endothelial', 1200), ('Monocyte', 1200), ('NK', 1200), ('Erythrocyte', 488)]
2020-10-29 16:13:56,412 training dataset shape: (5268, 26929)
2020-10-29 16:13:56,413 validation dataset shape: (7220, 26929)
2020-10-29 16:14:11,542 epoch: 1, train loss: 30.803085327148438, val loss: 16.803770065307617
2020-10-29 16:14:25,990 epoch: 2, train loss: 29.932323455810547, val loss: 16.530471801757812
2020-10-29 16:14:40,380 epoch: 3, train loss: 29.50837516784668, val loss: 16.330692291259766
2020-10-29 16:14:54,867 

scRNAseq melanoma (2824, 23452) (8028, 23452)


2020-10-29 16:19:28,964 normalizing the expression counts for model training
2020-10-29 16:19:35,791 input dataset shape: (10852, 23452)
2020-10-29 16:19:35,793 possible cell types: ['B cell', 'CD4 T cell naive', 'CD8 CTL', 'CD8 CTL exausted', 'CD8 follicular T-cell', 'Dendritic', 'Fibroblast', 'M1 Macrophage', 'M2 Macrophage', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg', 'pDC']
2020-10-29 16:19:35,794 possible cell types: [('CD8 CTL', 1200), ('B cell', 1200), ('CD8 CTL exausted', 1200), ('M2 Macrophage', 1200), ('Malignant', 1200), ('CD4 T cell naive', 1200), ('Treg', 862), ('Monocyte', 539), ('NK', 538), ('pDC', 478), ('Fibroblast', 321), ('Plasma', 273), ('Dendritic', 263), ('M1 Macrophage', 200), ('CD8 follicular T-cell', 178)]
2020-10-29 16:19:38,949 training dataset shape: (5289, 23452)
2020-10-29 16:19:38,950 validation dataset shape: (5563, 23452)
2020-10-29 16:19:51,993 epoch: 1, train loss: 35.766632080078125, val loss: 44.68608093261719
2020-10-29 16:20:04,155 epoch: 2, 

scRNAseq pbmc (834, 32738) (1320, 32738)


2020-10-29 16:24:12,971 normalizing the expression counts for model training
2020-10-29 16:24:14,895 input dataset shape: (2154, 32738)
2020-10-29 16:24:14,896 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'FCGR3A Monocyte', 'Megakaryocyte', 'NK']
2020-10-29 16:24:14,897 possible cell types: [('CD4 T cell', 1143), ('B cell', 341), ('CD8 T cell', 306), ('NK', 163), ('FCGR3A Monocyte', 150), ('Dendritic', 38), ('Megakaryocyte', 13)]
2020-10-29 16:24:15,475 training dataset shape: (1220, 32738)
2020-10-29 16:24:15,476 validation dataset shape: (934, 32738)




2020-10-29 16:24:20,309 5 out of the last 13 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-10-29 16:24:20,397 5 out of the last 11 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-10-29 16:24:20,483 6 out of the last 12 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-10-29 16:24:20,614 7 out of the last 14 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2020-10-29 16:24:21,194 epoch: 1, train loss: 47.38005065917969, val loss: 48.97697448730469
2020-10-29 16:24:25,415 epoch: 2, train loss: 45.49711608886719, val loss: 46.78955078125
2020-10-29 16:24:29,654 epoch: 3, train loss: 45.3695068359375, val loss: 46.15694808959961
2020-10-29 16:24:33,893 epoch: 4, train loss: 46.088008880615234, val loss: 45.939239501953125
2020-10-29 16:24:38,090 epoch: 5, train loss: 45.7983283996582, val loss: 45.8

scRNAseq pdac (3703, 28756) (17428, 28756)


2020-10-29 16:25:51,707 normalizing the expression counts for model training
2020-10-29 16:26:07,917 input dataset shape: (21131, 28756)
2020-10-29 16:26:07,919 possible cell types: ['ADM', 'Acinar', 'B cell', 'CD4 T cell', 'CD4 T cell naive', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg', 'Tuft']
2020-10-29 16:26:07,921 possible cell types: [('Dendritic', 1200), ('CD4 T cell naive', 1200), ('Macrophage', 1200), ('NK', 1200), ('Plasma', 1200), ('CD8 T cell', 1200), ('Malignant', 1200), ('Treg', 1200), ('Fibroblast', 1200), ('Acinar', 1200), ('Islet', 1200), ('Endothelial', 1200), ('B cell', 1200), ('CD4 T cell', 1200), ('Mast', 1200), ('ADM', 1200), ('Epithelial', 1168), ('Erythrocyte', 612), ('Tuft', 151)]
2020-10-29 16:26:16,050 training dataset shape: (8943, 28756)
2020-10-29 16:26:16,051 validation dataset shape: (12188, 28756)
2020-10-29 16:26:40,125 epoch: 1, train loss: 29.5

snATACseq brca (2552, 19891) (9304, 19891)


2020-10-29 16:35:06,960 normalizing the expression counts for model training
2020-10-29 16:35:13,357 input dataset shape: (11856, 19891)
2020-10-29 16:35:13,358 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Treg', 'cCAF', 'cDC', 'mCAF', 'pDC', 'vCAF']
2020-10-29 16:35:13,360 possible cell types: [('Malignant', 1200), ('mCAF', 1200), ('Macrophage', 1200), ('Endothelial', 1200), ('CD4 T cell', 1200), ('CD8 T cell', 1200), ('Treg', 1200), ('B cell', 1200), ('vCAF', 637), ('cDC', 519), ('pDC', 457), ('NK', 430), ('cCAF', 127), ('Mast', 86)]
2020-10-29 16:35:16,451 training dataset shape: (5502, 19891)
2020-10-29 16:35:16,452 validation dataset shape: (6354, 19891)
2020-10-29 16:35:29,482 epoch: 1, train loss: 37.69630813598633, val loss: 43.788761138916016
2020-10-29 16:35:41,436 epoch: 2, train loss: 37.37327194213867, val loss: 43.38957977294922
2020-10-29 16:35:53,423 epoch: 3, train loss: 37.03084182739258, val loss

snATACseq gbm (1316, 19891) (5650, 19891)


2020-10-29 16:40:08,714 normalizing the expression counts for model training
2020-10-29 16:40:12,470 input dataset shape: (6966, 19891)
2020-10-29 16:40:12,472 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
2020-10-29 16:40:12,473 possible cell types: [('Microglia', 1200), ('Neuron', 1200), ('Malignant', 1200), ('Oligodendrocytes', 1200), ('T cells', 1200), ('Fibroblast', 810), ('Endothelial', 101), ('B cell', 55)]
2020-10-29 16:40:13,940 training dataset shape: (3124, 19891)
2020-10-29 16:40:13,941 validation dataset shape: (3842, 19891)




2020-10-29 16:40:21,281 5 out of the last 54 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-10-29 16:40:21,373 6 out of the last 55 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2020-10-29 16:40:22,452 epoch: 1, train loss: 44.33525085449219, val loss: 23.476886749267578
2020-10-29 16:40:29,674 epoch: 2, train loss: 44.14936828613281, val loss: 23.278709411621094
2020-10-29 16:40:36,817 epoch: 3, train loss: 43.8795280456543, val loss: 23.152862548828125
2020-10-29 16:40:43,861 epoch: 4, train loss: 43.461429595947266, val loss: 23.063217163085938
2020-10-29 16:40:50,911 epoch: 5, train loss: 43.23381042480469, val los

snRNAseq brca (2455, 29175) (9490, 29175)


2020-10-29 16:43:04,834 normalizing the expression counts for model training
2020-10-29 16:43:14,110 input dataset shape: (11945, 29175)
2020-10-29 16:43:14,112 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Macrophage', 'Malignant', 'Mast', 'NK', 'Plasma', 'Treg']
2020-10-29 16:43:14,114 possible cell types: [('Malignant', 1200), ('CD4 T cell', 1200), ('Endothelial', 1200), ('Fibroblast', 1200), ('CD8 T cell', 1200), ('Macrophage', 1200), ('B cell', 1200), ('Treg', 1200), ('Plasma', 1200), ('NK', 482), ('Mast', 318), ('Adipocyte', 273), ('Dendritic', 72)]
2020-10-29 16:43:17,956 training dataset shape: (5247, 29175)
2020-10-29 16:43:17,957 validation dataset shape: (6698, 29175)
2020-10-29 16:43:33,817 epoch: 1, train loss: 39.688575744628906, val loss: 39.98768615722656
2020-10-29 16:43:48,882 epoch: 2, train loss: 39.03700637817383, val loss: 39.365821838378906
2020-10-29 16:44:03,968 epoch: 3, train loss: 38.41039

snRNAseq ccrcc (3867, 33538) (10444, 33538)


2020-10-29 16:49:05,670 normalizing the expression counts for model training
2020-10-29 16:49:18,413 input dataset shape: (14311, 33538)
2020-10-29 16:49:18,415 possible cell types: ['Basophil', 'CD4 CTL', 'CD4 T cell', 'CD4 T cell activated', 'CD4 T cell naive', 'CD4/CD8 proliferating', 'CD8 CTL', 'CD8 T cell preexhausted', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'Macrophage proliferating', 'Malignant', 'NK cell strong', 'NK cell weak', 'Plasma', 'TRM', 'Treg', 'cDC', 'mCAF', 'pDC']
2020-10-29 16:49:18,417 possible cell types: [('Malignant', 1200), ('Endothelial', 1200), ('mCAF', 1200), ('Macrophage', 1200), ('Epithelial', 1200), ('CD4 T cell', 1200), ('NK cell strong', 871), ('CD4 T cell activated', 791), ('CD4 T cell naive', 725), ('Fibroblast', 639), ('Treg', 574), ('TRM', 513), ('CD4/CD8 proliferating', 466), ('Macrophage proliferating', 466), ('cDC', 447), ('CD8 CTL', 439), ('NK cell weak', 409), ('CD4 CTL', 348), ('Plasma', 208), ('CD8 T cell preexhausted', 141)



2020-10-29 16:49:44,584 5 out of the last 123 calls to <function compute_loss at 0x7f13991523b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2020-10-29 16:49:48,759 epoch: 1, train loss: 34.759559631347656, val loss: 50.450416564941406
2020-10-29 16:50:12,131 epoch: 2, train loss: 34.12269592285156, val loss: 49.272796630859375
2020-10-29 16:50:35,853 epoch: 3, train loss: 33.885658264160156, val loss: 48.648887634277344
2020-10-29 16:50:59,535 epoch: 4, train loss: 33.5566291809082, val loss: 48.25336456298828
2020-10-29 16:51:23,306 epoch: 5, train loss: 33.337772369384766, val l

snRNAseq gbm (1692, 29748) (6807, 29748)


2020-10-29 16:58:21,082 normalizing the expression counts for model training
2020-10-29 16:58:27,959 input dataset shape: (8499, 29748)
2020-10-29 16:58:27,961 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2020-10-29 16:58:27,962 possible cell types: [('Fibroblast', 1200), ('Neuron', 1200), ('Microglia', 1200), ('Oligodendrocytes', 1200), ('Malignant', 1200), ('T cells', 1200), ('Endothelial', 581), ('B cell', 578), ('Monocyte', 140)]
2020-10-29 16:58:30,434 training dataset shape: (3723, 29748)
2020-10-29 16:58:30,435 validation dataset shape: (4776, 29748)
2020-10-29 16:58:41,820 epoch: 1, train loss: 45.88108825683594, val loss: 47.097232818603516
2020-10-29 16:58:52,290 epoch: 2, train loss: 45.138607025146484, val loss: 46.600093841552734
2020-10-29 16:59:02,919 epoch: 3, train loss: 44.354984283447266, val loss: 46.20143508911133
2020-10-29 16:59:13,389 epoch: 4, train loss: 43.869003295

In [None]:
run_workflow_for_cross_disease(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DISEASE_DIR)

scRNAseq brca_train_cesc_val (2600, 27131) (8449, 22928)


2020-11-03 10:05:32,144 normalizing the expression counts for model training
2020-11-03 10:05:39,440 input dataset shape: (11049, 22662)
2020-11-03 10:05:39,442 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-03 10:05:39,444 possible cell types: [('NK', 1200), ('CD8 T cell', 1200), ('Fibroblast', 1200), ('Malignant', 1200), ('Monocyte', 1200), ('Plasma', 1135), ('CD4 T cell', 1118), ('Epithelial', 1000), ('Endothelial', 732), ('Mast', 249), ('Erythrocyte', 215), ('Treg', 200), ('B cell', 200), ('Dendritic', 200)]
2020-11-03 10:05:42,660 training dataset shape: (5315, 22662)
2020-11-03 10:05:42,662 validation dataset shape: (5734, 22662)
2020-11-03 10:05:55,843 epoch: 1, train loss: 34.00298309326172, val loss: 22.467525482177734
2020-11-03 10:06:07,968 epoch: 2, train loss: 32.6106071472168, val loss: 21.930818557739258
2020-11-03 10:06:20,

scRNAseq brca_train_hnscc_val (2600, 27131) (10288, 26929)


2020-11-03 10:10:27,648 normalizing the expression counts for model training
2020-11-03 10:10:37,265 input dataset shape: (12888, 25823)
2020-11-03 10:10:37,267 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-03 10:10:37,269 possible cell types: [('NK', 1200), ('CD8 T cell', 1200), ('Endothelial', 1200), ('CD4 T cell', 1200), ('Treg', 1200), ('Malignant', 1200), ('Monocyte', 1200), ('B cell', 1200), ('Mast', 1200), ('Plasma', 1200), ('Erythrocyte', 488), ('Fibroblast', 200), ('Dendritic', 200)]
2020-11-03 10:10:41,132 training dataset shape: (5596, 25823)
2020-11-03 10:10:41,134 validation dataset shape: (7292, 25823)
2020-11-03 10:10:56,815 epoch: 1, train loss: 30.21483612060547, val loss: 24.32537078857422
2020-11-03 10:11:11,807 epoch: 2, train loss: 29.540552139282227, val loss: 23.86591339111328
2020-11-03 10:11:26,608 epoch: 3, train loss: 29.0252

scRNAseq brca_train_melanoma_val (2600, 27131) (6735, 23452)


2020-11-03 10:16:24,276 normalizing the expression counts for model training
2020-11-03 10:16:30,130 input dataset shape: (9335, 21538)
2020-11-03 10:16:30,132 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-03 10:16:30,133 possible cell types: [('CD8 T cell', 1200), ('CD4 T cell', 1200), ('Malignant', 1200), ('Monocyte', 1200), ('B cell', 1200), ('Treg', 862), ('Dendritic', 741), ('NK', 538), ('Fibroblast', 321), ('Plasma', 273), ('Endothelial', 200), ('Mast', 200), ('Erythrocyte', 200)]
2020-11-03 10:16:32,486 training dataset shape: (4676, 21538)
2020-11-03 10:16:32,487 validation dataset shape: (4659, 21538)
2020-11-03 10:16:43,964 epoch: 1, train loss: 36.257877349853516, val loss: 46.29212951660156
2020-11-03 10:16:54,352 epoch: 2, train loss: 35.411956787109375, val loss: 45.69017791748047
2020-11-03 10:17:04,734 epoch: 3, train loss: 34.414100646

###### testing stuff

In [None]:
train, val = sc.read_h5ad(adata_map['snRNAseq']['brca']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
module_dir = os.path.join(SANDBOX_DIR, 'temp_module')

In [None]:
train.obs['is_validation'] = [False] * train.shape[0]
val.obs['is_validation'] = [True] * val.shape[0]
combined = train.concatenate(val)
combined

In [None]:
# pds = PollockDataset(train, cell_type_key=CELL_TYPE_KEY,
#                      dataset_type='training')

In [None]:
val.shape

In [None]:
pds = PollockDataset(combined, cell_type_key=CELL_TYPE_KEY,
                     dataset_type='training', validation_key='is_validation')

In [None]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)

In [None]:
pm.fit(pds, epochs=2)

In [None]:
pm.save(pds, module_dir)

In [None]:
val.shape

In [None]:
preds = predict_from_anndata(val.copy(),
        '/home/estorrs/pollock/benchmarking/sandbox/temp_module', adata_batch_size=10000)
preds

In [None]:
df = pd.DataFrame.from_dict({
    'cell_id': preds.index.to_list(),
    'groundtruth': val.obs.loc[preds.index][CELL_TYPE_KEY].to_list(),
    'predicted': preds['predicted_cell_type'],
    'probability': preds['cell_type_probability']
})
df

##### scanpy ingest

In [21]:
def ingest_preprocess(adata):
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata)
    
    return adata

def run_scanpy_workflow(train, val, cell_type_key):
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names]
    val = val[:, var_names]
    
    groundtruth = val.obs[cell_type_key].to_list()

    sc.pp.pca(train)
    sc.pp.neighbors(train)
    sc.tl.umap(train)
    
    sc.tl.ingest(val, train, obs=cell_type_key)
    
    df = pd.DataFrame.from_dict({
        'cell_id': val.obs.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': val.obs[cell_type_key].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
    
    return df

In [22]:
run_workflow_for_datasets(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_DIR)

scRNAseq brca (2600, 27131) (11253, 27131)


  if not is_categorical(df_full[k]):


scRNAseq cesc (1941, 22928) (8449, 22928)


  if not is_categorical(df_full[k]):


scRNAseq hnscc (2200, 26929) (10288, 26929)


  if not is_categorical(df_full[k]):


scRNAseq melanoma (2000, 23452) (6735, 23452)


  if not is_categorical(df_full[k]):


scRNAseq pbmc (940, 32738) (1698, 32738)


  if not is_categorical(df_full[k]):


scRNAseq pdac (3296, 28756) (15435, 28756)


  if not is_categorical(df_full[k]):


snATACseq brca (2064, 19891) (9028, 19891)


  if not is_categorical(df_full[k]):


snATACseq gbm (1316, 19891) (5650, 19891)


  if not is_categorical(df_full[k]):


snRNAseq brca (2455, 29175) (9490, 29175)


  if not is_categorical(df_full[k]):


snRNAseq ccrcc (2113, 33538) (8605, 33538)


  if not is_categorical(df_full[k]):


snRNAseq gbm (1689, 29748) (6810, 29748)


  if not is_categorical(df_full[k]):


###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
train, val = ingest_preprocess(train), ingest_preprocess(val)

var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

sc.pp.pca(train)
sc.pp.neighbors(train)
sc.tl.umap(train)

In [None]:
sc.pl.umap(train, color='cell_type')

In [None]:
sc.tl.ingest(val, train, obs=CELL_TYPE_KEY)
val.uns[f'{CELL_TYPE_KEY}_colors'] = train.uns[f'{CELL_TYPE_KEY}_colors']

In [None]:
sc.pl.umap(val, color=[CELL_TYPE_KEY], wspace=0.5)


In [None]:
val

In [None]:
val.obs

##### ACTINN

In [9]:
def run_actinn_workflow(train, val, cell_type_key):
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts_df = pd.DataFrame(data=X.transpose(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts_df = pd.DataFrame(data=X.transpose(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
    
    train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
    val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
    train_counts_df.to_csv(train_counts_fp, sep='\t')
    val_counts_df.to_csv(val_counts_fp, sep='\t')
    
    train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
    train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
    val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

    subprocess.check_output(('python', ACTINN_FORMAT, '-i', train_counts_fp,
                            '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', val_counts_fp,
                            '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))
    # dont use probablity argument or it breaks
    subprocess.check_output(('python', ACTINN_PREDICT, '-trs', train_h5_fp,
                            '-trl', train_annotations_fp, '-ts', val_h5_fp))
    
    prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
    
    df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'predicted': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * prediction_df.shape[0]
    })
    
    df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
    df = df[['cell_id', 'cell_type', 'predicted', 'probability']]
    df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
    
    return df
    
    
    
    


In [10]:
ACTINN_FORMAT = '/home/estorrs/ACTINN/actinn_format.py'
ACTINN_PREDICT = '/home/estorrs/ACTINN/actinn_predict.py'

run_workflow_for_datasets(adata_map, run_actinn_workflow, 'actinn', RESULTS_DIR)

scRNAseq brca (2600, 27131) (11253, 27131)
scRNAseq cesc (1941, 22928) (8449, 22928)
scRNAseq hnscc (2200, 26929) (10288, 26929)
scRNAseq melanoma (2000, 23452) (6735, 23452)
scRNAseq pbmc (940, 32738) (1698, 32738)
scRNAseq pdac (3296, 28756) (15435, 28756)
snATACseq brca (2064, 19891) (9028, 19891)
snATACseq gbm (1316, 19891) (5650, 19891)
snRNAseq brca (2455, 29175) (9490, 29175)
snRNAseq ccrcc (2113, 33538) (8605, 33538)
snRNAseq gbm (1689, 29748) (6810, 29748)


###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
# train.obs['dataset'] = ['train'] * train.shape[0]
# val.obs['dataset'] = ['val'] * val.shape[0]
# combined = train.concatenate(val)
# combined

In [None]:
train_counts_df = pd.DataFrame(data=train.X.transpose().toarray(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
val_counts_df = pd.DataFrame(data=val.X.transpose().toarray(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
train_counts_df

In [None]:
train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
train_counts_df.to_csv(train_counts_fp, sep='\t')
val_counts_df.to_csv(val_counts_fp, sep='\t')

python actinn_format.py -i input_file -o output_prefix -f format

python actinn_format.py -i ./test_data/train_set.txt.gz -o train_set -f txt


In [None]:
train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', train_counts_fp,
                        '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', val_counts_fp,
                        '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
train.obs[[CELL_TYPE_KEY]]

python actinn_predict.py -trs training_set -trl training_label -ts test_set -lr learning_rat -ne num_epoch -ms minibatch_size -pc print_cost -op output_probability


-trs Path to the training set, must be HDF5 format with key "dge".

-trl Path to the training label (the cell types for the training set), must be tab separated text file with no column and row names.

-ts Path to test sets, must be HDF5 format with key "dge".

-lr Learning rate (default: 0.0001). We can increase the learning rate if the cost drops too slow, or decrease the learning rate if the cost drops super fast in the beginning and starts to fluctuate in later epochs.

-ne Number of epochs (default: 50). The number of epochs can be determined by looking at the cost after each epoch. If the cost starts to decrease very slowly after ceartain epoch, then the "ne" parameter should be set to that epoch number.

-ms Minibatch size (default: 128). This parameter can be set larger when training a large dataset.

-pc Print cost (default: True). Whether to print cost after each 5 epochs.

-op Output probabilities for each cell being the cell types in the training data (default: False).


In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp))

In [None]:
' '.join(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp,
                        '-op', 'True'))

In [None]:
prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
prediction_df

In [None]:
df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'prediction': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
df

In [None]:
val.obs

In [None]:
df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
df = df[['cell_id', 'cell_type', 'prediction', 'probability']]
df.columns = ['cell_id', 'groundtruth', 'prediction', 'probability']
df


##### Seurat

In [66]:
def run_seurat_transfer(train, val, cell_type_key):
    # save the input data for the seurat script
    train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                        os.path.join(SANDBOX_DIR, 'val_counts.txt'))
    train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                                os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

    ## prepare train and val count matrices
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                                columns=train.obs.index)
    train_counts.index.name = ''
    # for some reason SCTransform fails if the integer values are too high, so capping them here
    cap = pow(2, 14)
    train_counts.values[train_counts.values>cap] = cap
    train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)
    
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                                columns=val.obs.index)
    val_counts.index.name = ''
    val_counts.values[val_counts.values>cap] = cap
    val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
    val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)
    
    # actually run the script and read the results back in
    prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
    subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                        val_counts_fp, val_annotations_fp, prediction_fp))
    
    # format the predictions dataframe
    df = pd.read_csv(prediction_fp, sep='\t')
    df.index = [x.replace('.', '-') for x in df.index]
    # also remove that weird X thing seurat sometimes puts there if first char is _
    df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
    df = pd.merge(df, val.obs, left_index=True, right_index=True)
    df['cell_id'] = df.index.to_list()
    try:
        df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
        df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
        return df
    except KeyError as e:
        print(f'key error', e)
        return pd.DataFrame()

In [67]:
SEURAT_SCRIPT = '/home/estorrs/pollock/benchmarking/tools/run_seurat_workflow.R'
run_workflow_for_datasets(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_DIR)

scRNAseq brca (2600, 27131) (11253, 27131)
scRNAseq cesc (1941, 22928) (8449, 22928)
scRNAseq hnscc (2200, 26929) (10288, 26929)
scRNAseq melanoma (2000, 23452) (6735, 23452)
key error "['predicted.id', 'prediction.score.max'] not in index"
scRNAseq pbmc (940, 32738) (1698, 32738)
scRNAseq pdac (3296, 28756) (15435, 28756)
snATACseq brca (2064, 19891) (9028, 19891)
key error "['predicted.id', 'prediction.score.max'] not in index"
snATACseq gbm (1316, 19891) (5650, 19891)
key error "['predicted.id', 'prediction.score.max'] not in index"
snRNAseq brca (2455, 29175) (9490, 29175)
snRNAseq ccrcc (2113, 33538) (8605, 33538)
snRNAseq gbm (1689, 29748) (6810, 29748)


###### testing stuff

In [55]:
# train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()
train, val = sc.read_h5ad(adata_map['scRNAseq']['brca']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [56]:
pow(2, 14)

16384

In [57]:
# save the input data for the seurat script
train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                    os.path.join(SANDBOX_DIR, 'val_counts.txt'))
train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                            os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

## prepare train and val count matrices
X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                            columns=train.obs.index)
train_counts.index.name = ''
# for some reason SCTransform fails if the integer values are too high, so capping them here
cap = pow(2, 14)
train_counts.values[train_counts.values>cap] = cap
train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)

X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                            columns=val.obs.index)
val_counts.index.name = ''
val_counts.values[val_counts.values>cap] = cap
val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)

In [58]:
train_counts

Unnamed: 0,_HT062B1_S1PA_AACCATGTCTCTGGTC-1,_HT062B1_S1PA_AAGCGAGAGAACAGGA-1,_HT062B1_S1PA_AAGTCGTCACAGAGAC-1,_HT062B1_S1PA_AATGACCTCAGGACAG-1,_HT062B1_S1PA_AATGGCTAGCACGGAT-1,_HT062B1_S1PA_ACATTTCGTACTAACC-1,_HT062B1_S1PA_ACGTCCTGTGTATACC-1,_HT062B1_S1PA_ACTGTGAGTGTGGACA-1,_HT062B1_S1PA_ACTTCGCCACAAGCAG-1,_HT062B1_S1PA_ATACTTCGTACAAGTA-1,...,_HT171B1_BC2_TTCGCTGGTATGGTAA-1,_HT171B1_BC2_TTCTAGTCAAGTTCCA-1,_HT171B1_BC2_TTCTTGACATGACTGT-1,_HT171B1_BC2_TTGAACGCAAACCACT-1,_HT171B1_BC2_TTGAACGTCAAGCCCG-1,_HT171B1_BC2_TTGAACGTCGGATACT-1,_HT171B1_BC2_TTGAGTGGTCCAATCA-1,_HT171B1_BC2_TTGATGGCAGAGTGTG-1,_HT171B1_BC2_TTGGGATTCGATGCAT-1,_HT171B1_BC2_TTTCAGTGTTCGGTAT-1
,,,,,,,,,,,,,,,,,,,,,
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL732372.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL669831.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL354822.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC004556.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC233755.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
train_counts

In [None]:
type(train_counts.values), type(train_counts.values[0, 0])

In [None]:
vals = sorted(set(train_counts.values.flatten()))
vals

In [None]:
vals[:10], vals[-10:]

In [None]:
train_counts.values[train_counts.values>1000] = 1000

In [None]:
np.where(train_counts>1)

In [59]:
# actually run the script and read the results back in
prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                    val_counts_fp, val_annotations_fp, prediction_fp))



In [65]:
# format the predictions dataframe
df = pd.read_csv(prediction_fp, sep='\t')
df.index = [x.replace('.', '-') for x in df.index]
# also remove that weird X thing seurat sometimes puts there
df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
df = pd.merge(df, val.obs, left_index=True, right_index=True)
df['cell_id'] = df.index.to_list()
df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
df

Unnamed: 0,cell_id,groundtruth,predicted,probability
_HT062B1_S1PA_AAACGAATCGTGGCGT-1,_HT062B1_S1PA_AAACGAATCGTGGCGT-1,CD8 T cell,CD8 T cell,0.928408
_HT062B1_S1PA_AAAGAACAGCTATCTG-1,_HT062B1_S1PA_AAAGAACAGCTATCTG-1,NK,NK,1.000000
_HT062B1_S1PA_AAAGGATTCTGCCTGT-1,_HT062B1_S1PA_AAAGGATTCTGCCTGT-1,Endothelial,Endothelial,1.000000
_HT062B1_S1PA_AAAGGTAGTGGAAGTC-1,_HT062B1_S1PA_AAAGGTAGTGGAAGTC-1,Treg,Treg,0.951956
_HT062B1_S1PA_AACAACCTCACTCACC-1,_HT062B1_S1PA_AACAACCTCACTCACC-1,Endothelial,Endothelial,1.000000
...,...,...,...,...
_HT171B1_BC2_TTTCGATGTGTAGGAC-1,_HT171B1_BC2_TTTCGATGTGTAGGAC-1,CD4 T cell,CD4 T cell,0.916704
_HT171B1_BC2_TTTGACTGTCGATTTG-1,_HT171B1_BC2_TTTGACTGTCGATTTG-1,CD8 T cell,CD8 T cell,0.931814
_HT171B1_BC2_TTTGACTGTCGGTGAA-1,_HT171B1_BC2_TTTGACTGTCGGTGAA-1,CD4 T cell,CD4 T cell,0.914046
_HT171B1_BC2_TTTGGTTCATAGACTC-1,_HT171B1_BC2_TTTGGTTCATAGACTC-1,CD4 T cell,CD4 T cell,0.801086


In [61]:
val.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mito,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,sample,tissue_type,cell_type,cell_type_specific,Piece_ID,Clinical_Subtype,Bulk_PAM50,doublet_score,predicted_doublet,ident
_HT062B1_S1PA_AAACGAATCGTGGCGT-1,TWCE-HT062B1-S1PAA1A1Z1B1,3098.0,1172,0.096191,3487.0,1172,2,2,HT062B1,Tumor tissue,CD8 T cell,NK,HT062B1_S1PA,TNBC,Her2,0.030261,0,2
_HT062B1_S1PA_AAAGAACAGCTATCTG-1,TWCE-HT062B1-S1PAA1A1Z1B1,4761.0,1779,0.051250,4226.0,1779,11,11,HT062B1,Tumor tissue,NK,NK,HT062B1_S1PA,TNBC,Her2,0.018460,0,11
_HT062B1_S1PA_AAAGGATTCTGCCTGT-1,TWCE-HT062B1-S1PAA1A1Z1B1,74559.0,9098,0.003527,4737.0,2302,4,4,HT062B1,Tumor tissue,Endothelial,Endothelial,HT062B1_S1PA,TNBC,Her2,0.083898,0,4
_HT062B1_S1PA_AAAGGTAGTGGAAGTC-1,TWCE-HT062B1-S1PAA1A1Z1B1,3204.0,1250,0.031835,3514.0,1250,0,0,HT062B1,Tumor tissue,Treg,Treg,HT062B1_S1PA,TNBC,Her2,0.030261,0,0
_HT062B1_S1PA_AACAACCTCACTCACC-1,TWCE-HT062B1-S1PAA1A1Z1B1,15285.0,4271,0.054498,4281.0,1968,4,4,HT062B1,Tumor tissue,Endothelial,Endothelial,HT062B1_S1PA,TNBC,Her2,0.031609,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
_HT171B1_BC2_TTTCGATGTGTAGGAC-1,TWCE-HT171B1-BC2,3090.0,1231,0.030421,3479.0,1231,0,0,HT171B1,Tumor tissue,CD4 T cell,CD4_T,HT171B1_S1H8,TNBC,Basal,0.066288,0,0
_HT171B1_BC2_TTTGACTGTCGATTTG-1,TWCE-HT171B1-BC2,3594.0,1549,0.059265,3664.0,1549,8,8,HT171B1,Tumor tissue,CD8 T cell,NK,HT171B1_S1H8,TNBC,Basal,0.053524,0,8
_HT171B1_BC2_TTTGACTGTCGGTGAA-1,TWCE-HT171B1-BC2,1751.0,715,0.083952,3437.0,725,0,0,HT171B1,Tumor tissue,CD4 T cell,CD4_T,HT171B1_S1H8,TNBC,Basal,0.016301,0,0
_HT171B1_BC2_TTTGGTTCATAGACTC-1,TWCE-HT171B1-BC2,3185.0,1283,0.058713,3511.0,1283,0,0,HT171B1,Tumor tissue,CD4 T cell,CD4_T,HT171B1_S1H8,TNBC,Basal,0.072103,0,0


##### SingleCellNet

In [2]:
# !pip install git+https://github.com/pcahan1/PySingleCellNet/

In [12]:
import pySingleCellNet as pySCN

In [13]:
def run_SingleCellNet(train, val, cell_type_key):
    # save the input data for the seurat script
    cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes=100, nRand=100, nTrees=1000, nTopGenePairs=100,
            dLevel=cell_type_key, stratify=True, limitToHVG=True, )
    predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)
    
    df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)
    
    df = df[['cell_type', 'SCN_class']]
    df.columns = ['groundtruth', 'predicted']
    df['cell_id'] = df.index.to_list()
    df['probability'] = [np.nan] * df.shape[0]
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]
    

    return df

In [14]:
run_workflow_for_datasets(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_DIR)

scRNAseq brca (2600, 27131) (11253, 27131)
HVG
Matrix normalized
There are  1104  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1270 top gene pairs

Finished pair transforming the data

scRNAseq cesc (1941, 22928) (8449, 22928)
HVG
Matrix normalized
There are  976  classification genes

CD4 T cell
CD8 T cell
Endothelial
Epithelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
There are 1073 top gene pairs

Finished pair transforming the data

scRNAseq hnscc (2200, 26929) (10288, 26929)
HVG
Matrix normalized
There are  1100  classification genes

B cell
CD4 T cell
CD8 T cell
Endothelial
Erythrocyte
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1073 top gene pairs

Finished pair transforming the data

scRNAseq melanoma (2000, 23452) (6735, 23452)
HVG
Matrix normalized
There are  900  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Fibroblast
Malignant
Monocyte
NK


###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes = 100, nRand = 100, nTrees = 1000 ,nTopGenePairs = 100,
            dLevel = "cell_type", stratify=True, limitToHVG=True, )

In [None]:
predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)


In [None]:
predictions.obs

In [None]:
df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)

df = df[['cell_type', 'SCN_class']]
df.index.name = 'cell_id'
df.columns = ['groundtruth', 'predictions']
df['probability'] = [np.nan] * df.shape[0]
df

##### pollock

###### testing stuff