In [1]:
from pathlib import Path
from collections import Counter
import os
import re
import random
import subprocess

import anndata
import scanpy as sc
import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

In [2]:
# !pip install git+https://github.com/estorrs/mgitools
# !pip install tensorflow==2.1.0

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [5]:
# !pip install -e /home/estorrs/pollock/
import pollock
from pollock.models.model import PollockDataset, PollockModel, load_from_directory, predict_from_anndata

In [6]:
# !conda install -y scanpy

In [7]:
# !pip install git+https://github.com/estorrs/mgitools

In [8]:
CELL_TYPE_KEY = 'cell_type'
N_PER_CELL_TYPE = 200
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/'
RESULTS_DIR = '/home/estorrs/pollock/benchmarking/results/10272020_teir1'
RESULTS_CROSS_DISEASE_DIR = '/home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_disease'
RESULTS_CROSS_DTYPE_DIR = '/home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
Path(RESULTS_CROSS_DISEASE_DIR).mkdir(parents=True, exist_ok=True)
Path(RESULTS_CROSS_DTYPE_DIR).mkdir(parents=True, exist_ok=True)

##### create training and validation datasets

only run if you haven't created these datasets yet

In [None]:
def cap_list(ls, n=100, split=.8, oversample=True):
    """
    Grabs items from a pool.
    
    if split * pool size is greater than n, then just randomly sample 80% of the pool
    otherwise sample 80% of the pool, then oversample so you end up with a final size of n
    """
    # just return list if it is of length 1
    if len(ls) <= 1: return ls
    cap = int(len(ls) * split)
    if cap > n:
        return random.sample(ls, n)

    if oversample:
        pool = random.sample(ls, cap)
        ## oversample to
        return random.choices(pool, k=n)

    return random.sample(ls, cap)

def balancedish_training_generator(adata, cell_type_key, n_per_cell_type, oversample=True, split=.8):
    """
    Return balanced train and validation sets
    """
    cell_type_to_idxs = {}
    for cell_id, cell_type in zip(adata.obs.index, adata.obs[cell_type_key]):
        if cell_type not in cell_type_to_idxs:
            cell_type_to_idxs[cell_type] = [cell_id]
        else:
            cell_type_to_idxs[cell_type].append(cell_id)

    cell_type_to_idxs = {k:cap_list(ls, n_per_cell_type, oversample=oversample, split=split)
                         for k, ls in cell_type_to_idxs.items()}

    train_ids = np.asarray([x for ls in cell_type_to_idxs.values() for x in ls])
    train_idxs = np.arange(adata.shape[0])[np.isin(np.asarray(adata.obs.index), train_ids)]
    val_idxs = np.delete(np.arange(adata.shape[0]), train_idxs)

    train_adata = adata[train_idxs, :]
    val_adata = adata[val_idxs, :]

    return train_adata, val_adata

# def create_train_val_datasets(adata, cell_type_key, oversample=True):
#     counts = Counter(adata.obs[cell_type_key])
#     min_count = counts.most_common()[-1][1]
#     n_per_cell_type = max(min_count, )
#     train_adata, val_adata = balancedish_training_generator(adata, cell_type_key,
#                                                             n_per_cell_type, oversample=oversample)
#     return train_adata, val_adata

In [None]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
fp_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    if '/_train.h5ad' not in fp and '/_val.h5ad' not in fp:
        dtype = fp.split('/')[-2]
        disease = fp.split('/')[-1].replace('.h5ad', '')
        fp_map[dtype][disease] = fp
fp_map

In [None]:
for dtype, d in fp_map.items():
    for disease, fp in d.items():
        print(dtype, disease)
        adata = sc.read_h5ad(fp)
        # check for cell type key
        if CELL_TYPE_KEY not in adata.obs: raise RuntimeError(f'{CELL_TYPE_KEY} not in {fp}')
        
        train_adata, val_adata = balancedish_training_generator(adata, CELL_TYPE_KEY, N_PER_CELL_TYPE)
        # resample validation data to make dataset smaller while keeping rare cell types
        val_adata, _ = balancedish_training_generator(val_adata, CELL_TYPE_KEY, 1000, oversample=False,
                                                     split=1.)
        train_adata.write_h5ad(fp.replace('.h5ad', '_train.h5ad'))
        val_adata.write_h5ad(fp.replace('.h5ad', '_val.h5ad'))
        

##### load in training and validation datasets

In [9]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    dtype = fp.split('/')[-2]
    disease = re.sub(r'^(.*)((_train)|(_val)).h5ad$', r'\1', fp.split('/')[-1])
    if disease not in adata_map[dtype] and '.h5ad' not in disease: adata_map[dtype][disease] = {}
    if 'train.h5ad' in fp:
        adata_map[dtype][disease]['train'] = fp
    if 'val.h5ad' in fp:
        adata_map[dtype][disease]['val'] = fp
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease)

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq pbmc
scRNAseq pdac
snATACseq brca
snATACseq ccrcc
snATACseq gbm
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


In [10]:
adata_map['snATACseq'].pop('ccrcc')

{'train': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/snATACseq/ccrcc_train.h5ad',
 'val': '/home/estorrs/pollock/benchmarking/data/10232020_harmonized/teir_1/snATACseq/ccrcc_val.h5ad'}

### run workflows

In [11]:
def run_workflow_for_datasets(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
#         if dtype != 'snATACseq':
        for disease, m in d.items():
            # make dir if doesnt exist yet
            directory = os.path.join(output_dir, dtype, disease)
            Path(directory).mkdir(parents=True, exist_ok=True)
            train, val = sc.read_h5ad(m['train']), sc.read_h5ad(m['val'])

            print(dtype, disease, train.shape, val.shape)
            run_workflow(workflow, workflow_identifier,
                train, val, directory)
            
def run_workflow_for_cross_disease(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
        for disease1, m1 in d.items():
            for disease2, m2 in d.items():
                if disease1 != disease2:
                    # make dir if doesnt exist yet
                    directory = os.path.join(output_dir, dtype, f'{disease1}_train_{disease2}_val')
                    Path(directory).mkdir(parents=True, exist_ok=True)
                    train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

                    print(dtype, f'{disease1}_train_{disease2}_val', train.shape, val.shape)
                    run_workflow(workflow, workflow_identifier,
                        train, val, directory)
                    
                    
def run_workflow_for_cross_datatype(adata_map, workflow, workflow_identifier, output_dir):
    for dtype1, d1 in adata_map.items():
        for dtype2, d2 in adata_map.items():
            for disease1, m1 in d1.items():
                for disease2, m2 in d2.items():
                    if dtype1 != dtype2:
                        # make dir if doesnt exist yet
                        directory = os.path.join(output_dir, dtype, f'{dtype1}_{disease1}_train_{dtype2}_{disease2}_val')
                        Path(directory).mkdir(parents=True, exist_ok=True)
                        train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

                        print(dtype, f'{dtype1}_{disease1}_train_{dtype2}_{disease2}_val', train.shape, val.shape)
                        run_workflow(workflow, workflow_identifier,
                            train, val, directory)

def run_workflow(workflow, workflow_identifier, train, val, output_dir):
    """
    Run the workflow defined by the workflow function.
    
    workflow function takes a train adata and a val adata as inputs,
    and returns dataframe with cell_id, groundtruth, predicted, and probability columns
    """
    # if it is pollock it needs to know where to save the module
    if workflow_identifier == 'pollock':
        df = workflow(train, val, CELL_TYPE_KEY, os.path.join(output_dir, f'{workflow_identifier}_module'))
    else:
        df = workflow(train, val, CELL_TYPE_KEY)
    df.to_csv(os.path.join(output_dir, f'{workflow_identifier}.tsv'), sep='\t', index=False, header=True)

##### pollock

In [40]:
a = set(['a', 'b', 'c', 'd'])
b = set(['b', 'd', 'c', 'e'])
a.intersection(b)
a.union(b)

{'a', 'b', 'c', 'd', 'e'}

In [45]:
def run_pollock_workflow(train, val, cell_type_key, module_fp):
    train.obs['is_validation'] = [False] * train.shape[0]
    val.obs['is_validation'] = [True] * val.shape[0]
    combined = train.concatenate(val)
    
    pds = PollockDataset(combined.copy(), cell_type_key=cell_type_key,
                     dataset_type='training', validation_key='is_validation')
    
    pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)
    
    pm.fit(pds, epochs=20)
    
    # only score validation if cell types match
    train_cells = set(train.obs[cell_type_key])
    val_cells = set(val.obs[cell_type_key])
    score_val = True if len(train_cells.intersection(val_cells)) == len(train_cells.union(val_cells)) else False
    print(score_val)
    pm.save(pds, module_fp, score_train=True, score_val=score_val)

    preds = predict_from_anndata(val.copy(), module_fp, adata_batch_size=10000)
    
    df = pd.DataFrame.from_dict({
        'cell_id': preds.index.to_list(),
        'groundtruth': val.obs.loc[preds.index][cell_type_key].to_list(),
        'predicted': preds['predicted_cell_type'],
        'probability': preds['cell_type_probability']
    })

    return df

In [46]:
# run_workflow_for_datasets(adata_map, run_pollock_workflow, 'pollock', RESULTS_DIR)

In [47]:
# run_workflow_for_cross_disease(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DTYPE_DIR)

snRNAseq scRNAseq_brca_train_snATACseq_brca_val (2600, 27131) (9028, 19891)


2020-11-12 12:53:26,986 input dataset shape: (11628, 17565)
2020-11-12 12:53:26,988 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 12:53:26,989 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 12:53:27,009 train shape: (2600, 17565), val shape: (9028, 17565)
2020-11-12 12:53:27,010 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 12:53:27,011 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:53:38,123 epoch: 1, train loss: 14.148760795593262, val loss: 112.47914123535156
  if not is_categorical(df_full[k]):
  if not is_categorica

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:09,609 epoch: 13, train loss: 11.504253387451172, val loss: 92.74026489257812
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:17,234 epoch: 14, train loss: 11.456613540649414, val loss: 92.35957336425781
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:24,831 epoch: 15, train loss: 11.401772499084473, val loss: 92.06980895996094
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:32,457 epoch: 16, train loss: 11.35120964050293, val loss: 91.33361053466797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:40,194 epoch: 17, train loss: 11.304377555847168, val loss: 91.21499633789062
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:55:47,758 epoch: 18, train loss: 11.249260902404785, val loss: 90.88

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 12:56:08,624 0 genes in training set are missing from prediction set
2020-11-12 12:56:09,111 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 12:56:09,188 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_brca_train_snATACseq_brca_val/pollock_module
2020-11-12 12:56:09,189 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Erythrocyte' 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma'
 'Treg']
2020-11-12 12:56:09,379 0 genes in training set are missing from prediction set
2020-11-12 12:56:44,023 (9028, 13)
2020-11-12 12:56:44,024 {'Monocyte', 'Endothelial', 'Treg', 'Malignant', 'Dendritic', 'B cell', 'Erythrocyte', 'Fibroblast', 'Mast', 'Plasma', 'NK'}


snRNAseq scRNAseq_brca_train_snATACseq_gbm_val (2600, 27131) (5650, 19891)


2020-11-12 12:56:50,748 input dataset shape: (8250, 17565)
2020-11-12 12:56:50,750 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2020-11-12 12:56:50,751 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 12:56:50,769 train shape: (2600, 17565), val shape: (5650, 17565)
2020-11-12 12:56:50,771 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 12:56:50,772 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):




2020-11-12 12:56:57,941 5 out of the last 50 calls to <function compute_loss at 0x7fbfea59c3b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
  if not is_categorical(df_full[k]):
2020-11-12 12:56:59,283 epoch: 1, train loss: 15.155159950256348, val loss: 35.349430084228516
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:57:05,619 epoch: 2, train loss: 14.320100784301758, val loss: 32.50354766845703
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:57:11,827 epoch: 3, train loss: 14.200750350952148, val loss

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:19,398 epoch: 14, train loss: 12.479121208190918, val loss: 30.325542449951172
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:25,475 epoch: 15, train loss: 12.428862571716309, val loss: 30.23723602294922
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:31,527 epoch: 16, train loss: 12.367630004882812, val loss: 30.27475929260254
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:37,711 epoch: 17, train loss: 12.321467399597168, val loss: 30.229286193847656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:43,782 epoch: 18, train loss: 12.268769264221191, val loss: 30.209583282470703
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:58:49,899 epoch: 19, train loss: 12.220223426818848, val loss: 3

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 12:59:01,085 0 genes in training set are missing from prediction set
2020-11-12 12:59:01,487 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 12:59:01,557 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_brca_train_snATACseq_gbm_val/pollock_module
2020-11-12 12:59:01,558 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Erythrocyte' 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma'
 'Treg']
2020-11-12 12:59:01,675 0 genes in training set are missing from prediction set
2020-11-12 12:59:22,224 (5650, 13)
2020-11-12 12:59:22,225 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'CD8 T cell', 'Malignant', 'Dendritic', 'B cell', 'Fibroblast', 'Plasma', 'NK'}


snRNAseq scRNAseq_cesc_train_snATACseq_brca_val (1941, 22928) (9028, 19891)


2020-11-12 12:59:31,213 input dataset shape: (10969, 16509)
2020-11-12 12:59:31,216 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 12:59:31,217 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 12:59:31,246 train shape: (1941, 16509), val shape: (9028, 16509)
2020-11-12 12:59:31,248 train labels: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-11-12 12:59:31,249 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 12:59:38,574 epoch: 1, train loss: 19.727582931518555, val loss: 95.818359375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:00:41,432 epoch: 13, train loss: 16.256467819213867, val loss: 83.27252197265625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:00:46,612 epoch: 14, train loss: 16.13418197631836, val loss: 82.97295379638672
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:00:51,764 epoch: 15, train loss: 16.06208038330078, val loss: 82.79888916015625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:00:56,853 epoch: 16, train loss: 16.007375717163086, val loss: 82.58894348144531
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:01:02,043 epoch: 17, train loss: 15.935098648071289, val loss: 82.4585189819336
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:01:07,282 epoch: 18, train loss: 15.860664367675781, val loss: 82.3445

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:01:22,484 0 genes in training set are missing from prediction set
2020-11-12 13:01:22,889 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:01:22,951 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_cesc_train_snATACseq_brca_val/pollock_module
2020-11-12 13:01:22,952 ['CD4 T cell' 'CD8 T cell' 'Endothelial' 'Epithelial' 'Erythrocyte'
 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma']
2020-11-12 13:01:23,136 0 genes in training set are missing from prediction set
2020-11-12 13:01:59,548 (9028, 11)
2020-11-12 13:01:59,548 {'Monocyte', 'Endothelial', 'CD4 T cell', 'CD8 T cell', 'Malignant', 'Fibroblast', 'Mast', 'Plasma', 'Epithelial'}


snRNAseq scRNAseq_cesc_train_snATACseq_gbm_val (1941, 22928) (5650, 19891)


2020-11-12 13:02:05,495 input dataset shape: (7591, 16509)
2020-11-12 13:02:05,498 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells']
2020-11-12 13:02:05,499 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:02:05,515 train shape: (1941, 16509), val shape: (5650, 16509)
2020-11-12 13:02:05,516 train labels: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-11-12 13:02:05,517 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:02:11,923 epoch: 1, train loss: 21.458484649658203, val loss: 30.824466705322266
  if not is_categorical(df_full[k]):
  if not is_ca

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:07,850 epoch: 13, train loss: 17.55634307861328, val loss: 27.370681762695312
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:12,478 epoch: 14, train loss: 17.556720733642578, val loss: 27.3211669921875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:17,157 epoch: 15, train loss: 17.387252807617188, val loss: 27.24822425842285
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:21,723 epoch: 16, train loss: 17.27630043029785, val loss: 27.209815979003906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:26,319 epoch: 17, train loss: 17.219758987426758, val loss: 27.179840087890625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:03:30,943 epoch: 18, train loss: 17.161897659301758, val loss: 27.1

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:03:44,170 0 genes in training set are missing from prediction set
2020-11-12 13:03:44,479 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:03:44,535 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_cesc_train_snATACseq_gbm_val/pollock_module
2020-11-12 13:03:44,536 ['CD4 T cell' 'CD8 T cell' 'Endothelial' 'Epithelial' 'Erythrocyte'
 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma']
2020-11-12 13:03:44,636 0 genes in training set are missing from prediction set
2020-11-12 13:04:05,966 (5650, 11)
2020-11-12 13:04:05,967 {'Monocyte', 'Endothelial', 'CD4 T cell', 'CD8 T cell', 'Malignant', 'Fibroblast', 'Erythrocyte', 'Plasma', 'Epithelial'}


snRNAseq scRNAseq_hnscc_train_snATACseq_brca_val (2200, 26929) (9028, 19891)


2020-11-12 13:04:15,494 input dataset shape: (11228, 17615)
2020-11-12 13:04:15,496 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:04:15,497 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:04:15,518 train shape: (2200, 17615), val shape: (9028, 17615)
2020-11-12 13:04:15,520 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:04:15,521 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:04:23,499 epoch: 1, train loss: 15.698946952819824, val loss: 98.63905334472656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:05:33,171 epoch: 13, train loss: 12.814157485961914, val loss: 87.10173034667969
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:05:38,928 epoch: 14, train loss: 12.83912181854248, val loss: 86.98909759521484
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:05:44,680 epoch: 15, train loss: 12.698729515075684, val loss: 86.81157684326172
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:05:50,475 epoch: 16, train loss: 12.627863883972168, val loss: 86.6489486694336
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:05:56,194 epoch: 17, train loss: 12.577813148498535, val loss: 86.44992065429688
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:06:01,908 epoch: 18, train loss: 12.570856094360352, val loss: 86.221

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:06:18,893 0 genes in training set are missing from prediction set
2020-11-12 13:06:19,296 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:06:19,362 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_hnscc_train_snATACseq_brca_val/pollock_module
2020-11-12 13:06:19,363 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Endothelial' 'Erythrocyte'
 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma' 'Treg']
2020-11-12 13:06:19,550 0 genes in training set are missing from prediction set
2020-11-12 13:06:55,652 (9028, 11)
2020-11-12 13:06:55,654 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'Malignant', 'B cell', 'Erythrocyte', 'Plasma'}


snRNAseq scRNAseq_hnscc_train_snATACseq_gbm_val (2200, 26929) (5650, 19891)


2020-11-12 13:07:02,222 input dataset shape: (7850, 17615)
2020-11-12 13:07:02,224 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2020-11-12 13:07:02,225 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:07:02,238 train shape: (2200, 17615), val shape: (5650, 17615)
2020-11-12 13:07:02,240 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:07:02,240 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):




2020-11-12 13:07:08,300 5 out of the last 43 calls to <function compute_loss at 0x7fbfea59c3b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
  if not is_categorical(df_full[k]):
2020-11-12 13:07:09,367 epoch: 1, train loss: 16.912015914916992, val loss: 31.947193145751953
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:07:14,577 epoch: 2, train loss: 15.931706428527832, val loss: 30.945762634277344
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:07:19,738 epoch: 3, train loss: 15.810691833496094, val los

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:16,341 epoch: 14, train loss: 14.006688117980957, val loss: 28.808042526245117
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:21,481 epoch: 15, train loss: 13.953377723693848, val loss: 28.807296752929688
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:26,570 epoch: 16, train loss: 13.864034652709961, val loss: 28.786693572998047
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:31,719 epoch: 17, train loss: 13.776205062866211, val loss: 28.812501907348633
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:36,882 epoch: 18, train loss: 13.753978729248047, val loss: 28.812397003173828
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:08:41,986 epoch: 19, train loss: 13.669191360473633, val loss:

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:08:51,647 0 genes in training set are missing from prediction set
2020-11-12 13:08:52,110 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:08:52,169 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_hnscc_train_snATACseq_gbm_val/pollock_module
2020-11-12 13:08:52,170 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Endothelial' 'Erythrocyte'
 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma' 'Treg']
2020-11-12 13:08:52,285 0 genes in training set are missing from prediction set
2020-11-12 13:09:14,404 (5650, 11)
2020-11-12 13:09:14,405 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'Malignant', 'B cell', 'Plasma', 'NK'}


snRNAseq scRNAseq_melanoma_train_snATACseq_brca_val (2000, 23452) (9028, 19891)


2020-11-12 13:09:23,311 input dataset shape: (11028, 16275)
2020-11-12 13:09:23,313 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:09:23,314 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:09:23,334 train shape: (2000, 16275), val shape: (9028, 16275)
2020-11-12 13:09:23,336 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:09:23,336 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:09:30,692 epoch: 1, train loss: 14.030254364013672, val loss: 102.9658432006836
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:09:36,037 epoch: 2, train

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:33,173 epoch: 13, train loss: 9.539754867553711, val loss: 93.7333984375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:38,376 epoch: 14, train loss: 9.491135597229004, val loss: 93.7606430053711
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:43,624 epoch: 15, train loss: 9.401084899902344, val loss: 93.48091888427734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:48,864 epoch: 16, train loss: 9.385795593261719, val loss: 92.90782928466797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:54,108 epoch: 17, train loss: 9.317974090576172, val loss: 92.86665344238281
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:10:59,240 epoch: 18, train loss: 9.29171085357666, val loss: 92.4864425659179

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:11:14,241 0 genes in training set are missing from prediction set
2020-11-12 13:11:14,580 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:11:14,647 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_melanoma_train_snATACseq_brca_val/pollock_module
2020-11-12 13:11:14,648 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Fibroblast' 'Malignant'
 'Monocyte' 'NK' 'Plasma' 'Treg']
2020-11-12 13:11:14,790 0 genes in training set are missing from prediction set
2020-11-12 13:11:50,040 (9028, 10)
2020-11-12 13:11:50,042 {'Monocyte', 'Treg', 'CD8 T cell', 'Dendritic', 'Malignant', 'B cell', 'Fibroblast', 'Plasma', 'NK'}


snRNAseq scRNAseq_melanoma_train_snATACseq_gbm_val (2000, 23452) (5650, 19891)


2020-11-12 13:11:56,869 input dataset shape: (7650, 16275)
2020-11-12 13:11:56,873 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2020-11-12 13:11:56,875 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:11:56,898 train shape: (2000, 16275), val shape: (5650, 16275)
2020-11-12 13:11:56,900 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:11:56,901 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:12:03,349 epoch: 1, train loss: 14.799345970153809, val loss: 34.46803665161133
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:1

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:12:57,607 epoch: 13, train loss: 10.373287200927734, val loss: 30.81534194946289
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:13:02,082 epoch: 14, train loss: 10.349776268005371, val loss: 30.71155548095703
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:13:06,635 epoch: 15, train loss: 10.341835021972656, val loss: 30.671064376831055
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:13:11,169 epoch: 16, train loss: 10.293456077575684, val loss: 30.618833541870117
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:13:15,604 epoch: 17, train loss: 10.244905471801758, val loss: 30.58670997619629
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:13:20,166 epoch: 18, train loss: 10.138236045837402, val loss: 30

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:13:32,992 0 genes in training set are missing from prediction set
2020-11-12 13:13:33,444 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:13:33,501 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_melanoma_train_snATACseq_gbm_val/pollock_module
2020-11-12 13:13:33,502 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Fibroblast' 'Malignant'
 'Monocyte' 'NK' 'Plasma' 'Treg']
2020-11-12 13:13:33,585 0 genes in training set are missing from prediction set
2020-11-12 13:13:53,803 (5650, 10)
2020-11-12 13:13:53,804 {'Monocyte', 'Malignant', 'Dendritic', 'B cell', 'Fibroblast', 'Plasma'}


snRNAseq scRNAseq_pbmc_train_snATACseq_brca_val (940, 32738) (9028, 19891)


2020-11-12 13:14:05,591 input dataset shape: (9968, 18919)
2020-11-12 13:14:05,594 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Treg']
2020-11-12 13:14:05,595 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:14:05,613 train shape: (940, 18919), val shape: (9028, 18919)
2020-11-12 13:14:05,615 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Megakaryocyte', 'Monocyte', 'NK']
2020-11-12 13:14:05,616 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:14:11,959 epoch: 1, train loss: 9.464248657226562, val loss: 104.01560974121094
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:14:16,024 epoch: 2, train loss: 8.45656013488769

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:14:54,675 epoch: 13, train loss: 6.155814170837402, val loss: 118.42164611816406
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:14:58,231 epoch: 14, train loss: 6.08473014831543, val loss: 119.12492370605469
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:15:01,763 epoch: 15, train loss: 6.002589225769043, val loss: 118.47379302978516
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:15:05,312 epoch: 16, train loss: 5.955848217010498, val loss: 118.38456726074219
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:15:08,853 epoch: 17, train loss: 5.935043811798096, val loss: 117.41747283935547
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:15:12,386 epoch: 18, train loss: 5.902036190032959, val loss: 116.88

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:15:23,551 0 genes in training set are missing from prediction set
2020-11-12 13:15:24,023 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:15:24,080 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_pbmc_train_snATACseq_brca_val/pollock_module
2020-11-12 13:15:24,081 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Megakaryocyte' 'Monocyte'
 'NK']
2020-11-12 13:15:24,214 0 genes in training set are missing from prediction set
2020-11-12 13:16:03,717 (9028, 7)
2020-11-12 13:16:03,721 {'Monocyte', 'CD8 T cell', 'Dendritic', 'B cell', 'NK'}


snRNAseq scRNAseq_pbmc_train_snATACseq_gbm_val (940, 32738) (5650, 19891)


2020-11-12 13:16:09,680 input dataset shape: (6590, 18919)
2020-11-12 13:16:09,683 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Megakaryocyte', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'T cells']
2020-11-12 13:16:09,684 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:16:09,695 train shape: (940, 18919), val shape: (5650, 18919)
2020-11-12 13:16:09,697 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Megakaryocyte', 'Monocyte', 'NK']
2020-11-12 13:16:09,698 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):




2020-11-12 13:16:13,853 5 out of the last 22 calls to <function compute_loss at 0x7fbfea59c3b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-11-12 13:16:13,943 6 out of the last 23 calls to <function compute_loss at 0x7fbfea59c3b0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
  if not is_categorical(df_full[k]):
2020-11-12 13:16:14,566 epoch: 1, train loss: 10.023908615112305, val loss: 34.77587127685547
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:17,427 epoch: 2, train loss: 8.883955955505371, val loss: 33.68832015991211
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:20,323 epoch: 3, train loss: 7.594048500061035, val loss: 3

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:51,308 epoch: 14, train loss: 6.606990337371826, val loss: 36.844032287597656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:54,123 epoch: 15, train loss: 6.5328369140625, val loss: 36.90156173706055
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:56,914 epoch: 16, train loss: 6.4414801597595215, val loss: 36.76554489135742
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:16:59,720 epoch: 17, train loss: 6.402044296264648, val loss: 36.53397750854492
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:17:02,502 epoch: 18, train loss: 6.368263244628906, val loss: 36.56194305419922
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:17:05,291 epoch: 19, train loss: 6.334833145141602, val loss: 36.4484024

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:17:11,137 0 genes in training set are missing from prediction set
2020-11-12 13:17:11,604 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:17:11,653 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_pbmc_train_snATACseq_gbm_val/pollock_module
2020-11-12 13:17:11,654 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Megakaryocyte' 'Monocyte'
 'NK']
2020-11-12 13:17:11,735 0 genes in training set are missing from prediction set
2020-11-12 13:17:35,610 (5650, 7)
2020-11-12 13:17:35,612 {'Monocyte', 'CD8 T cell', 'Dendritic', 'B cell', 'NK'}


snRNAseq scRNAseq_pdac_train_snATACseq_brca_val (3296, 28756) (9028, 19891)


2020-11-12 13:17:49,828 input dataset shape: (12324, 17904)
2020-11-12 13:17:49,833 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2020-11-12 13:17:49,834 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:17:49,856 train shape: (3296, 17904), val shape: (9028, 17904)
2020-11-12 13:17:49,857 train labels: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2020-11-12 13:17:49,858 val labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:18:01,342 epoch: 1, train loss: 14.770638465881348, val lo

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:19:38,432 epoch: 13, train loss: 12.29512882232666, val loss: 87.2849349975586
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:19:46,457 epoch: 14, train loss: 12.21860408782959, val loss: 87.12706756591797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:19:54,408 epoch: 15, train loss: 12.159963607788086, val loss: 87.2307357788086
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:20:02,359 epoch: 16, train loss: 12.093198776245117, val loss: 87.09492492675781
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:20:10,286 epoch: 17, train loss: 12.046549797058105, val loss: 86.93444061279297
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:20:18,071 epoch: 18, train loss: 12.005940437316895, val loss: 87.08102

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:20:40,320 0 genes in training set are missing from prediction set
2020-11-12 13:20:40,810 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:20:40,900 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_pdac_train_snATACseq_brca_val/pollock_module
2020-11-12 13:20:40,901 ['Acinar' 'B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Epithelial' 'Erythrocyte' 'Fibroblast' 'Islet' 'Malignant' 'Mast'
 'Monocyte' 'NK' 'Plasma' 'Treg' 'Tuft']
2020-11-12 13:20:41,091 0 genes in training set are missing from prediction set
2020-11-12 13:21:15,950 (9028, 17)
2020-11-12 13:21:15,952 {'Monocyte', 'Endothelial', 'Treg', 'Acinar', 'CD8 T cell', 'Dendritic', 'Malignant', 'Epithelial', 'Tuft', 'Fibroblast', 'Erythrocyte', 'Plasma', 'Islet', 'NK'}


snRNAseq scRNAseq_pdac_train_snATACseq_gbm_val (3296, 28756) (5650, 19891)


2020-11-12 13:21:23,242 input dataset shape: (8946, 17904)
2020-11-12 13:21:23,244 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg', 'Tuft']
2020-11-12 13:21:23,245 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:21:23,265 train shape: (3296, 17904), val shape: (5650, 17904)
2020-11-12 13:21:23,266 train labels: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2020-11-12 13:21:23,267 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:21:32,644 epoch: 1, train 

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:22:55,118 epoch: 13, train loss: 13.453126907348633, val loss: 29.41242027282715
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:23:01,952 epoch: 14, train loss: 13.405933380126953, val loss: 29.379850387573242
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:23:08,607 epoch: 15, train loss: 13.339804649353027, val loss: 29.340347290039062
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:23:15,474 epoch: 16, train loss: 13.266844749450684, val loss: 29.264053344726562
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:23:22,194 epoch: 17, train loss: 13.2191162109375, val loss: 29.191967010498047
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:23:28,953 epoch: 18, train loss: 13.163954734802246, val loss: 29

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:23:48,093 0 genes in training set are missing from prediction set
2020-11-12 13:23:48,560 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:23:48,634 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_pdac_train_snATACseq_gbm_val/pollock_module
2020-11-12 13:23:48,635 ['Acinar' 'B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Epithelial' 'Erythrocyte' 'Fibroblast' 'Islet' 'Malignant' 'Mast'
 'Monocyte' 'NK' 'Plasma' 'Treg' 'Tuft']
2020-11-12 13:23:48,717 0 genes in training set are missing from prediction set
2020-11-12 13:24:09,442 (5650, 17)
2020-11-12 13:24:09,443 {'Monocyte', 'Endothelial', 'Acinar', 'CD8 T cell', 'Malignant', 'Dendritic', 'Epithelial', 'B cell', 'Erythrocyte', 'Fibroblast', 'Plasma', 'Islet', 'NK'}


snRNAseq scRNAseq_brca_train_snRNAseq_brca_val (2600, 27131) (9490, 29175)


2020-11-12 13:24:21,244 input dataset shape: (12090, 25674)
2020-11-12 13:24:21,247 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:24:21,247 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:24:21,261 train shape: (2600, 25674), val shape: (9490, 25674)
2020-11-12 13:24:21,263 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:24:21,264 val labels: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:24:32,474 epoch: 1, train loss: 30.03271484375, val loss: 54.04741668701172
  if not is_categorical(df_f

2020-11-12 13:26:04,026 epoch: 12, train loss: 26.763952255249023, val loss: 50.38566970825195
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:26:12,453 epoch: 13, train loss: 26.683664321899414, val loss: 50.336673736572266
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:26:20,976 epoch: 14, train loss: 26.56901741027832, val loss: 50.27118682861328
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:26:29,316 epoch: 15, train loss: 26.454654693603516, val loss: 50.244476318359375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:26:37,675 epoch: 16, train loss: 26.352073669433594, val loss: 50.25196838378906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:26:46,120 epoch: 17, train loss: 26.240346908569336, val loss: 50.28573989868164
  if not is_categorical(df_full[k]):
  if not is_categorica

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:27:18,139 0 genes in training set are missing from prediction set
2020-11-12 13:27:18,726 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:27:18,796 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_brca_train_snRNAseq_brca_val/pollock_module
2020-11-12 13:27:18,797 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Erythrocyte' 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma'
 'Treg']
2020-11-12 13:27:18,862 0 genes in training set are missing from prediction set
2020-11-12 13:27:43,416 (9490, 13)
2020-11-12 13:27:43,417 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'CD8 T cell', 'Dendritic', 'Malignant', 'Erythrocyte', 'Fibroblast', 'Mast', 'Plasma', 'NK'}


snRNAseq scRNAseq_brca_train_snRNAseq_ccrcc_val (2600, 27131) (8605, 33538)


2020-11-12 13:27:54,748 input dataset shape: (11205, 27131)
2020-11-12 13:27:54,751 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:27:54,752 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:27:54,766 train shape: (2600, 27131), val shape: (8605, 27131)
2020-11-12 13:27:54,768 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:27:54,768 val labels: ['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:28:06,008 epoch: 1, train loss: 33.53917694091797, val loss: 65.31121826171875
  if not is_categorical(df_full[k]):
  if

2020-11-12 13:29:42,155 epoch: 12, train loss: 30.421327590942383, val loss: 61.147274017333984
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:29:50,899 epoch: 13, train loss: 30.312049865722656, val loss: 61.214599609375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:29:59,634 epoch: 14, train loss: 30.164737701416016, val loss: 61.33417510986328
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:30:08,492 epoch: 15, train loss: 30.05417251586914, val loss: 61.30610656738281
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:30:17,201 epoch: 16, train loss: 29.96186637878418, val loss: 61.333274841308594
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:30:25,957 epoch: 17, train loss: 29.859785079956055, val loss: 61.531166076660156
  if not is_categorical(df_full[k]):
  if not is_categorical(

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:30:59,675 0 genes in training set are missing from prediction set
2020-11-12 13:31:00,285 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:31:00,364 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_brca_train_snRNAseq_ccrcc_val/pollock_module
2020-11-12 13:31:00,365 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Erythrocyte' 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma'
 'Treg']
2020-11-12 13:31:00,430 0 genes in training set are missing from prediction set
2020-11-12 13:31:24,146 (8605, 13)
2020-11-12 13:31:24,148 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'CD8 T cell', 'Dendritic', 'Malignant', 'B cell', 'Erythrocyte', 'Fibroblast', 'Plasma', 'NK'}


snRNAseq scRNAseq_brca_train_snRNAseq_gbm_val (2600, 27131) (6810, 29748)


2020-11-12 13:31:33,487 input dataset shape: (9410, 25705)
2020-11-12 13:31:33,490 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2020-11-12 13:31:33,491 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:31:33,512 train shape: (2600, 25705), val shape: (6810, 25705)
2020-11-12 13:31:33,514 train labels: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:31:33,515 val labels: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:31:43,649 epoch: 1, train loss: 32.74538040161133, val loss: 49.0059928894043
  if not is_catego

2020-11-12 13:33:09,029 epoch: 12, train loss: 29.413063049316406, val loss: 46.04637908935547
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:33:16,762 epoch: 13, train loss: 29.306753158569336, val loss: 46.024810791015625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:33:24,461 epoch: 14, train loss: 29.191587448120117, val loss: 46.02787780761719
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:33:32,386 epoch: 15, train loss: 29.095373153686523, val loss: 46.015357971191406
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:33:40,408 epoch: 16, train loss: 28.95354461669922, val loss: 46.14450454711914
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:33:48,207 epoch: 17, train loss: 28.830507278442383, val loss: 46.166969299316406
  if not is_categorical(df_full[k]):
  if not is_categoric

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:34:18,127 0 genes in training set are missing from prediction set
2020-11-12 13:34:18,689 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:34:18,743 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_brca_train_snRNAseq_gbm_val/pollock_module
2020-11-12 13:34:18,744 ['B cell' 'CD4 T cell' 'CD8 T cell' 'Dendritic' 'Endothelial'
 'Erythrocyte' 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma'
 'Treg']
2020-11-12 13:34:18,790 0 genes in training set are missing from prediction set
2020-11-12 13:34:36,633 (6810, 13)
2020-11-12 13:34:36,634 {'Monocyte', 'Endothelial', 'CD4 T cell', 'Treg', 'Malignant', 'Dendritic', 'B cell', 'Fibroblast', 'Erythrocyte', 'Plasma', 'NK'}


snRNAseq scRNAseq_cesc_train_snRNAseq_brca_val (1941, 22928) (9490, 29175)


2020-11-12 13:34:46,525 input dataset shape: (11431, 22001)
2020-11-12 13:34:46,527 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:34:46,528 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:34:46,538 train shape: (1941, 22001), val shape: (9490, 22001)
2020-11-12 13:34:46,540 train labels: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-11-12 13:34:46,541 val labels: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:34:54,950 epoch: 1, train loss: 39.202301025390625, val loss: 47.81297302246094
  if not is_categorical(df_

2020-11-12 13:36:01,885 epoch: 12, train loss: 34.53059005737305, val loss: 44.43861770629883
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:36:08,007 epoch: 13, train loss: 34.22859191894531, val loss: 44.39051055908203
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:36:14,143 epoch: 14, train loss: 34.242210388183594, val loss: 44.43193817138672
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:36:20,241 epoch: 15, train loss: 34.15685272216797, val loss: 44.41658401489258
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:36:26,360 epoch: 16, train loss: 34.03807830810547, val loss: 44.42867660522461
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:36:32,576 epoch: 17, train loss: 33.80657196044922, val loss: 44.435768127441406
  if not is_categorical(df_full[k]):
  if not is_categorical(df_

False


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-11-12 13:36:56,722 0 genes in training set are missing from prediction set
2020-11-12 13:36:57,254 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-11-12 13:36:57,313 /home/estorrs/pollock/benchmarking/results/10272020_teir1_cross_datatype/snRNAseq/scRNAseq_cesc_train_snRNAseq_brca_val/pollock_module
2020-11-12 13:36:57,314 ['CD4 T cell' 'CD8 T cell' 'Endothelial' 'Epithelial' 'Erythrocyte'
 'Fibroblast' 'Malignant' 'Mast' 'Monocyte' 'NK' 'Plasma']
2020-11-12 13:36:57,376 0 genes in training set are missing from prediction set
2020-11-12 13:37:23,202 (9490, 11)
2020-11-12 13:37:23,202 {'Monocyte', 'Endothelial', 'CD4 T cell', 'CD8 T cell', 'Malignant', 'Mast', 'Fibroblast', 'Erythrocyte', 'Plasma', 'Epithelial'}


snRNAseq scRNAseq_cesc_train_snRNAseq_ccrcc_val (1941, 22928) (8605, 33538)


2020-11-12 13:37:32,699 input dataset shape: (10546, 22919)
2020-11-12 13:37:32,701 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-11-12 13:37:32,702 using validation key
  if not is_categorical(df_full[k]):
2020-11-12 13:37:32,719 train shape: (1941, 22919), val shape: (8605, 22919)
2020-11-12 13:37:32,720 train labels: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-11-12 13:37:32,721 val labels: ['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:37:41,542 epoch: 1, train loss: 41.570030212402344, val loss: 57.091583251953125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_fu

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:38:56,028 epoch: 13, train loss: 36.934600830078125, val loss: 53.257808685302734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:39:02,345 epoch: 14, train loss: 36.534446716308594, val loss: 53.34333419799805
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:39:08,397 epoch: 15, train loss: 36.74250411987305, val loss: 53.37657928466797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-11-12 13:39:14,626 epoch: 16, train loss: 36.515846252441406, val loss: 53.339935302734375


###### testing stuff

In [22]:
# a = sc.read_h5ad(adata_map['snATACseq']['gbm']['train'])
# a

In [13]:
train, val = sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [14]:
module_dir = os.path.join(SANDBOX_DIR, 'temp_module')

In [15]:
train.obs['is_validation'] = [False] * train.shape[0]
val.obs['is_validation'] = [True] * val.shape[0]
combined = train.concatenate(val)
combined

AnnData object with n_obs × n_vars = 12193 × 18511
    obs: 'leiden', 'cell_type', 'is_validation', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'sample', 'tissue_type', 'cell_type_specific', 'Piece_ID', 'Clinical_Subtype', 'Bulk_PAM50', 'doublet_score', 'predicted_doublet', 'ident', 'batch'
    var: 'gene_ids-0', 'sct.detection_rate-1', 'sct.gmean-1', 'sct.variance-1', 'sct.residual_mean-1', 'sct.residual_variance-1', 'sct.variable-1'

In [18]:
train.shape, val.shape

((940, 32738), (11253, 27131))

In [17]:
np.count_nonzero(combined.obs['is_validation']), np.count_nonzero(~combined.obs['is_validation'])

(11253, 940)

In [None]:
# pds = PollockDataset(train, cell_type_key=CELL_TYPE_KEY,
#                      dataset_type='training')

In [24]:
val.shape

(11253, 27131)

In [26]:
pds = PollockDataset(combined, cell_type_key=CELL_TYPE_KEY,
                     dataset_type='training', validation_key='is_validation')

2020-11-12 09:46:56,627 normalizing the expression counts for model training
2020-11-12 09:47:01,436 input dataset shape: (12193, 18511)
2020-11-12 09:47:01,439 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
  if not is_categorical(df_full[k]):
2020-11-12 09:47:01,452 train shape: (940, 18511), val shape: (11253, 18511)
2020-11-12 09:47:03,561 training dataset shape: (940, 18511)
2020-11-12 09:47:03,563 validation dataset shape: (11253, 18511)


In [None]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)

In [None]:
pm.fit(pds, epochs=2)

In [None]:
pm.save(pds, module_dir)

In [None]:
val.shape

In [None]:
preds = predict_from_anndata(val.copy(),
        '/home/estorrs/pollock/benchmarking/sandbox/temp_module', adata_batch_size=10000)
preds

In [None]:
df = pd.DataFrame.from_dict({
    'cell_id': preds.index.to_list(),
    'groundtruth': val.obs.loc[preds.index][CELL_TYPE_KEY].to_list(),
    'predicted': preds['predicted_cell_type'],
    'probability': preds['cell_type_probability']
})
df

##### scanpy ingest

In [None]:
def ingest_preprocess(adata):
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata)
    
    return adata

def run_scanpy_workflow(train, val, cell_type_key):
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names]
    val = val[:, var_names]
    
    groundtruth = val.obs[cell_type_key].to_list()

    sc.pp.pca(train)
    sc.pp.neighbors(train)
    sc.tl.umap(train)
    
    sc.tl.ingest(val, train, obs=cell_type_key)
    
    df = pd.DataFrame.from_dict({
        'cell_id': val.obs.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': val.obs[cell_type_key].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
    
    return df

In [None]:
run_workflow_for_datasets(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
train, val = ingest_preprocess(train), ingest_preprocess(val)

var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

sc.pp.pca(train)
sc.pp.neighbors(train)
sc.tl.umap(train)

In [None]:
sc.pl.umap(train, color='cell_type')

In [None]:
sc.tl.ingest(val, train, obs=CELL_TYPE_KEY)
val.uns[f'{CELL_TYPE_KEY}_colors'] = train.uns[f'{CELL_TYPE_KEY}_colors']

In [None]:
sc.pl.umap(val, color=[CELL_TYPE_KEY], wspace=0.5)


In [None]:
val

In [None]:
val.obs

##### ACTINN

In [None]:
def run_actinn_workflow(train, val, cell_type_key):
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts_df = pd.DataFrame(data=X.transpose(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts_df = pd.DataFrame(data=X.transpose(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
    
    train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
    val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
    train_counts_df.to_csv(train_counts_fp, sep='\t')
    val_counts_df.to_csv(val_counts_fp, sep='\t')
    
    train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
    train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
    val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

    subprocess.check_output(('python', ACTINN_FORMAT, '-i', train_counts_fp,
                            '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', val_counts_fp,
                            '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))
    # dont use probablity argument or it breaks
    subprocess.check_output(('python', ACTINN_PREDICT, '-trs', train_h5_fp,
                            '-trl', train_annotations_fp, '-ts', val_h5_fp))
    
    prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
    
    df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'predicted': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * prediction_df.shape[0]
    })
    
    df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
    df = df[['cell_id', 'cell_type', 'predicted', 'probability']]
    df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
    
    return df
    
    
    
    


In [None]:
ACTINN_FORMAT = '/home/estorrs/ACTINN/actinn_format.py'
ACTINN_PREDICT = '/home/estorrs/ACTINN/actinn_predict.py'

run_workflow_for_datasets(adata_map, run_actinn_workflow, 'actinn', RESULTS_DIR)

###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
# train.obs['dataset'] = ['train'] * train.shape[0]
# val.obs['dataset'] = ['val'] * val.shape[0]
# combined = train.concatenate(val)
# combined

In [None]:
train_counts_df = pd.DataFrame(data=train.X.transpose().toarray(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
val_counts_df = pd.DataFrame(data=val.X.transpose().toarray(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
train_counts_df

In [None]:
train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
train_counts_df.to_csv(train_counts_fp, sep='\t')
val_counts_df.to_csv(val_counts_fp, sep='\t')

python actinn_format.py -i input_file -o output_prefix -f format

python actinn_format.py -i ./test_data/train_set.txt.gz -o train_set -f txt


In [None]:
train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', train_counts_fp,
                        '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', val_counts_fp,
                        '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
train.obs[[CELL_TYPE_KEY]]

python actinn_predict.py -trs training_set -trl training_label -ts test_set -lr learning_rat -ne num_epoch -ms minibatch_size -pc print_cost -op output_probability


-trs Path to the training set, must be HDF5 format with key "dge".

-trl Path to the training label (the cell types for the training set), must be tab separated text file with no column and row names.

-ts Path to test sets, must be HDF5 format with key "dge".

-lr Learning rate (default: 0.0001). We can increase the learning rate if the cost drops too slow, or decrease the learning rate if the cost drops super fast in the beginning and starts to fluctuate in later epochs.

-ne Number of epochs (default: 50). The number of epochs can be determined by looking at the cost after each epoch. If the cost starts to decrease very slowly after ceartain epoch, then the "ne" parameter should be set to that epoch number.

-ms Minibatch size (default: 128). This parameter can be set larger when training a large dataset.

-pc Print cost (default: True). Whether to print cost after each 5 epochs.

-op Output probabilities for each cell being the cell types in the training data (default: False).


In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp))

In [None]:
' '.join(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp,
                        '-op', 'True'))

In [None]:
prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
prediction_df

In [None]:
df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'prediction': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
df

In [None]:
val.obs

In [None]:
df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
df = df[['cell_id', 'cell_type', 'prediction', 'probability']]
df.columns = ['cell_id', 'groundtruth', 'prediction', 'probability']
df


##### Seurat

In [None]:
def run_seurat_transfer(train, val, cell_type_key):
    # save the input data for the seurat script
    train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                        os.path.join(SANDBOX_DIR, 'val_counts.txt'))
    train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                                os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

    ## prepare train and val count matrices
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                                columns=train.obs.index)
    train_counts.index.name = ''
    # for some reason SCTransform fails if the integer values are too high, so capping them here
    cap = pow(2, 14)
    train_counts.values[train_counts.values>cap] = cap
    train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)
    
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                                columns=val.obs.index)
    val_counts.index.name = ''
    val_counts.values[val_counts.values>cap] = cap
    val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
    val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)
    
    # actually run the script and read the results back in
    prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
    try:
        subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                            val_counts_fp, val_annotations_fp, prediction_fp))
    except subprocess.CalledProcessError as e:
        print(f'called process error', e)
        return pd.DataFrame()
    
    # format the predictions dataframe
    df = pd.read_csv(prediction_fp, sep='\t')
    df.index = [x.replace('.', '-') for x in df.index]
    # also remove that weird X thing seurat sometimes puts there if first char is _
    df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
    df = pd.merge(df, val.obs, left_index=True, right_index=True)
    df['cell_id'] = df.index.to_list()
    try:
        df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
        df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
        return df
    except KeyError as e:
        print(f'key error', e)
        return pd.DataFrame()

In [None]:
SEURAT_SCRIPT = '/home/estorrs/pollock/benchmarking/tools/run_seurat_workflow.R'
run_workflow_for_datasets(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()
train, val = sc.read_h5ad(adata_map['scRNAseq']['brca']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
pow(2, 14)

In [None]:
# save the input data for the seurat script
train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                    os.path.join(SANDBOX_DIR, 'val_counts.txt'))
train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                            os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

## prepare train and val count matrices
X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                            columns=train.obs.index)
train_counts.index.name = ''
# for some reason SCTransform fails if the integer values are too high, so capping them here
cap = pow(2, 14)
train_counts.values[train_counts.values>cap] = cap
train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)

X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                            columns=val.obs.index)
val_counts.index.name = ''
val_counts.values[val_counts.values>cap] = cap
val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)

In [None]:
train_counts

In [None]:
train_counts

In [None]:
type(train_counts.values), type(train_counts.values[0, 0])

In [None]:
vals = sorted(set(train_counts.values.flatten()))
vals

In [None]:
vals[:10], vals[-10:]

In [None]:
train_counts.values[train_counts.values>1000] = 1000

In [None]:
np.where(train_counts>1)

In [None]:
# actually run the script and read the results back in
prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                    val_counts_fp, val_annotations_fp, prediction_fp))

In [None]:
# format the predictions dataframe
df = pd.read_csv(prediction_fp, sep='\t')
df.index = [x.replace('.', '-') for x in df.index]
# also remove that weird X thing seurat sometimes puts there
df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
df = pd.merge(df, val.obs, left_index=True, right_index=True)
df['cell_id'] = df.index.to_list()
df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
df

In [None]:
val.obs

##### SingleCellNet

In [None]:
# !pip install git+https://github.com/pcahan1/PySingleCellNet/

In [None]:
import pySingleCellNet as pySCN

In [None]:
def run_SingleCellNet(train, val, cell_type_key):
    # save the input data for the seurat script
    cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes=100, nRand=100, nTrees=1000, nTopGenePairs=100,
            dLevel=cell_type_key, stratify=True, limitToHVG=True, )
    predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)
    
    df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)
    
    df = df[['cell_type', 'SCN_class']]
    df.columns = ['groundtruth', 'predicted']
    df['cell_id'] = df.index.to_list()
    df['probability'] = [np.nan] * df.shape[0]
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]
    

    return df

In [None]:
run_workflow_for_datasets(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes = 100, nRand = 100, nTrees = 1000 ,nTopGenePairs = 100,
            dLevel = "cell_type", stratify=True, limitToHVG=True, )

In [None]:
predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)


In [None]:
predictions.obs

In [None]:
df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)

df = df[['cell_type', 'SCN_class']]
df.index.name = 'cell_id'
df.columns = ['groundtruth', 'predictions']
df['probability'] = [np.nan] * df.shape[0]
df

##### pollock

###### testing stuff