In [1]:
from pathlib import Path
from collections import Counter
import os
import re
import random
import subprocess

import anndata
import scanpy as sc
import pandas as pd
import numpy as np
import scipy

import mgitools.os_helpers as os_helpers

In [2]:
# !conda install -y scanpy
# !pip install git+https://github.com/estorrs/mgitools

In [3]:
# !pip install git+https://github.com/estorrs/mgitools
# !pip install tensorflow==2.1.0

In [4]:
%load_ext autoreload

In [5]:
%autoreload 2

In [6]:
# !pip install -e /home/estorrs/pollock/
import pollock
from pollock.models.model import PollockDataset, PollockModel, load_from_directory, predict_from_anndata

In [7]:
# !conda install -y scanpy

In [8]:
# !pip install git+https://github.com/estorrs/mgitools

In [2]:
CELL_TYPE_KEY = 'cell_type'
N_PER_CELL_TYPE = 500
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/'
RESULTS_DIR = '/home/estorrs/pollock/benchmarking/results/01272021_harmonized_v2/'
RESULTS_CROSS_DISEASE_DIR = '/home/estorrs/pollock/benchmarking/results/01272021_teir1_cross_disease_v2'
# RESULTS_CROSS_DTYPE_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1_cross_datatype'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
Path(RESULTS_CROSS_DISEASE_DIR).mkdir(parents=True, exist_ok=True)
# Path(RESULTS_CROSS_DTYPE_DIR).mkdir(parents=True, exist_ok=True)

##### create training and validation datasets

only run if you haven't created these datasets yet

In [12]:
def cap_list(ls, n=100, split=.8, oversample=True):
    """
    Grabs items from a pool.
    
    if split * pool size is greater than n, then just randomly sample 80% of the pool
    otherwise sample 80% of the pool, then oversample so you end up with a final size of n
    """
    # just return list if it is of length 1
    if len(ls) <= 1: return ls
    cap = int(len(ls) * split)
    if cap > n:
        return random.sample(ls, n)

    if oversample:
        pool = random.sample(ls, cap)
        ## oversample to
        return random.choices(pool, k=n)

    return random.sample(ls, cap)

def balancedish_training_generator(adata, cell_type_key, n_per_cell_type, oversample=True, split=.8):
    """
    Return balanced train and validation sets
    """
    cell_type_to_idxs = {}
    for cell_id, cell_type in zip(adata.obs.index, adata.obs[cell_type_key]):
        if cell_type not in cell_type_to_idxs:
            cell_type_to_idxs[cell_type] = [cell_id]
        else:
            cell_type_to_idxs[cell_type].append(cell_id)

    cell_type_to_idxs = {k:cap_list(ls, n_per_cell_type, oversample=oversample, split=split)
                         for k, ls in cell_type_to_idxs.items()}

    train_ids = np.asarray([x for ls in cell_type_to_idxs.values() for x in ls])
    train_idxs = np.arange(adata.shape[0])[np.isin(np.asarray(adata.obs.index), train_ids)]
    val_idxs = np.delete(np.arange(adata.shape[0]), train_idxs)

    train_adata = adata[train_idxs, :]
    val_adata = adata[val_idxs, :]

    return train_adata, val_adata

# def create_train_val_datasets(adata, cell_type_key, oversample=True):
#     counts = Counter(adata.obs[cell_type_key])
#     min_count = counts.most_common()[-1][1]
#     n_per_cell_type = max(min_count, )
#     train_adata, val_adata = balancedish_training_generator(adata, cell_type_key,
#                                                             n_per_cell_type, oversample=oversample)
#     return train_adata, val_adata

In [13]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
fp_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    if '/_train.h5ad' not in fp and '/_val.h5ad' not in fp:
        dtype = fp.split('/')[-2]
        disease = fp.split('/')[-1].replace('.h5ad', '')
        fp_map[dtype][disease] = fp
fp_map

{'scRNAseq': {'brca': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/brca.h5ad',
  'cesc': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/cesc.h5ad',
  'hnscc': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/hnscc.h5ad',
  'melanoma': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/melanoma.h5ad',
  'myeloma': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/myeloma.h5ad',
  'pdac': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/scRNAseq/pdac.h5ad'},
 'snATACseq': {'brca': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/snATACseq/brca.h5ad',
  'brca_gene_activity': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/snATACseq/brca_gene_activity.h5ad',
  'brca_motif': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/snATACseq/brca_motif.h5ad',
  'brca_peaks': '/home/e

In [14]:
for dtype, d in fp_map.items():
    for disease, fp in d.items():
        print(dtype, disease)
        adata = sc.read_h5ad(fp)
        # check for cell type key
        if CELL_TYPE_KEY not in adata.obs: raise RuntimeError(f'{CELL_TYPE_KEY} not in {fp}')
        
        train_adata, val_adata = balancedish_training_generator(adata, CELL_TYPE_KEY, N_PER_CELL_TYPE)
        # resample validation data to make dataset smaller while keeping rare cell types
        val_adata, _ = balancedish_training_generator(val_adata, CELL_TYPE_KEY, 500, oversample=False,
                                                     split=1.)
        train_adata.write_h5ad(fp.replace('.h5ad', '_train.h5ad'))
        val_adata.write_h5ad(fp.replace('.h5ad', '_val.h5ad'))
        

scRNAseq brca


  if not is_categorical(df_full[k]):


scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq myeloma
scRNAseq pdac
snATACseq brca
snATACseq brca_gene_activity
snATACseq brca_motif
snATACseq brca_peaks
snATACseq ccrcc
snATACseq ccrcc_gene_activity
snATACseq ccrcc_motif
snATACseq ccrcc_peaks
snATACseq gbm
snATACseq gbm_gene_activity
snATACseq gbm_motif
snATACseq gbm_peaks
snRNAseq brca
snRNAseq ccrcc


  if is_string_dtype(df[key]) and not is_categorical(df[key])


snRNAseq gbm


##### load in training and validation datasets

In [3]:
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    dtype = fp.split('/')[-2]
    disease = re.sub(r'^(.*)((_train)|(_val)).h5ad$', r'\1', fp.split('/')[-1])
    if 'peaks' not in disease:
        if disease not in adata_map[dtype] and '.h5ad' not in disease: adata_map[dtype][disease] = {}
        if 'train.h5ad' in fp:
            adata_map[dtype][disease]['train'] = fp
        if 'val.h5ad' in fp:
            adata_map[dtype][disease]['val'] = fp
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease)

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq myeloma
scRNAseq pdac
snATACseq brca_gene_activity
snATACseq brca_motif
snATACseq brca
snATACseq ccrcc_gene_activity
snATACseq ccrcc_motif
snATACseq ccrcc
snATACseq gbm_gene_activity
snATACseq gbm_motif
snATACseq gbm
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


In [4]:
# remove any datasets if you have to
adata_map['snATACseq'].pop('brca')
adata_map['snATACseq'].pop('ccrcc')
adata_map['snATACseq'].pop('gbm')

{'train': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/snATACseq/gbm_train.h5ad',
 'val': '/home/estorrs/pollock/benchmarking/data/01272021_harmonized/teir_1/snATACseq/gbm_val.h5ad'}

### run workflows

In [5]:
import traceback
def run_workflow_for_datasets(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
#         if dtype != 'snATACseq':
        for disease, m in d.items():
            # make dir if doesnt exist yet
            directory = os.path.join(output_dir, dtype, disease)
            Path(directory).mkdir(parents=True, exist_ok=True)
            train, val = sc.read_h5ad(m['train']), sc.read_h5ad(m['val'])

            print(dtype, disease, train.shape, val.shape)
            try:
                run_workflow(workflow, workflow_identifier,
                    train, val, directory)
            except:
                print(traceback.print_exc())
            
def run_workflow_for_cross_disease(adata_map, workflow, workflow_identifier, output_dir):
    for dtype, d in adata_map.items():
        for disease1, m1 in d.items():
            for disease2, m2 in d.items():
#                 if disease1 != disease2:
                # make dir if doesnt exist yet
                directory = os.path.join(output_dir, dtype, f'{disease1}_train_{disease2}_val')
                Path(directory).mkdir(parents=True, exist_ok=True)
                train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

                print(dtype, f'{disease1}_train_{disease2}_val', train.shape, val.shape)
                try:
                    run_workflow(workflow, workflow_identifier,
                        train, val, directory)
                except:
                   print(traceback.print_exc())
                    
                    
def run_workflow_for_cross_datatype(adata_map, workflow, workflow_identifier, output_dir):
    for dtype1, d1 in adata_map.items():
        for dtype2, d2 in adata_map.items():
            for disease1, m1 in d1.items():
                for disease2, m2 in d2.items():
                    # make dir if doesnt exist yet
                    directory = os.path.join(output_dir, f'{dtype1}_{dtype2}',
                                             f'{dtype1}_{disease1}_train_{dtype2}_{disease2}_val')
                    Path(directory).mkdir(parents=True, exist_ok=True)
                    train, val = sc.read_h5ad(m1['train']), sc.read_h5ad(m2['val'])

                    print(f'{dtype1}_{dtype2}',
                          f'{dtype1}_{disease1}_train_{dtype2}_{disease2}_val', train.shape, val.shape)
                    run_workflow(workflow, workflow_identifier,
                        train, val, directory)

def run_workflow(workflow, workflow_identifier, train, val, output_dir):
    """
    Run the workflow defined by the workflow function.
    
    workflow function takes a train adata and a val adata as inputs,
    and returns dataframe with cell_id, groundtruth, predicted, and probability columns
    """
#     try:
        # if it is pollock it needs to know where to save the module
    if workflow_identifier == 'pollock':
        df = workflow(train, val, CELL_TYPE_KEY, os.path.join(output_dir, f'{workflow_identifier}_module'))
    else:
        df = workflow(train, val, CELL_TYPE_KEY)

    df.to_csv(os.path.join(output_dir, f'{workflow_identifier}.tsv'), sep='\t', index=False, header=True)
#     except Exception as e:
#         print('failed ' + os.path.join(output_dir, f'{workflow_identifier}.tsv'))
#         print(e)

##### pollock

In [13]:
def run_pollock_workflow(train, val, cell_type_key, module_fp):
    train.obs['is_validation'] = [False] * train.shape[0]
    val.obs['is_validation'] = [True] * val.shape[0]
    print(train.shape, val.shape)
    combined = train.concatenate(val)
    
    pds = PollockDataset(combined.copy(), cell_type_key=cell_type_key,
                     dataset_type='training', validation_key='is_validation')
    
    pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.005, latent_dim=25, learning_rate=0.0005)
    
    pm.fit(pds, epochs=15)
    
    # only score validation if cell types match
    train_cells = set(train.obs[cell_type_key])
    val_cells = set(val.obs[cell_type_key])
    score_val = True if len(train_cells.intersection(val_cells)) == len(train_cells.union(val_cells)) else False
    print(score_val)
    pm.save(pds, module_fp, score_train=True, score_val=score_val)

    preds = predict_from_anndata(val.copy(), module_fp, adata_batch_size=2500)
    
    df = pd.DataFrame.from_dict({
        'cell_id': preds.index.to_list(),
        'groundtruth': val.obs.loc[preds.index][cell_type_key].to_list(),
        'predicted': preds['predicted_cell_type'],
        'probability': preds['cell_type_probability']
    })

    return df

In [14]:
run_workflow_for_datasets(adata_map, run_pollock_workflow, 'pollock', RESULTS_DIR)

scRNAseq brca (6105, 27131) (5748, 27131)
(6105, 27131) (5748, 27131)


2021-05-14 10:24:12,075 input dataset shape: (11853, 27131)
2021-05-14 10:24:12,077 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-05-14 10:24:12,078 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:24:37,371 epoch: 1, train loss: 36.868736267089844, val loss: 43.18110656738281
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:24:55,531 epoch: 2, train loss: 35.9722900390625, val loss: 42.396827697753906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:25:13,633 epoch: 3, train loss: 35.59969711303711, val loss: 42.07976531982422
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:25:32,143 epoch: 4, train loss: 35.257022857666016, val loss: 41.67965

2021-05-14 10:28:15,575 epoch: 13, train loss: 33.786991119384766, val loss: 41.217529296875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:28:33,645 epoch: 14, train loss: 33.698116302490234, val loss: 41.27376937866211
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:28:51,675 epoch: 15, train loss: 33.47968673706055, val loss: 41.206398010253906


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:29:06,339 0 genes in training set are missing from prediction set
2021-05-14 10:29:06,884 starting batch 1 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:29:06,991 0 genes in training set are missing from prediction set
2021-05-14 10:29:13,909 starting batch 2 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:29:14,023 0 genes in training set are missing from prediction set
2021-05-14 10:29:20,823 starting batch 3 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:29:20,935 0 genes in training set are missing from prediction set


scRNAseq cesc (4661, 22928) (4276, 22928)
(4661, 22928) (4276, 22928)


2021-05-14 10:29:30,856 input dataset shape: (8937, 22928)
2021-05-14 10:29:30,858 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2021-05-14 10:29:30,858 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:29:45,654 epoch: 1, train loss: 37.37144470214844, val loss: 46.58943176269531
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:29:57,136 epoch: 2, train loss: 36.386112213134766, val loss: 44.93820571899414
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:30:10,141 epoch: 3, train loss: 35.98854064941406, val loss: 44.371681213378906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:30:22,292 epoch: 4, train loss: 35.61865997314453, val loss: 43.91381072998047
  if not

2021-05-14 10:32:08,056 epoch: 13, train loss: 33.9305305480957, val loss: 43.21536636352539
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:32:20,145 epoch: 14, train loss: 33.707645416259766, val loss: 43.2344856262207
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:32:31,867 epoch: 15, train loss: 33.51203536987305, val loss: 43.243045806884766


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:32:41,481 0 genes in training set are missing from prediction set
2021-05-14 10:32:42,025 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:32:42,147 0 genes in training set are missing from prediction set
2021-05-14 10:32:48,787 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:32:48,892 0 genes in training set are missing from prediction set


scRNAseq hnscc (5287, 26929) (5201, 26929)
(5287, 26929) (5201, 26929)


2021-05-14 10:33:04,083 input dataset shape: (10488, 26929)
2021-05-14 10:33:04,085 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-05-14 10:33:04,086 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:33:21,976 epoch: 1, train loss: 32.2972412109375, val loss: 19.609146118164062
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:33:37,648 epoch: 2, train loss: 31.821483612060547, val loss: 19.45030975341797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:33:52,881 epoch: 3, train loss: 31.399686813354492, val loss: 19.30805778503418
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:34:08,089 epoch: 4, train loss: 31.085073471069336, val loss: 19.24856948852539
  if not is_cate

2021-05-14 10:36:27,700 epoch: 13, train loss: 29.792301177978516, val loss: 19.066303253173828
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:36:43,257 epoch: 14, train loss: 29.66650390625, val loss: 19.064800262451172
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:36:59,174 epoch: 15, train loss: 29.582975387573242, val loss: 19.05150032043457


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:37:12,172 0 genes in training set are missing from prediction set
2021-05-14 10:37:12,722 starting batch 1 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:37:12,816 0 genes in training set are missing from prediction set
2021-05-14 10:37:19,499 starting batch 2 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:37:19,606 0 genes in training set are missing from prediction set
2021-05-14 10:37:26,844 starting batch 3 of 3
  if not is_categorical(df_full[k]):
2021-05-14 10:37:26,923 0 genes in training set are missing from prediction set


scRNAseq melanoma (4218, 23452) (3517, 23452)
(4218, 23452) (3517, 23452)


2021-05-14 10:37:34,242 input dataset shape: (7735, 23452)
2021-05-14 10:37:34,244 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-05-14 10:37:34,245 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:37:46,293 epoch: 1, train loss: 39.10851287841797, val loss: 45.80719757080078
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:37:57,332 epoch: 2, train loss: 38.10134506225586, val loss: 44.90483856201172
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:38:08,136 epoch: 3, train loss: 37.66139221191406, val loss: 44.55841064453125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:38:18,743 epoch: 4, train loss: 37.322059631347656, val loss: 44.32564926147461
  if not is_categorical(df_ful

2021-05-14 10:39:57,040 epoch: 13, train loss: 35.60994338989258, val loss: 43.981170654296875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:40:07,727 epoch: 14, train loss: 35.40237808227539, val loss: 44.05834197998047
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:40:19,089 epoch: 15, train loss: 35.21580123901367, val loss: 44.0709228515625


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:40:27,764 0 genes in training set are missing from prediction set
2021-05-14 10:40:28,322 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:40:28,425 0 genes in training set are missing from prediction set
2021-05-14 10:40:33,920 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:40:34,025 0 genes in training set are missing from prediction set


scRNAseq myeloma (3617, 24020) (3312, 24020)
(3617, 24020) (3312, 24020)


2021-05-14 10:40:42,608 input dataset shape: (6929, 24020)
2021-05-14 10:40:42,610 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-05-14 10:40:42,611 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):




2021-05-14 10:40:53,708 5 out of the last 19 calls to <function compute_loss at 0x7f1626d9fb00> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
  if not is_categorical(df_full[k]):
2021-05-14 10:40:54,606 epoch: 1, train loss: 45.774818420410156, val loss: 34.74489212036133
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:41:03,898 epoch: 2, train loss: 44.7939453125, val loss: 34.287071228027344
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:41:13,205 epoch: 3, train loss: 44.3701057434082, val loss: 34.08

  if not is_categorical(df_full[k]):
2021-05-14 10:42:58,801 epoch: 14, train loss: 42.1046028137207, val loss: 33.192962646484375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:43:08,376 epoch: 15, train loss: 41.73040008544922, val loss: 33.15297317504883


True


  d['descr'] = dtype_to_descr(array.dtype)
  _warn_prf(average, modifier, msg_start, len(result))
  if not is_categorical(df_full[k]):
2021-05-14 10:43:16,130 0 genes in training set are missing from prediction set
2021-05-14 10:43:16,681 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:43:16,775 0 genes in training set are missing from prediction set
2021-05-14 10:43:22,133 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:43:22,202 0 genes in training set are missing from prediction set


scRNAseq pdac (7940, 28756) (7823, 28756)
(7940, 28756) (7823, 28756)


2021-05-14 10:43:43,505 input dataset shape: (15763, 28756)
2021-05-14 10:43:43,509 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-05-14 10:43:43,510 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:44:12,051 epoch: 1, train loss: 30.97758674621582, val loss: 29.75945281982422
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:44:35,005 epoch: 2, train loss: 30.46671485900879, val loss: 29.259843826293945
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:44:59,125 epoch: 3, train loss: 30.063251495361328, val loss: 29.040002822875977
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:45:23,195 epoch: 4, train l

2021-05-14 10:49:04,302 epoch: 13, train loss: 28.645261764526367, val loss: 28.169137954711914
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:49:29,703 epoch: 14, train loss: 28.462318420410156, val loss: 28.198009490966797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:49:54,603 epoch: 15, train loss: 28.35052490234375, val loss: 28.189624786376953


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:50:14,262 0 genes in training set are missing from prediction set
2021-05-14 10:50:14,618 starting batch 1 of 4
  if not is_categorical(df_full[k]):
2021-05-14 10:50:14,761 0 genes in training set are missing from prediction set
2021-05-14 10:50:23,243 starting batch 2 of 4
  if not is_categorical(df_full[k]):
2021-05-14 10:50:23,398 0 genes in training set are missing from prediction set
2021-05-14 10:50:30,902 starting batch 3 of 4
  if not is_categorical(df_full[k]):
2021-05-14 10:50:31,081 0 genes in training set are missing from prediction set
2021-05-14 10:50:38,624 starting batch 4 of 4
  if not is_categorical(df_full[k]):
2021-05-14 10:50:38,766 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity (3576, 19891) (3519, 19891)
(3576, 19891) (3519, 19891)


2021-05-14 10:50:45,845 input dataset shape: (7095, 19891)
2021-05-14 10:50:45,847 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-05-14 10:50:45,847 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:50:56,460 epoch: 1, train loss: 47.29044723510742, val loss: 64.82695770263672
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:51:04,936 epoch: 2, train loss: 46.715457916259766, val loss: 64.0495834350586
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:51:13,137 epoch: 3, train loss: 46.1207275390625, val loss: 63.44219970703125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:51:21,163 epoch: 4, train loss: 46.18802261352539, val loss: 63.899112701416016
  if not is_categorical(df_full[k]):
  if not is_

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:52:44,270 epoch: 14, train loss: 44.623104095458984, val loss: 63.3206787109375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:52:52,588 epoch: 15, train loss: 44.274131774902344, val loss: 63.26984405517578


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:52:59,638 0 genes in training set are missing from prediction set
2021-05-14 10:53:00,051 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:53:00,236 0 genes in training set are missing from prediction set
2021-05-14 10:53:11,447 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:53:11,548 0 genes in training set are missing from prediction set
2021-05-14 10:53:16,032 input dataset shape: (7095, 633)
2021-05-14 10:53:16,034 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-05-14 10:53:16,035 using validation key


snATACseq brca_motif (3576, 633) (3519, 633)
(3576, 633) (3519, 633)


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:18,788 epoch: 1, train loss: 7.961370944976807, val loss: 8.081696510314941
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:20,609 epoch: 2, train loss: 7.121569633483887, val loss: 7.179506301879883
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:22,406 epoch: 3, train loss: 6.509657382965088, val loss: 6.606589317321777
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:24,213 epoch: 4, train loss: 6.087996482849121, val loss: 6.209468841552734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:25,969 epoch: 5, train loss: 5.818121433258057, val loss: 5.972517967224121
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:27,687 epoch: 6, train loss: 5.611422

  if not is_categorical(df_full[k]):
2021-05-14 10:53:41,973 epoch: 14, train loss: 4.951474666595459, val loss: 5.375993728637695
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:53:43,774 epoch: 15, train loss: 4.902556419372559, val loss: 5.356853008270264


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:53:47,064 0 genes in training set are missing from prediction set
2021-05-14 10:53:47,207 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:53:47,275 0 genes in training set are missing from prediction set
2021-05-14 10:53:47,456 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:53:47,525 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity (3000, 19843) (3000, 19843)
(3000, 19843) (3000, 19843)


2021-05-14 10:53:52,425 input dataset shape: (6000, 19843)
2021-05-14 10:53:52,427 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2021-05-14 10:53:52,428 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:54:01,155 epoch: 1, train loss: 45.66423034667969, val loss: 39.64300537109375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:54:08,077 epoch: 2, train loss: 45.373416900634766, val loss: 39.558433532714844
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:54:14,976 epoch: 3, train loss: 45.14961624145508, val loss: 39.52720642089844
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:54:21,864 epoch: 4, train loss: 45.0053825378418, val loss: 39.448585510253906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:55:30,950 epoch: 14, train loss: 44.585819244384766, val loss: 39.539588928222656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:55:37,923 epoch: 15, train loss: 43.791629791259766, val loss: 39.08719253540039


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:55:43,620 0 genes in training set are missing from prediction set
2021-05-14 10:55:44,117 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:55:44,270 0 genes in training set are missing from prediction set
2021-05-14 10:55:53,747 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:55:53,820 0 genes in training set are missing from prediction set
2021-05-14 10:55:56,049 input dataset shape: (6000, 633)
2021-05-14 10:55:56,051 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2021-05-14 10:55:56,052 using validation key


snATACseq ccrcc_motif (3000, 633) (3000, 633)
(3000, 633) (3000, 633)


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:55:58,741 epoch: 1, train loss: 8.06625747680664, val loss: 8.986260414123535
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:00,338 epoch: 2, train loss: 7.10885763168335, val loss: 7.420302391052246
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:01,851 epoch: 3, train loss: 6.704901695251465, val loss: 6.942784786224365
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:03,404 epoch: 4, train loss: 6.245205402374268, val loss: 6.479601860046387
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:04,959 epoch: 5, train loss: 5.892879962921143, val loss: 6.161915302276611
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:06,492 epoch: 6, train loss: 5.69280433

  if not is_categorical(df_full[k]):
2021-05-14 10:56:18,757 epoch: 14, train loss: 4.881736755371094, val loss: 5.404792785644531
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:20,236 epoch: 15, train loss: 4.82399320602417, val loss: 5.377406120300293


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:56:22,572 0 genes in training set are missing from prediction set
2021-05-14 10:56:22,722 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:56:22,791 0 genes in training set are missing from prediction set
2021-05-14 10:56:22,973 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:56:23,041 0 genes in training set are missing from prediction set


snATACseq gbm_gene_activity (3389, 19891) (2876, 19891)
(3389, 19891) (2876, 19891)


2021-05-14 10:56:28,051 input dataset shape: (6265, 19891)
2021-05-14 10:56:28,052 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-05-14 10:56:28,053 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:37,618 epoch: 1, train loss: 45.82908248901367, val loss: 27.41712760925293
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:45,025 epoch: 2, train loss: 45.20534896850586, val loss: 27.136226654052734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:56:52,259 epoch: 3, train loss: 44.99018859863281, val loss: 27.163227081298828
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:57:00,354 epoch: 4, train loss: 44.808773040771484, val loss: 27.060302734375
  if not is_categorical(df_full[k]):
 

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:18,266 epoch: 14, train loss: 43.60057830810547, val loss: 26.810272216796875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:26,029 epoch: 15, train loss: 43.507118225097656, val loss: 26.797225952148438


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:58:32,330 0 genes in training set are missing from prediction set
2021-05-14 10:58:32,783 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:58:32,913 0 genes in training set are missing from prediction set
2021-05-14 10:58:43,083 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:58:43,453 0 genes in training set are missing from prediction set
2021-05-14 10:58:45,701 input dataset shape: (6265, 633)
2021-05-14 10:58:45,704 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-05-14 10:58:45,704 using validation key


snATACseq gbm_motif (3393, 633) (2872, 633)
(3393, 633) (2872, 633)


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:48,421 epoch: 1, train loss: 7.364092826843262, val loss: 6.813343048095703
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:50,038 epoch: 2, train loss: 6.357682228088379, val loss: 6.044459342956543
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:51,772 epoch: 3, train loss: 6.010678768157959, val loss: 5.844239234924316
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:53,462 epoch: 4, train loss: 5.544738292694092, val loss: 5.426552772521973
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:55,147 epoch: 5, train loss: 5.474468231201172, val loss: 5.4228363037109375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:58:56,846 epoch: 6, train loss: 5.22240

  if not is_categorical(df_full[k]):
2021-05-14 10:59:09,867 epoch: 14, train loss: 4.567464828491211, val loss: 4.761177062988281
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:59:11,714 epoch: 15, train loss: 4.630905628204346, val loss: 4.867183685302734


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 10:59:15,326 0 genes in training set are missing from prediction set
2021-05-14 10:59:15,453 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:59:15,511 0 genes in training set are missing from prediction set
2021-05-14 10:59:15,674 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 10:59:15,732 0 genes in training set are missing from prediction set


snRNAseq brca (5252, 29175) (4893, 29175)
(5252, 29175) (4893, 29175)


2021-05-14 10:59:27,664 input dataset shape: (10145, 29175)
2021-05-14 10:59:27,667 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-05-14 10:59:27,668 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 10:59:47,405 epoch: 1, train loss: 41.4021110534668, val loss: 48.89822006225586
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:00:03,690 epoch: 2, train loss: 40.49360656738281, val loss: 48.08216094970703
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:00:19,915 epoch: 3, train loss: 40.11518859863281, val loss: 47.90730285644531
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:00:36,716 epoch: 4, train loss: 39.81276321411133, val loss: 47.5032768249

2021-05-14 11:03:10,191 epoch: 13, train loss: 38.23927688598633, val loss: 47.008785247802734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:03:27,079 epoch: 14, train loss: 38.05738830566406, val loss: 46.938026428222656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:03:43,433 epoch: 15, train loss: 37.94054412841797, val loss: 47.014102935791016


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 11:03:56,941 0 genes in training set are missing from prediction set
2021-05-14 11:03:57,598 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:03:57,707 0 genes in training set are missing from prediction set
2021-05-14 11:04:05,543 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:04:06,017 0 genes in training set are missing from prediction set


snRNAseq ccrcc (4754, 33538) (4518, 33538)
(4754, 33538) (4518, 33538)


2021-05-14 11:04:27,040 input dataset shape: (9272, 33538)
2021-05-14 11:04:27,042 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-05-14 11:04:27,042 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:04:46,555 epoch: 1, train loss: 39.37352752685547, val loss: 57.88219451904297
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:05:03,995 epoch: 2, train loss: 38.72649002075195, val loss: 56.877872467041016
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:05:20,994 epoch: 3, train loss: 38.35871887207031, val loss: 56.07912826538086
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:05:38,056 epoch: 4, train loss: 38.15544891357422, val loss: 55.74953079223633
  if not is

2021-05-14 11:08:11,943 epoch: 13, train loss: 36.70231628417969, val loss: 55.37226486206055
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:08:30,291 epoch: 14, train loss: 36.57027816772461, val loss: 55.383338928222656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:08:47,684 epoch: 15, train loss: 36.45988845825195, val loss: 55.310096740722656


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 11:08:59,931 0 genes in training set are missing from prediction set
2021-05-14 11:09:00,631 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:09:00,756 0 genes in training set are missing from prediction set
2021-05-14 11:09:09,414 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:09:09,532 0 genes in training set are missing from prediction set


snRNAseq gbm (3722, 29748) (3577, 29748)
(3722, 29748) (3577, 29748)


2021-05-14 11:09:27,060 input dataset shape: (7299, 29748)
2021-05-14 11:09:27,064 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-05-14 11:09:27,065 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:09:42,376 epoch: 1, train loss: 47.32548522949219, val loss: 45.10520553588867
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:09:55,107 epoch: 2, train loss: 46.47967529296875, val loss: 44.60308837890625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:10:07,901 epoch: 3, train loss: 45.93484878540039, val loss: 44.459510803222656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:10:20,237 epoch: 4, train loss: 45.55021286010742, val loss: 44.257965087890625
  if not is_categorical(d

2021-05-14 11:12:09,704 epoch: 13, train loss: 43.34651565551758, val loss: 43.93813705444336
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:12:22,062 epoch: 14, train loss: 43.00531768798828, val loss: 43.9701042175293
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2021-05-14 11:12:34,032 epoch: 15, train loss: 42.69337463378906, val loss: 43.97084426879883


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2021-05-14 11:12:43,148 0 genes in training set are missing from prediction set
2021-05-14 11:12:43,681 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:12:43,785 0 genes in training set are missing from prediction set
2021-05-14 11:12:53,381 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2021-05-14 11:12:53,471 0 genes in training set are missing from prediction set


In [27]:
run_workflow_for_cross_disease(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DISEASE_DIR)

scRNAseq brca_train_brca_val (6080, 27131) (5773, 27131)


2021-03-29 16:43:36,039 input dataset shape: (11853, 27131)
2021-03-29 16:43:36,041 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 16:43:36,042 using validation key
2021-03-29 16:43:54,621 epoch: 1, train loss: 38.09257888793945, val loss: 45.976531982421875
2021-03-29 16:44:10,634 epoch: 2, train loss: 37.1802978515625, val loss: 44.32764434814453
2021-03-29 16:44:26,617 epoch: 3, train loss: 36.530113220214844, val loss: 43.4514274597168
2021-03-29 16:44:42,776 epoch: 4, train loss: 36.08720397949219, val loss: 42.96310043334961
2021-03-29 16:44:58,743 epoch: 5, train loss: 35.840293884277344, val loss: 42.747711181640625
2021-03-29 16:45:14,678 epoch: 6, train loss: 35.62464141845703, val loss: 42.554988861083984
2021-03-29 16:45:30,603 epoch: 7, train loss: 35.42350769042969, val loss: 42.33623504638672
2021-03-29 16:45:46,562 epoch: 8, train los

True


2021-03-29 16:47:51,135 0 genes in training set are missing from prediction set
2021-03-29 16:47:51,782 starting batch 1 of 3
2021-03-29 16:47:51,885 0 genes in training set are missing from prediction set
2021-03-29 16:47:57,503 starting batch 2 of 3
2021-03-29 16:47:57,640 0 genes in training set are missing from prediction set
2021-03-29 16:48:03,485 starting batch 3 of 3
2021-03-29 16:48:03,605 0 genes in training set are missing from prediction set


scRNAseq brca_train_cesc_val (6080, 27131) (4277, 22928)


2021-03-29 16:48:15,065 input dataset shape: (10357, 22662)
2021-03-29 16:48:15,067 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 16:48:15,068 using validation key
2021-03-29 16:48:30,150 epoch: 1, train loss: 25.401212692260742, val loss: 54.72344207763672
2021-03-29 16:48:42,888 epoch: 2, train loss: 24.9804744720459, val loss: 53.31889343261719
2021-03-29 16:48:55,623 epoch: 3, train loss: 24.573949813842773, val loss: 51.886512756347656
2021-03-29 16:49:08,172 epoch: 4, train loss: 24.26601219177246, val loss: 51.30940628051758
2021-03-29 16:49:20,730 epoch: 5, train loss: 24.05350112915039, val loss: 51.07929992675781
2021-03-29 16:49:33,350 epoch: 6, train loss: 23.937894821166992, val loss: 50.913063049316406
2021-03-29 16:49:45,947 epoch: 7, train loss: 23.835390090942383, val loss: 50.81753158569336
2021-03-29 16:49:58,691 epo

False


2021-03-29 16:51:36,780 0 genes in training set are missing from prediction set
2021-03-29 16:51:37,320 starting batch 1 of 2
2021-03-29 16:51:37,448 0 genes in training set are missing from prediction set
2021-03-29 16:51:43,761 starting batch 2 of 2
2021-03-29 16:51:43,879 0 genes in training set are missing from prediction set


scRNAseq brca_train_hnscc_val (6080, 27131) (5203, 26929)


2021-03-29 16:52:02,815 input dataset shape: (11283, 25823)
2021-03-29 16:52:02,817 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 16:52:02,818 using validation key
2021-03-29 16:52:21,094 epoch: 1, train loss: 28.25162696838379, val loss: 21.737442016601562
2021-03-29 16:52:36,203 epoch: 2, train loss: 27.667186737060547, val loss: 21.450454711914062
2021-03-29 16:52:51,446 epoch: 3, train loss: 27.274856567382812, val loss: 21.276588439941406
2021-03-29 16:53:06,749 epoch: 4, train loss: 26.981069564819336, val loss: 21.123579025268555
2021-03-29 16:53:21,988 epoch: 5, train loss: 26.78883934020996, val loss: 21.005821228027344
2021-03-29 16:53:36,983 epoch: 6, train loss: 26.656055450439453, val loss: 20.96187973022461
2021-03-29 16:53:51,929 epoch: 7, train loss: 26.541343688964844, val loss: 20.9296875
2021-03-29 16:54:07,222 epoch: 8, train los

False


2021-03-29 16:56:05,255 0 genes in training set are missing from prediction set
2021-03-29 16:56:05,898 starting batch 1 of 3
2021-03-29 16:56:05,983 0 genes in training set are missing from prediction set
2021-03-29 16:56:12,457 starting batch 2 of 3
2021-03-29 16:56:12,583 0 genes in training set are missing from prediction set
2021-03-29 16:56:19,708 starting batch 3 of 3
2021-03-29 16:56:19,798 0 genes in training set are missing from prediction set


scRNAseq brca_train_melanoma_val (6080, 27131) (3532, 23452)


2021-03-29 16:56:31,306 input dataset shape: (9612, 21535)
2021-03-29 16:56:31,309 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 16:56:31,310 using validation key
2021-03-29 16:56:46,999 epoch: 1, train loss: 41.059566497802734, val loss: 32.81226348876953
2021-03-29 16:56:59,821 epoch: 2, train loss: 40.10809326171875, val loss: 32.33860778808594
2021-03-29 16:57:12,724 epoch: 3, train loss: 39.42149353027344, val loss: 32.070335388183594
2021-03-29 16:57:25,712 epoch: 4, train loss: 38.89492416381836, val loss: 31.790212631225586
2021-03-29 16:57:38,563 epoch: 5, train loss: 38.636817932128906, val loss: 31.659765243530273
2021-03-29 16:57:51,345 epoch: 6, train loss: 38.40359878540039, val loss: 31.602153778076172
2021-03-29 16:58:04,112 epoch: 7, train loss: 38.193965911865234, val loss: 31.54079246520996
2021-03-29 16:58:17,089 epoch: 8, train 

False


2021-03-29 16:59:56,832 0 genes in training set are missing from prediction set
2021-03-29 16:59:57,277 starting batch 1 of 2
2021-03-29 16:59:57,396 0 genes in training set are missing from prediction set
2021-03-29 17:00:01,819 starting batch 2 of 2
2021-03-29 17:00:01,926 0 genes in training set are missing from prediction set


scRNAseq brca_train_myeloma_val (6080, 27131) (3312, 24020)


2021-03-29 17:00:10,665 input dataset shape: (9392, 19480)
2021-03-29 17:00:10,667 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 17:00:10,667 using validation key
2021-03-29 17:00:23,540 epoch: 1, train loss: 42.33722686767578, val loss: 22.070268630981445
2021-03-29 17:00:34,619 epoch: 2, train loss: 41.27653121948242, val loss: 21.913171768188477
2021-03-29 17:00:45,491 epoch: 3, train loss: 40.59590530395508, val loss: 21.777271270751953
2021-03-29 17:00:56,503 epoch: 4, train loss: 40.11042785644531, val loss: 21.692485809326172
2021-03-29 17:01:07,306 epoch: 5, train loss: 39.842872619628906, val loss: 21.618431091308594
2021-03-29 17:01:18,198 epoch: 6, train loss: 39.59532165527344, val loss: 21.57975959777832
2021-03-29 17:01:29,303 epoch: 7, train loss: 39.37381362915039, val loss: 21.498573303222656
2021-03-29 17:01:40,280 epoc

False


2021-03-29 17:03:04,065 0 genes in training set are missing from prediction set
2021-03-29 17:03:04,576 starting batch 1 of 2
2021-03-29 17:03:04,696 0 genes in training set are missing from prediction set
2021-03-29 17:03:09,947 starting batch 2 of 2
2021-03-29 17:03:10,044 0 genes in training set are missing from prediction set


scRNAseq brca_train_pbmc_val (6080, 27131) (886, 32738)


2021-03-29 17:03:16,402 input dataset shape: (6966, 18511)
2021-03-29 17:03:16,403 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:03:16,404 using validation key
2021-03-29 17:03:28,928 epoch: 1, train loss: 37.105953216552734, val loss: 16.436721801757812
2021-03-29 17:03:40,088 epoch: 2, train loss: 36.29838562011719, val loss: 16.286388397216797
2021-03-29 17:03:51,146 epoch: 3, train loss: 35.658390045166016, val loss: 16.145179748535156
2021-03-29 17:04:02,324 epoch: 4, train loss: 35.145111083984375, val loss: 16.07301139831543
2021-03-29 17:04:13,448 epoch: 5, train loss: 34.9189453125, val loss: 16.009647369384766
2021-03-29 17:04:24,825 epoch: 6, train loss: 34.72952651977539, val loss: 15.976296424865723
2021-03-29 17:04:35,891 epoch: 7, train loss: 34.52102279663086, val loss: 15.948101043701172
2021-03-29 17:04:46,889 e

False


2021-03-29 17:06:11,816 0 genes in training set are missing from prediction set
2021-03-29 17:06:12,246 starting batch 1 of 1
2021-03-29 17:06:12,342 0 genes in training set are missing from prediction set


scRNAseq brca_train_pdac_val (6080, 27131) (7840, 28756)


2021-03-29 17:06:29,641 input dataset shape: (13920, 26833)
2021-03-29 17:06:29,644 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 17:06:29,644 using validation key
2021-03-29 17:06:48,702 epoch: 1, train loss: 29.206989288330078, val loss: 36.3941535949707
2021-03-29 17:07:05,161 epoch: 2, train loss: 28.632104873657227, val loss: 35.78704071044922
2021-03-29 17:07:21,673 epoch: 3, train loss: 28.15154457092285, val loss: 34.90985870361328
2021-03-29 17:07:38,035 epoch: 4, train loss: 27.736772537231445, val loss: 34.565826416015625
2021-03-29 17:07:54,268 epoch: 5, train loss: 27.517642974853516, val loss: 34.40777587890625
2021-03-29 17:08:10,636 epoch: 6, train loss: 27.383460998535156, val loss: 34.32059097290039
2021-03-29 17:08:27,618 epoch: 7, train loss: 27.248727798461914, val loss: 34.24432373046875

False


2021-03-29 17:10:49,690 0 genes in training set are missing from prediction set
2021-03-29 17:10:50,324 starting batch 1 of 4
2021-03-29 17:10:50,455 0 genes in training set are missing from prediction set
2021-03-29 17:10:56,297 starting batch 2 of 4
2021-03-29 17:10:56,387 0 genes in training set are missing from prediction set
2021-03-29 17:11:02,164 starting batch 3 of 4
2021-03-29 17:11:02,284 0 genes in training set are missing from prediction set
2021-03-29 17:11:08,002 starting batch 4 of 4
2021-03-29 17:11:08,096 0 genes in training set are missing from prediction set


scRNAseq cesc_train_brca_val (4660, 22928) (5773, 27131)


2021-03-29 17:11:20,258 input dataset shape: (10433, 22662)
2021-03-29 17:11:20,260 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:11:20,261 using validation key
2021-03-29 17:11:33,548 epoch: 1, train loss: 43.95503616333008, val loss: 30.338809967041016
2021-03-29 17:11:44,251 epoch: 2, train loss: 42.5810546875, val loss: 29.610248565673828
2021-03-29 17:11:54,953 epoch: 3, train loss: 41.67795944213867, val loss: 29.082901000976562
2021-03-29 17:12:05,285 epoch: 4, train loss: 41.10697937011719, val loss: 28.765546798706055
2021-03-29 17:12:15,864 epoch: 5, train loss: 40.77766036987305, val loss: 28.604637145996094
2021-03-29 17:12:26,546 epoch: 6, train loss: 40.548458099365234, val loss: 28.533708572387695
2021-03-29 17:12:36,987 epoch: 7, train loss: 40.36697769165039, val loss: 28.444286346435547
2021-03-29 17:12:47,775 epoc

False


2021-03-29 17:14:10,678 0 genes in training set are missing from prediction set
2021-03-29 17:14:11,245 starting batch 1 of 3
2021-03-29 17:14:11,348 0 genes in training set are missing from prediction set
2021-03-29 17:14:17,418 starting batch 2 of 3
2021-03-29 17:14:17,514 0 genes in training set are missing from prediction set
2021-03-29 17:14:23,770 starting batch 3 of 3
2021-03-29 17:14:23,848 0 genes in training set are missing from prediction set


scRNAseq cesc_train_cesc_val (4660, 22928) (4277, 22928)


2021-03-29 17:14:35,388 input dataset shape: (8937, 22928)
2021-03-29 17:14:35,390 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2021-03-29 17:14:35,391 using validation key
2021-03-29 17:14:48,622 epoch: 1, train loss: 38.970455169677734, val loss: 48.35221862792969
2021-03-29 17:14:58,600 epoch: 2, train loss: 37.71548843383789, val loss: 46.58669662475586
2021-03-29 17:15:08,473 epoch: 3, train loss: 36.986717224121094, val loss: 45.25285720825195
2021-03-29 17:15:18,274 epoch: 4, train loss: 36.49132537841797, val loss: 44.441261291503906
2021-03-29 17:15:28,105 epoch: 5, train loss: 36.226802825927734, val loss: 44.08770751953125
2021-03-29 17:15:38,031 epoch: 6, train loss: 36.02806091308594, val loss: 43.76536560058594
2021-03-29 17:15:47,746 epoch: 7, train loss: 35.81267547607422, val loss: 43.438392639160156
2021-03-29 17:15:57,621 epoch: 8, train loss: 35.6559829711

True


2021-03-29 17:17:16,404 0 genes in training set are missing from prediction set
2021-03-29 17:17:16,971 starting batch 1 of 2
2021-03-29 17:17:17,100 0 genes in training set are missing from prediction set
2021-03-29 17:17:22,680 starting batch 2 of 2
2021-03-29 17:17:22,786 0 genes in training set are missing from prediction set


scRNAseq cesc_train_hnscc_val (4660, 22928) (5203, 26929)


2021-03-29 17:17:35,343 input dataset shape: (9863, 22756)
2021-03-29 17:17:35,345 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:17:35,346 using validation key
2021-03-29 17:17:47,706 epoch: 1, train loss: 39.59408950805664, val loss: 17.76010513305664
2021-03-29 17:17:57,564 epoch: 2, train loss: 38.372230529785156, val loss: 17.57155418395996
2021-03-29 17:18:07,291 epoch: 3, train loss: 37.6484489440918, val loss: 17.3422794342041
2021-03-29 17:18:17,125 epoch: 4, train loss: 37.09962463378906, val loss: 17.200908660888672
2021-03-29 17:18:27,015 epoch: 5, train loss: 36.82865905761719, val loss: 17.116844177246094
2021-03-29 17:18:36,925 epoch: 6, train loss: 36.63963317871094, val loss: 17.087982177734375
2021-03-29 17:18:46,834 epoch: 7, train loss: 36.45395278930664, val loss: 17.018192291259766
2021-03-29 17:18:56,557 epoch: 8, train los

False


2021-03-29 17:20:13,510 0 genes in training set are missing from prediction set
2021-03-29 17:20:14,094 starting batch 1 of 3
2021-03-29 17:20:14,203 0 genes in training set are missing from prediction set
2021-03-29 17:20:19,965 starting batch 2 of 3
2021-03-29 17:20:20,064 0 genes in training set are missing from prediction set
2021-03-29 17:20:26,289 starting batch 3 of 3
2021-03-29 17:20:26,365 0 genes in training set are missing from prediction set


scRNAseq cesc_train_melanoma_val (4660, 22928) (3532, 23452)


2021-03-29 17:20:33,214 input dataset shape: (8192, 20043)
2021-03-29 17:20:33,216 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:20:33,216 using validation key
2021-03-29 17:20:44,677 epoch: 1, train loss: 44.5264892578125, val loss: 20.80521011352539
2021-03-29 17:20:54,109 epoch: 2, train loss: 43.13418197631836, val loss: 20.555274963378906
2021-03-29 17:21:03,627 epoch: 3, train loss: 42.16836929321289, val loss: 20.325328826904297
2021-03-29 17:21:13,052 epoch: 4, train loss: 41.58165740966797, val loss: 20.221973419189453
2021-03-29 17:21:22,533 epoch: 5, train loss: 41.3079719543457, val loss: 20.149456024169922
2021-03-29 17:21:32,029 epoch: 6, train loss: 41.0824089050293, val loss: 20.083162307739258
2021-03-29 17:21:41,602 epoch: 7, train loss: 40.858699798583984, val loss: 20.026111602783203
2021-03-29 17:21:51,094 epoch

False


2021-03-29 17:23:04,365 0 genes in training set are missing from prediction set
2021-03-29 17:23:04,786 starting batch 1 of 2
2021-03-29 17:23:04,881 0 genes in training set are missing from prediction set
2021-03-29 17:23:09,596 starting batch 2 of 2
2021-03-29 17:23:09,680 0 genes in training set are missing from prediction set


scRNAseq cesc_train_myeloma_val (4660, 22928) (3312, 24020)


2021-03-29 17:23:17,530 input dataset shape: (7972, 18774)
2021-03-29 17:23:17,532 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-03-29 17:23:17,533 using validation key
2021-03-29 17:23:28,235 epoch: 1, train loss: 44.92673110961914, val loss: 13.573894500732422
2021-03-29 17:23:37,346 epoch: 2, train loss: 43.571434020996094, val loss: 13.463911056518555
2021-03-29 17:23:46,478 epoch: 3, train loss: 42.57666778564453, val loss: 13.385458946228027
2021-03-29 17:23:55,661 epoch: 4, train loss: 42.02755355834961, val loss: 13.336674690246582
2021-03-29 17:24:04,623 epoch: 5, train loss: 41.705020904541016, val loss: 13.276580810546875
2021-03-29 17:24:13,699 epoch: 6, train loss: 41.485679626464844, val loss: 13.219743728637695
2021-03-29 17:24:22,788 epoch: 7, train loss: 41.269065856933594, val loss: 13.23572826385498
2021-03-29 17:24:31

False


2021-03-29 17:25:42,096 0 genes in training set are missing from prediction set
2021-03-29 17:25:42,585 starting batch 1 of 2
2021-03-29 17:25:42,682 0 genes in training set are missing from prediction set
2021-03-29 17:25:47,774 starting batch 2 of 2
2021-03-29 17:25:47,850 0 genes in training set are missing from prediction set


scRNAseq cesc_train_pbmc_val (4660, 22928) (886, 32738)


2021-03-29 17:25:53,171 input dataset shape: (5546, 17080)
2021-03-29 17:25:53,173 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma']
2021-03-29 17:25:53,174 using validation key
2021-03-29 17:26:02,167 epoch: 1, train loss: 39.309391021728516, val loss: 10.388965606689453
2021-03-29 17:26:09,740 epoch: 2, train loss: 38.10360336303711, val loss: 10.32020378112793
2021-03-29 17:26:17,438 epoch: 3, train loss: 37.262718200683594, val loss: 10.176831245422363
2021-03-29 17:26:25,083 epoch: 4, train loss: 36.739227294921875, val loss: 10.164852142333984
2021-03-29 17:26:32,760 epoch: 5, train loss: 36.462196350097656, val loss: 10.115073204040527
2021-03-29 17:26:40,263 epoch: 6, train loss: 36.2695426940918, val loss: 10.048784255981445
2021-03-29 17:26:47,850 epoch: 7, train loss: 36.06563186645508, val loss: 10.010591506958008
2021-03-29 17:2

False


2021-03-29 17:27:55,168 0 genes in training set are missing from prediction set
2021-03-29 17:27:55,668 starting batch 1 of 1
2021-03-29 17:27:55,749 0 genes in training set are missing from prediction set


scRNAseq cesc_train_pdac_val (4660, 22928) (7840, 28756)


2021-03-29 17:28:08,092 input dataset shape: (12500, 22794)
2021-03-29 17:28:08,095 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 17:28:08,095 using validation key
2021-03-29 17:28:21,613 epoch: 1, train loss: 39.28191375732422, val loss: 27.275489807128906
2021-03-29 17:28:32,624 epoch: 2, train loss: 38.01945495605469, val loss: 26.655475616455078
2021-03-29 17:28:43,580 epoch: 3, train loss: 37.3148078918457, val loss: 26.236244201660156
2021-03-29 17:28:54,545 epoch: 4, train loss: 36.76176452636719, val loss: 25.938926696777344
2021-03-29 17:29:05,567 epoch: 5, train loss: 36.46466827392578, val loss: 25.76913070678711
2021-03-29 17:29:16,531 epoch: 6, train loss: 36.236663818359375, val loss: 25.671993255615234
2021-03-29 17:29:27,402 epoch: 7, train loss: 36.0482177734375, val loss: 25.617599487304688


False


2021-03-29 17:31:02,990 0 genes in training set are missing from prediction set
2021-03-29 17:31:03,565 starting batch 1 of 4
2021-03-29 17:31:03,673 0 genes in training set are missing from prediction set
2021-03-29 17:31:09,700 starting batch 2 of 4
2021-03-29 17:31:09,806 0 genes in training set are missing from prediction set
2021-03-29 17:31:15,902 starting batch 3 of 4
2021-03-29 17:31:15,986 0 genes in training set are missing from prediction set
2021-03-29 17:31:21,786 starting batch 4 of 4
2021-03-29 17:31:21,864 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_brca_val (5285, 26929) (5773, 27131)


2021-03-29 17:31:33,645 input dataset shape: (11058, 25823)
2021-03-29 17:31:33,649 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:31:33,649 using validation key
2021-03-29 17:31:49,477 epoch: 1, train loss: 37.21488571166992, val loss: 34.2575798034668
2021-03-29 17:32:02,714 epoch: 2, train loss: 36.223289489746094, val loss: 33.45338439941406
2021-03-29 17:32:16,142 epoch: 3, train loss: 35.676021575927734, val loss: 33.088462829589844
2021-03-29 17:32:29,589 epoch: 4, train loss: 35.32307052612305, val loss: 32.952972412109375
2021-03-29 17:32:43,068 epoch: 5, train loss: 35.065731048583984, val loss: 32.85678482055664
2021-03-29 17:32:56,384 epoch: 6, train loss: 34.783302307128906, val loss: 32.79422378540039
2021-03-29 17:33:09,720 epoch: 7, train loss: 34.57206726074219, val loss: 32.75565719604492
2021-03-29 17:33:23,152 epoch: 8, train l

False


2021-03-29 17:35:07,027 0 genes in training set are missing from prediction set
2021-03-29 17:35:07,659 starting batch 1 of 3
2021-03-29 17:35:07,779 0 genes in training set are missing from prediction set
2021-03-29 17:35:13,857 starting batch 2 of 3
2021-03-29 17:35:13,965 0 genes in training set are missing from prediction set
2021-03-29 17:35:20,043 starting batch 3 of 3
2021-03-29 17:35:20,133 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_cesc_val (5285, 26929) (4277, 22928)


2021-03-29 17:35:30,783 input dataset shape: (9562, 22756)
2021-03-29 17:35:30,785 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:35:30,786 using validation key
2021-03-29 17:35:43,811 epoch: 1, train loss: 29.90556526184082, val loss: 49.27644729614258
2021-03-29 17:35:54,569 epoch: 2, train loss: 29.159217834472656, val loss: 48.34270095825195
2021-03-29 17:36:05,454 epoch: 3, train loss: 28.664880752563477, val loss: 47.32255554199219
2021-03-29 17:36:16,271 epoch: 4, train loss: 28.400081634521484, val loss: 46.82583999633789
2021-03-29 17:36:27,046 epoch: 5, train loss: 28.193864822387695, val loss: 46.65768814086914
2021-03-29 17:36:37,690 epoch: 6, train loss: 27.961153030395508, val loss: 46.46028137207031
2021-03-29 17:36:48,421 epoch: 7, train loss: 27.80679702758789, val loss: 46.283836364746094
2021-03-29 17:36:59,329 epoch: 8, train 

False


2021-03-29 17:38:23,926 0 genes in training set are missing from prediction set
2021-03-29 17:38:24,465 starting batch 1 of 2
2021-03-29 17:38:24,585 0 genes in training set are missing from prediction set
2021-03-29 17:38:30,888 starting batch 2 of 2
2021-03-29 17:38:31,002 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_hnscc_val (5285, 26929) (5203, 26929)


2021-03-29 17:38:48,523 input dataset shape: (10488, 26929)
2021-03-29 17:38:48,525 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:38:48,526 using validation key
2021-03-29 17:39:06,012 epoch: 1, train loss: 33.069149017333984, val loss: 19.35256576538086
2021-03-29 17:39:20,365 epoch: 2, train loss: 32.28483581542969, val loss: 19.09788703918457
2021-03-29 17:39:34,653 epoch: 3, train loss: 31.745080947875977, val loss: 18.949459075927734
2021-03-29 17:39:49,014 epoch: 4, train loss: 31.519916534423828, val loss: 18.872848510742188
2021-03-29 17:40:03,022 epoch: 5, train loss: 31.319931030273438, val loss: 18.804443359375
2021-03-29 17:40:17,237 epoch: 6, train loss: 31.103418350219727, val loss: 18.728565216064453
2021-03-29 17:40:31,419 epoch: 7, train loss: 30.915218353271484, val loss: 18.659421920776367
2021-03-29 17:40:45,792 epoch: 8, train loss: 30.766456604003906,

True


2021-03-29 17:42:38,689 0 genes in training set are missing from prediction set
2021-03-29 17:42:39,272 starting batch 1 of 3
2021-03-29 17:42:39,361 0 genes in training set are missing from prediction set
2021-03-29 17:42:45,544 starting batch 2 of 3
2021-03-29 17:42:45,667 0 genes in training set are missing from prediction set
2021-03-29 17:42:52,616 starting batch 3 of 3
2021-03-29 17:42:52,723 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_melanoma_val (5285, 26929) (3532, 23452)


2021-03-29 17:43:03,397 input dataset shape: (8817, 21552)
2021-03-29 17:43:03,399 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:43:03,400 using validation key
2021-03-29 17:43:16,373 epoch: 1, train loss: 38.524532318115234, val loss: 23.44341278076172
2021-03-29 17:43:26,781 epoch: 2, train loss: 37.58066177368164, val loss: 23.162574768066406
2021-03-29 17:43:37,298 epoch: 3, train loss: 36.93916320800781, val loss: 23.055517196655273
2021-03-29 17:43:47,603 epoch: 4, train loss: 36.68962478637695, val loss: 22.97412109375
2021-03-29 17:43:57,963 epoch: 5, train loss: 36.36305618286133, val loss: 22.844253540039062
2021-03-29 17:44:08,202 epoch: 6, train loss: 36.14926528930664, val loss: 22.77176856994629
2021-03-29 17:44:18,665 epoch: 7, train loss: 35.95075225830078, val loss: 22.717018127441406
2021-03-29 17:44:29,224 epoch: 8, train loss:

False


2021-03-29 17:45:50,472 0 genes in training set are missing from prediction set
2021-03-29 17:45:50,803 starting batch 1 of 2
2021-03-29 17:45:50,914 0 genes in training set are missing from prediction set
2021-03-29 17:45:55,563 starting batch 2 of 2
2021-03-29 17:45:55,660 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_myeloma_val (5285, 26929) (3312, 24020)


2021-03-29 17:46:03,990 input dataset shape: (8597, 19477)
2021-03-29 17:46:03,992 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 17:46:03,992 using validation key
2021-03-29 17:46:16,665 epoch: 1, train loss: 39.19367980957031, val loss: 15.826425552368164
2021-03-29 17:46:27,357 epoch: 2, train loss: 38.13373565673828, val loss: 15.732495307922363
2021-03-29 17:46:38,139 epoch: 3, train loss: 37.514095306396484, val loss: 15.664765357971191
2021-03-29 17:46:49,045 epoch: 4, train loss: 37.23261260986328, val loss: 15.622005462646484
2021-03-29 17:46:59,871 epoch: 5, train loss: 36.97169876098633, val loss: 15.584677696228027
2021-03-29 17:47:10,626 epoch: 6, train loss: 36.70327377319336, val loss: 15.545831680297852
2021-03-29 17:47:21,517 epoch: 7, train loss: 36.54310989379883, val loss: 15.525693893432617
2021-03-29 17:47:32,240 epoch: 8, train l

False


2021-03-29 17:48:55,466 0 genes in training set are missing from prediction set
2021-03-29 17:48:55,880 starting batch 1 of 2
2021-03-29 17:48:55,991 0 genes in training set are missing from prediction set
2021-03-29 17:49:01,265 starting batch 2 of 2
2021-03-29 17:49:01,364 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_pbmc_val (5285, 26929) (886, 32738)


2021-03-29 17:49:07,127 input dataset shape: (6171, 18521)
2021-03-29 17:49:07,129 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:49:07,130 using validation key
2021-03-29 17:49:18,469 epoch: 1, train loss: 33.81735610961914, val loss: 11.737594604492188
2021-03-29 17:49:28,392 epoch: 2, train loss: 33.01866149902344, val loss: 11.653855323791504
2021-03-29 17:49:38,225 epoch: 3, train loss: 32.41965866088867, val loss: 11.57262897491455
2021-03-29 17:49:48,167 epoch: 4, train loss: 32.1041374206543, val loss: 11.537128448486328
2021-03-29 17:49:58,106 epoch: 5, train loss: 31.812820434570312, val loss: 11.466057777404785
2021-03-29 17:50:07,989 epoch: 6, train loss: 31.609148025512695, val loss: 11.475940704345703
2021-03-29 17:50:17,961 epoch: 7, train loss: 31.423194885253906, val loss: 11.439579963684082
2021-03-29 17:50:27,886 epoch: 8, tr

False


2021-03-29 17:51:43,281 0 genes in training set are missing from prediction set
2021-03-29 17:51:43,786 starting batch 1 of 1
2021-03-29 17:51:43,881 0 genes in training set are missing from prediction set


scRNAseq hnscc_train_pdac_val (5285, 26929) (7840, 28756)


2021-03-29 17:51:58,596 input dataset shape: (13125, 26359)
2021-03-29 17:51:58,599 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 17:51:58,600 using validation key
2021-03-29 17:52:15,622 epoch: 1, train loss: 32.717857360839844, val loss: 30.187801361083984
2021-03-29 17:52:29,592 epoch: 2, train loss: 31.960786819458008, val loss: 29.80489730834961
2021-03-29 17:52:43,645 epoch: 3, train loss: 31.334152221679688, val loss: 29.180173873901367
2021-03-29 17:52:57,819 epoch: 4, train loss: 31.097448348999023, val loss: 28.918779373168945
2021-03-29 17:53:11,726 epoch: 5, train loss: 30.84663200378418, val loss: 28.763736724853516
2021-03-29 17:53:25,849 epoch: 6, train loss: 30.63772201538086, val loss: 28.65918731689453
2021-03-29 17:53:39,791 epoch: 7, train loss: 30.42224884033203, val loss: 28.592315673828

False


2021-03-29 17:55:43,662 0 genes in training set are missing from prediction set
2021-03-29 17:55:44,294 starting batch 1 of 4
2021-03-29 17:55:44,416 0 genes in training set are missing from prediction set
2021-03-29 17:55:50,822 starting batch 2 of 4
2021-03-29 17:55:50,931 0 genes in training set are missing from prediction set
2021-03-29 17:55:57,357 starting batch 3 of 4
2021-03-29 17:55:57,445 0 genes in training set are missing from prediction set
2021-03-29 17:56:03,631 starting batch 4 of 4
2021-03-29 17:56:03,718 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_brca_val (4203, 23452) (5773, 27131)


2021-03-29 17:56:13,235 input dataset shape: (9976, 21535)
2021-03-29 17:56:13,237 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:56:13,238 using validation key
2021-03-29 17:56:24,987 epoch: 1, train loss: 28.009477615356445, val loss: 52.23123550415039
2021-03-29 17:56:34,395 epoch: 2, train loss: 27.75729751586914, val loss: 51.74180603027344
2021-03-29 17:56:44,074 epoch: 3, train loss: 27.24129295349121, val loss: 50.929290771484375
2021-03-29 17:56:53,771 epoch: 4, train loss: 26.725126266479492, val loss: 50.49611282348633
2021-03-29 17:57:03,351 epoch: 5, train loss: 26.456823348999023, val loss: 50.28691101074219
2021-03-29 17:57:13,034 epoch: 6, train loss: 26.277713775634766, val loss: 50.12602996826172
2021-03-29 17:57:22,714 epoch: 7, train loss: 26.105499267578125, val loss: 49.99877166748047
2021-03-29 17:57:32,309 epoch: 8, train l

False


2021-03-29 17:58:47,564 0 genes in training set are missing from prediction set
2021-03-29 17:58:48,117 starting batch 1 of 3
2021-03-29 17:58:48,219 0 genes in training set are missing from prediction set
2021-03-29 17:58:53,142 starting batch 2 of 3
2021-03-29 17:58:53,235 0 genes in training set are missing from prediction set
2021-03-29 17:58:58,269 starting batch 3 of 3
2021-03-29 17:58:58,350 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_cesc_val (4203, 23452) (4277, 22928)


2021-03-29 17:59:07,805 input dataset shape: (8480, 20043)
2021-03-29 17:59:07,807 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 17:59:07,808 using validation key
2021-03-29 17:59:18,575 epoch: 1, train loss: 17.871051788330078, val loss: 59.66471481323242
2021-03-29 17:59:27,446 epoch: 2, train loss: 17.712465286254883, val loss: 59.68572998046875
2021-03-29 17:59:36,137 epoch: 3, train loss: 17.453807830810547, val loss: 58.51425552368164
2021-03-29 17:59:45,012 epoch: 4, train loss: 17.10719871520996, val loss: 57.70698928833008
2021-03-29 17:59:53,842 epoch: 5, train loss: 16.93033218383789, val loss: 57.23457336425781
2021-03-29 18:00:02,629 epoch: 6, train loss: 16.822053909301758, val loss: 57.00825500488281
2021-03-29 18:00:11,461 epoch: 7, train loss: 16.724578857421875, val loss: 56.878700256347656
2021-03-29 18:00:20,311 epo

False


2021-03-29 18:01:28,466 0 genes in training set are missing from prediction set
2021-03-29 18:01:28,958 starting batch 1 of 2
2021-03-29 18:01:29,073 0 genes in training set are missing from prediction set
2021-03-29 18:01:35,129 starting batch 2 of 2
2021-03-29 18:01:35,225 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_hnscc_val (4203, 23452) (5203, 26929)


2021-03-29 18:01:49,746 input dataset shape: (9406, 21552)
2021-03-29 18:01:49,748 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:01:49,749 using validation key
2021-03-29 18:02:01,863 epoch: 1, train loss: 20.146774291992188, val loss: 23.100353240966797
2021-03-29 18:02:10,222 epoch: 2, train loss: 19.945924758911133, val loss: 22.97055435180664
2021-03-29 18:02:18,491 epoch: 3, train loss: 19.673479080200195, val loss: 22.873245239257812
2021-03-29 18:02:26,705 epoch: 4, train loss: 19.369192123413086, val loss: 22.674436569213867
2021-03-29 18:02:34,840 epoch: 5, train loss: 19.119173049926758, val loss: 22.59185028076172
2021-03-29 18:02:42,840 epoch: 6, train loss: 19.007762908935547, val loss: 22.560302734375
2021-03-29 18:02:50,856 epoch: 7, train loss: 18.911731719970703, val loss: 22.508136749267578
2021-03-29 18:02:58,783 epoch: 8, trai

False


2021-03-29 18:04:02,317 0 genes in training set are missing from prediction set
2021-03-29 18:04:02,677 starting batch 1 of 3
2021-03-29 18:04:02,776 0 genes in training set are missing from prediction set
2021-03-29 18:04:07,404 starting batch 2 of 3
2021-03-29 18:04:07,501 0 genes in training set are missing from prediction set
2021-03-29 18:04:12,788 starting batch 3 of 3
2021-03-29 18:04:12,871 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_melanoma_val (4203, 23452) (3532, 23452)


2021-03-29 18:04:19,719 input dataset shape: (7735, 23452)
2021-03-29 18:04:19,720 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:04:19,721 using validation key
2021-03-29 18:04:30,740 epoch: 1, train loss: 39.86789321899414, val loss: 44.068443298339844
2021-03-29 18:04:40,064 epoch: 2, train loss: 39.35175323486328, val loss: 43.653194427490234
2021-03-29 18:04:49,527 epoch: 3, train loss: 38.55326843261719, val loss: 42.94122314453125
2021-03-29 18:04:58,936 epoch: 4, train loss: 37.925140380859375, val loss: 42.48741912841797
2021-03-29 18:05:08,329 epoch: 5, train loss: 37.66120910644531, val loss: 42.35356903076172
2021-03-29 18:05:17,682 epoch: 6, train loss: 37.43810272216797, val loss: 42.24277114868164
2021-03-29 18:05:27,069 epoch: 7, train loss: 37.27463912963867, val loss: 42.116905212402344
2021-03-29 18:05:36,340 epoch: 8, train loss: 37.07342529296875, val loss: 42.036

True


2021-03-29 18:06:48,128 0 genes in training set are missing from prediction set
2021-03-29 18:06:48,594 starting batch 1 of 2
2021-03-29 18:06:48,704 0 genes in training set are missing from prediction set
2021-03-29 18:06:53,366 starting batch 2 of 2
2021-03-29 18:06:53,460 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_myeloma_val (4203, 23452) (3312, 24020)


2021-03-29 18:07:00,477 input dataset shape: (7515, 18083)
2021-03-29 18:07:00,479 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 18:07:00,480 using validation key
2021-03-29 18:07:10,339 epoch: 1, train loss: 37.81788635253906, val loss: 27.94200897216797
2021-03-29 18:07:18,782 epoch: 2, train loss: 37.43701934814453, val loss: 27.76767349243164
2021-03-29 18:07:27,187 epoch: 3, train loss: 36.66428756713867, val loss: 27.53423500061035
2021-03-29 18:07:35,523 epoch: 4, train loss: 36.07139205932617, val loss: 27.475345611572266
2021-03-29 18:07:44,029 epoch: 5, train loss: 35.755027770996094, val loss: 27.38805389404297
2021-03-29 18:07:52,432 epoch: 6, train loss: 35.556068420410156, val loss: 27.37712860107422
2021-03-29 18:08:00,837 epoch: 7, train loss: 35.3849983215332, val loss: 27.297128677368164
2021-03-29 18:08:09,191 epoch: 8, train loss: 3

False


2021-03-29 18:09:14,107 0 genes in training set are missing from prediction set
2021-03-29 18:09:14,412 starting batch 1 of 2
2021-03-29 18:09:14,499 0 genes in training set are missing from prediction set
2021-03-29 18:09:19,434 starting batch 2 of 2
2021-03-29 18:09:19,511 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_pbmc_val (4203, 23452) (886, 32738)


2021-03-29 18:09:24,109 input dataset shape: (5089, 16803)
2021-03-29 18:09:24,111 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:09:24,112 using validation key
2021-03-29 18:09:33,336 epoch: 1, train loss: 36.32625961303711, val loss: 21.900405883789062
2021-03-29 18:09:41,166 epoch: 2, train loss: 35.87522888183594, val loss: 21.802888870239258
2021-03-29 18:09:49,027 epoch: 3, train loss: 35.207489013671875, val loss: 21.702070236206055
2021-03-29 18:09:56,807 epoch: 4, train loss: 34.689361572265625, val loss: 21.64708137512207
2021-03-29 18:10:04,608 epoch: 5, train loss: 34.422515869140625, val loss: 21.6193904876709
2021-03-29 18:10:12,419 epoch: 6, train loss: 34.23724365234375, val loss: 21.587013244628906
2021-03-29 18:10:20,249 epoch: 7, train loss: 34.03818893432617, val loss: 21.554622650146484
2021-03-29 18:10:28,053 epoch: 8, train loss: 33.876842498779

False


2021-03-29 18:11:27,554 0 genes in training set are missing from prediction set
2021-03-29 18:11:27,868 starting batch 1 of 1
2021-03-29 18:11:27,939 0 genes in training set are missing from prediction set


scRNAseq melanoma_train_pdac_val (4203, 23452) (7840, 28756)


2021-03-29 18:11:38,815 input dataset shape: (12043, 21722)
2021-03-29 18:11:38,817 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:11:38,818 using validation key
2021-03-29 18:11:51,271 epoch: 1, train loss: 21.268186569213867, val loss: 39.236549377441406
2021-03-29 18:12:01,152 epoch: 2, train loss: 21.033279418945312, val loss: 38.98044967651367
2021-03-29 18:12:11,047 epoch: 3, train loss: 20.65072250366211, val loss: 38.26963806152344
2021-03-29 18:12:20,822 epoch: 4, train loss: 20.273828506469727, val loss: 37.650543212890625
2021-03-29 18:12:30,624 epoch: 5, train loss: 20.01540756225586, val loss: 37.42815017700195
2021-03-29 18:12:40,477 epoch: 6, train loss: 19.869762420654297, val loss: 37.251522064208984
2021-03-29 18:12:50,189 epoch: 7, train loss: 19.766510009765625, val loss: 37.151138305664

False


2021-03-29 18:14:15,340 0 genes in training set are missing from prediction set
2021-03-29 18:14:15,893 starting batch 1 of 4
2021-03-29 18:14:15,996 0 genes in training set are missing from prediction set
2021-03-29 18:14:21,183 starting batch 2 of 4
2021-03-29 18:14:21,284 0 genes in training set are missing from prediction set
2021-03-29 18:14:26,716 starting batch 3 of 4
2021-03-29 18:14:26,815 0 genes in training set are missing from prediction set
2021-03-29 18:14:31,914 starting batch 4 of 4
2021-03-29 18:14:31,987 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_brca_val (3617, 24020) (5773, 27131)


2021-03-29 18:14:40,379 input dataset shape: (9390, 19480)
2021-03-29 18:14:40,381 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 18:14:40,382 using validation key
2021-03-29 18:14:49,966 epoch: 1, train loss: 30.185911178588867, val loss: 52.753562927246094
2021-03-29 18:14:57,378 epoch: 2, train loss: 29.925243377685547, val loss: 52.67713928222656
2021-03-29 18:15:04,931 epoch: 3, train loss: 29.376407623291016, val loss: 52.28114318847656
2021-03-29 18:15:12,419 epoch: 4, train loss: 29.030336380004883, val loss: 51.9623908996582
2021-03-29 18:15:19,898 epoch: 5, train loss: 28.810150146484375, val loss: 51.71141052246094
2021-03-29 18:15:27,347 epoch: 6, train loss: 28.587753295898438, val loss: 51.46903991699219
2021-03-29 18:15:34,868 epoch: 7, train loss: 28.426963806152344, val loss: 51.34388732910156
2021-03-29 18:15:42,390 epoc

False


2021-03-29 18:16:41,513 0 genes in training set are missing from prediction set
2021-03-29 18:16:42,033 starting batch 1 of 3
2021-03-29 18:16:42,133 0 genes in training set are missing from prediction set
2021-03-29 18:16:48,115 starting batch 2 of 3
2021-03-29 18:16:48,206 0 genes in training set are missing from prediction set
2021-03-29 18:16:54,037 starting batch 3 of 3
2021-03-29 18:16:54,113 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_cesc_val (3617, 24020) (4277, 22928)


2021-03-29 18:17:02,985 input dataset shape: (7894, 18774)
2021-03-29 18:17:02,987 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-03-29 18:17:02,988 using validation key




2021-03-29 18:17:12,479 5 out of the last 17 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2021-03-29 18:17:13,056 epoch: 1, train loss: 18.742956161499023, val loss: 59.788414001464844
2021-03-29 18:17:19,991 epoch: 2, train loss: 18.54998207092285, val loss: 59.704010009765625
2021-03-29 18:17:26,942 epoch: 3, train loss: 18.409576416015625, val loss: 59.50408935546875
2021-03-29 18:17:33,919 epoch: 4, train loss: 18.076269149780273, val loss: 58.971492767333984
2021-03-29 18:17:40,851 epoch: 5, train loss: 17.960939407348633, val 

False


2021-03-29 18:18:55,731 0 genes in training set are missing from prediction set
2021-03-29 18:18:56,044 starting batch 1 of 2
2021-03-29 18:18:56,140 0 genes in training set are missing from prediction set
2021-03-29 18:19:01,880 starting batch 2 of 2
2021-03-29 18:19:01,977 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_hnscc_val (3617, 24020) (5203, 26929)


2021-03-29 18:19:12,626 input dataset shape: (8820, 19477)
2021-03-29 18:19:12,628 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 18:19:12,628 using validation key
2021-03-29 18:19:22,560 epoch: 1, train loss: 21.41759490966797, val loss: 23.495283126831055
2021-03-29 18:19:30,674 epoch: 2, train loss: 21.2471981048584, val loss: 23.404956817626953
2021-03-29 18:19:38,732 epoch: 3, train loss: 21.137876510620117, val loss: 23.35513687133789
2021-03-29 18:19:46,641 epoch: 4, train loss: 20.7531795501709, val loss: 23.275890350341797
2021-03-29 18:19:54,619 epoch: 5, train loss: 20.589008331298828, val loss: 23.18317222595215
2021-03-29 18:20:02,669 epoch: 6, train loss: 20.468395233154297, val loss: 23.114803314208984
2021-03-29 18:20:10,533 epoch: 7, train loss: 20.34046745300293, val loss: 23.03467559814453
2021-03-29 18:20:18,407 epoch: 8, train loss

False


2021-03-29 18:21:20,009 0 genes in training set are missing from prediction set
2021-03-29 18:21:20,531 starting batch 1 of 3
2021-03-29 18:21:20,628 0 genes in training set are missing from prediction set
2021-03-29 18:21:26,282 starting batch 2 of 3
2021-03-29 18:21:26,384 0 genes in training set are missing from prediction set
2021-03-29 18:21:32,589 starting batch 3 of 3
2021-03-29 18:21:32,661 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_melanoma_val (3617, 24020) (3532, 23452)


2021-03-29 18:21:38,053 input dataset shape: (7149, 18083)
2021-03-29 18:21:38,055 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg']
2021-03-29 18:21:38,055 using validation key
2021-03-29 18:21:46,890 epoch: 1, train loss: 39.175331115722656, val loss: 43.546653747558594
2021-03-29 18:21:54,229 epoch: 2, train loss: 38.879852294921875, val loss: 43.32748794555664
2021-03-29 18:22:01,546 epoch: 3, train loss: 38.178932189941406, val loss: 42.99253463745117
2021-03-29 18:22:08,762 epoch: 4, train loss: 37.67296600341797, val loss: 42.81608963012695
2021-03-29 18:22:15,939 epoch: 5, train loss: 37.42189025878906, val loss: 42.71858215332031
2021-03-29 18:22:23,290 epoch: 6, train loss: 37.192073822021484, val loss: 42.621925354003906
2021-03-29 18:22:30,553 epoch: 7, train loss: 37.02348709106445, val loss: 42.519081115722656
2021-03-29 18:22:37,850 epoch: 8, train los

False


2021-03-29 18:23:34,269 0 genes in training set are missing from prediction set
2021-03-29 18:23:34,670 starting batch 1 of 2
2021-03-29 18:23:34,759 0 genes in training set are missing from prediction set
2021-03-29 18:23:39,527 starting batch 2 of 2
2021-03-29 18:23:39,604 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_myeloma_val (3617, 24020) (3312, 24020)


2021-03-29 18:23:47,161 input dataset shape: (6929, 24020)
2021-03-29 18:23:47,163 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-03-29 18:23:47,164 using validation key




2021-03-29 18:23:56,867 5 out of the last 19 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2021-03-29 18:23:57,655 epoch: 1, train loss: 47.64554977416992, val loss: 34.83928298950195
2021-03-29 18:24:05,803 epoch: 2, train loss: 47.39059066772461, val loss: 34.68264389038086
2021-03-29 18:24:14,046 epoch: 3, train loss: 46.478267669677734, val loss: 34.34748458862305
2021-03-29 18:24:22,147 epoch: 4, train loss: 46.05284118652344, val loss: 34.10527801513672
2021-03-29 18:24:30,268 epoch: 5, train loss: 45.716278076171875, val loss:

True


2021-03-29 18:25:59,437 0 genes in training set are missing from prediction set
2021-03-29 18:25:59,916 starting batch 1 of 2
2021-03-29 18:26:00,018 0 genes in training set are missing from prediction set
2021-03-29 18:26:05,080 starting batch 2 of 2
2021-03-29 18:26:05,162 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_pbmc_val (3617, 24020) (886, 32738)


2021-03-29 18:26:10,004 input dataset shape: (4503, 19413)
2021-03-29 18:26:10,005 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-03-29 18:26:10,006 using validation key
2021-03-29 18:26:18,489 epoch: 1, train loss: 46.91442108154297, val loss: 25.635292053222656
2021-03-29 18:26:25,521 epoch: 2, train loss: 46.53828811645508, val loss: 25.413881301879883
2021-03-29 18:26:32,420 epoch: 3, train loss: 45.79205322265625, val loss: 25.20497703552246
2021-03-29 18:26:39,384 epoch: 4, train loss: 45.35774230957031, val loss: 25.133739471435547
2021-03-29 18:26:46,374 epoch: 5, train loss: 45.10458755493164, val loss: 25.121368408203125
2021-03-29 18:26:53,405 epoch: 6, train loss: 44.82707595825195, val loss: 25.0657958984375
2021-03-29 18:27:00,463 epoch: 7, train loss: 44.60515594482422, val loss: 25.015350341796875
2021-03-29 18:27:07,452 epoch: 8, train loss: 44.42

False


2021-03-29 18:28:01,322 0 genes in training set are missing from prediction set
2021-03-29 18:28:01,784 starting batch 1 of 1
2021-03-29 18:28:01,862 0 genes in training set are missing from prediction set


scRNAseq myeloma_train_pdac_val (3617, 24020) (7840, 28756)


2021-03-29 18:28:12,625 input dataset shape: (11457, 19561)
2021-03-29 18:28:12,627 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg', 'Tuft']
2021-03-29 18:28:12,628 using validation key
2021-03-29 18:28:23,184 epoch: 1, train loss: 22.532623291015625, val loss: 39.44439697265625
2021-03-29 18:28:31,442 epoch: 2, train loss: 22.321430206298828, val loss: 39.452415466308594
2021-03-29 18:28:39,657 epoch: 3, train loss: 22.05670928955078, val loss: 39.25173568725586
2021-03-29 18:28:47,865 epoch: 4, train loss: 21.653165817260742, val loss: 38.84354782104492
2021-03-29 18:28:56,123 epoch: 5, train loss: 21.480716705322266, val loss: 38.71332550048828
2021-03-29 18:29:04,310 epoch: 6, train loss: 21.339391708374023, val loss: 38.49977111816406
2021-03-29 18:29:12,530 epoch: 7, train loss: 21.197805404663086, val loss: 38.3

False


2021-03-29 18:30:26,554 0 genes in training set are missing from prediction set
2021-03-29 18:30:27,076 starting batch 1 of 4
2021-03-29 18:30:27,175 0 genes in training set are missing from prediction set
2021-03-29 18:30:33,243 starting batch 2 of 4
2021-03-29 18:30:33,334 0 genes in training set are missing from prediction set
2021-03-29 18:30:39,325 starting batch 3 of 4
2021-03-29 18:30:39,415 0 genes in training set are missing from prediction set
2021-03-29 18:30:45,305 starting batch 4 of 4
2021-03-29 18:30:45,378 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_brca_val (1609, 32738) (5773, 27131)


2021-03-29 18:30:52,296 input dataset shape: (7382, 18511)
2021-03-29 18:30:52,297 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:30:52,298 using validation key
2021-03-29 18:30:58,353 epoch: 1, train loss: 16.223684310913086, val loss: 47.192840576171875
2021-03-29 18:31:02,509 epoch: 2, train loss: 15.082178115844727, val loss: 45.089012145996094
2021-03-29 18:31:06,663 epoch: 3, train loss: 14.782570838928223, val loss: 45.73004913330078
2021-03-29 18:31:10,863 epoch: 4, train loss: 14.668508529663086, val loss: 45.745391845703125
2021-03-29 18:31:15,013 epoch: 5, train loss: 14.591130256652832, val loss: 45.76556396484375
2021-03-29 18:31:19,169 epoch: 6, train loss: 14.629130363464355, val loss: 45.64273452758789
2021-03-29 18:31:23,332 epoch: 7, train loss: 14.465635299682617, val loss: 45.67522430419922
2021-03-29 18:31:27,

False


2021-03-29 18:32:00,353 0 genes in training set are missing from prediction set
2021-03-29 18:32:00,862 starting batch 1 of 3
2021-03-29 18:32:01,739 0 genes in training set are missing from prediction set
2021-03-29 18:32:06,012 starting batch 2 of 3
2021-03-29 18:32:06,089 0 genes in training set are missing from prediction set
2021-03-29 18:32:10,497 starting batch 3 of 3
2021-03-29 18:32:10,563 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_cesc_val (1609, 32738) (4277, 22928)


2021-03-29 18:32:15,701 input dataset shape: (5886, 17080)
2021-03-29 18:32:15,703 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma']
2021-03-29 18:32:15,704 using validation key




2021-03-29 18:32:20,184 5 out of the last 35 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2021-03-29 18:32:20,869 epoch: 1, train loss: 10.519759178161621, val loss: 53.25946807861328
2021-03-29 18:32:24,365 epoch: 2, train loss: 9.808408737182617, val loss: 50.85658264160156
2021-03-29 18:32:27,900 epoch: 3, train loss: 9.517233848571777, val loss: 51.93555450439453
2021-03-29 18:32:31,387 epoch: 4, train loss: 9.393987655639648, val loss: 52.24003219604492
2021-03-29 18:32:34,889 epoch: 5, train loss: 9.327272415161133, val loss: 

False


2021-03-29 18:33:12,538 0 genes in training set are missing from prediction set
2021-03-29 18:33:12,817 starting batch 1 of 2
2021-03-29 18:33:12,897 0 genes in training set are missing from prediction set
2021-03-29 18:33:17,881 starting batch 2 of 2
2021-03-29 18:33:17,956 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_hnscc_val (1609, 32738) (5203, 26929)


2021-03-29 18:33:26,417 input dataset shape: (6812, 18521)
2021-03-29 18:33:26,418 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:33:26,419 using validation key
2021-03-29 18:33:32,397 epoch: 1, train loss: 11.639063835144043, val loss: 19.600915908813477
2021-03-29 18:33:36,597 epoch: 2, train loss: 10.797130584716797, val loss: 18.801790237426758
2021-03-29 18:33:40,801 epoch: 3, train loss: 10.470361709594727, val loss: 18.993595123291016
2021-03-29 18:33:44,973 epoch: 4, train loss: 10.34687614440918, val loss: 18.998638153076172
2021-03-29 18:33:49,173 epoch: 5, train loss: 10.365972518920898, val loss: 18.974260330200195
2021-03-29 18:33:53,405 epoch: 6, train loss: 10.324980735778809, val loss: 18.941879272460938
2021-03-29 18:33:57,590 epoch: 7, train loss: 10.364095687866211, val loss: 18.94573974609375
2021-03-29 18:34:01,700 epoch: 8

False


2021-03-29 18:34:34,602 0 genes in training set are missing from prediction set
2021-03-29 18:34:35,088 starting batch 1 of 3
2021-03-29 18:34:35,171 0 genes in training set are missing from prediction set
2021-03-29 18:34:39,795 starting batch 2 of 3
2021-03-29 18:34:39,876 0 genes in training set are missing from prediction set
2021-03-29 18:34:44,931 starting batch 3 of 3
2021-03-29 18:34:44,986 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_melanoma_val (1609, 32738) (3532, 23452)


2021-03-29 18:34:48,771 input dataset shape: (5141, 16803)
2021-03-29 18:34:48,772 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 18:34:48,773 using validation key
2021-03-29 18:34:53,976 epoch: 1, train loss: 22.498212814331055, val loss: 40.91919708251953
2021-03-29 18:34:57,770 epoch: 2, train loss: 21.17991828918457, val loss: 38.90745544433594
2021-03-29 18:35:01,600 epoch: 3, train loss: 20.682971954345703, val loss: 38.87410354614258
2021-03-29 18:35:05,362 epoch: 4, train loss: 20.641939163208008, val loss: 38.782073974609375
2021-03-29 18:35:09,170 epoch: 5, train loss: 20.61072540283203, val loss: 38.746910095214844
2021-03-29 18:35:12,947 epoch: 6, train loss: 20.405305862426758, val loss: 38.74568176269531
2021-03-29 18:35:16,741 epoch: 7, train loss: 20.505306243896484, val loss: 38.72682571411133
2021-03-29 18:35:20,484 epoch: 8, train loss: 20.414625167846

False


2021-03-29 18:35:49,839 0 genes in training set are missing from prediction set
2021-03-29 18:35:50,134 starting batch 1 of 2
2021-03-29 18:35:50,207 0 genes in training set are missing from prediction set
2021-03-29 18:35:53,860 starting batch 2 of 2
2021-03-29 18:35:53,922 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_myeloma_val (1609, 32738) (3312, 24020)


2021-03-29 18:35:59,056 input dataset shape: (4921, 19413)
2021-03-29 18:35:59,058 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2021-03-29 18:35:59,059 using validation key
2021-03-29 18:36:04,536 epoch: 1, train loss: 25.900583267211914, val loss: 34.31334686279297
2021-03-29 18:36:08,513 epoch: 2, train loss: 24.421226501464844, val loss: 33.32435989379883
2021-03-29 18:36:12,401 epoch: 3, train loss: 23.75434684753418, val loss: 33.1689567565918
2021-03-29 18:36:16,325 epoch: 4, train loss: 24.069517135620117, val loss: 33.162437438964844
2021-03-29 18:36:20,276 epoch: 5, train loss: 23.642744064331055, val loss: 33.199310302734375
2021-03-29 18:36:24,205 epoch: 6, train loss: 23.882905960083008, val loss: 33.1600341796875
2021-03-29 18:36:28,132 epoch: 7, train loss: 23.84433364868164, val loss: 33.15152359008789
2021-03-29 18:36:32,045 epoch: 8, train loss: 23.6

False


2021-03-29 18:37:03,096 0 genes in training set are missing from prediction set
2021-03-29 18:37:03,569 starting batch 1 of 2
2021-03-29 18:37:03,647 0 genes in training set are missing from prediction set
2021-03-29 18:37:08,888 starting batch 2 of 2
2021-03-29 18:37:08,949 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_pbmc_val (1609, 32738) (886, 32738)


2021-03-29 18:37:13,393 input dataset shape: (2495, 32738)
2021-03-29 18:37:13,395 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Megakaryocyte', 'Monocyte', 'NK']
2021-03-29 18:37:13,395 using validation key




2021-03-29 18:37:19,324 5 out of the last 12 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2021-03-29 18:37:19,413 6 out of the last 13 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2021-03-29 18:37:19,622 6 out of the last 14 calls to <function compute_loss at 0x7fb3c1a1bef0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
2021-03-29 18:37:20,177 epoch: 1, train loss: 43.251953125, val loss: 40.73833465576172
2021-03-29 18:37:25,312 epoch: 2, train loss: 41.561771392822266, val loss: 39.28146743774414
2021-03-29 18:37:30,492 epoch: 3, train loss: 41.27473831176758, val loss: 38.88013458251953
2021-03-29 18:37:35,775 epoch: 4, train loss: 41.19308090209961, val loss: 38.7736701965332
2021-03-29 18:37:40,734 epoch: 5, train loss: 41.027137756347656, val loss: 38.74

True


2021-03-29 18:38:35,851 0 genes in training set are missing from prediction set
2021-03-29 18:38:36,588 starting batch 1 of 1
2021-03-29 18:38:36,655 0 genes in training set are missing from prediction set


scRNAseq pbmc_train_pdac_val (1609, 32738) (7840, 28756)


2021-03-29 18:38:45,231 input dataset shape: (9449, 18993)
2021-03-29 18:38:45,233 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:38:45,234 using validation key
2021-03-29 18:38:51,847 epoch: 1, train loss: 12.48158073425293, val loss: 33.78731918334961
2021-03-29 18:38:56,351 epoch: 2, train loss: 11.178732872009277, val loss: 32.1414909362793
2021-03-29 18:39:00,856 epoch: 3, train loss: 10.862504959106445, val loss: 32.88700485229492
2021-03-29 18:39:05,382 epoch: 4, train loss: 10.73476791381836, val loss: 32.86259841918945
2021-03-29 18:39:09,884 epoch: 5, train loss: 10.77546501159668, val loss: 32.76146697998047
2021-03-29 18:39:14,370 epoch: 6, train loss: 10.72567081451416, val loss: 32.59258270263672
2021-03-29 18:39:18,904 epoch: 7, train loss: 10.649479866027832, val loss: 32.63

False


2021-03-29 18:39:59,230 0 genes in training set are missing from prediction set
2021-03-29 18:39:59,720 starting batch 1 of 4
2021-03-29 18:39:59,805 0 genes in training set are missing from prediction set
2021-03-29 18:40:04,462 starting batch 2 of 4
2021-03-29 18:40:04,544 0 genes in training set are missing from prediction set
2021-03-29 18:40:09,300 starting batch 3 of 4
2021-03-29 18:40:09,376 0 genes in training set are missing from prediction set
2021-03-29 18:40:13,905 starting batch 4 of 4
2021-03-29 18:40:13,963 0 genes in training set are missing from prediction set


scRNAseq pdac_train_brca_val (7923, 28756) (5773, 27131)


2021-03-29 18:40:31,634 input dataset shape: (13696, 26833)
2021-03-29 18:40:31,636 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:40:31,637 using validation key
2021-03-29 18:40:54,912 epoch: 1, train loss: 36.18855667114258, val loss: 34.89524841308594
2021-03-29 18:41:14,722 epoch: 2, train loss: 34.962669372558594, val loss: 33.960227966308594
2021-03-29 18:41:34,734 epoch: 3, train loss: 34.25563430786133, val loss: 33.436553955078125
2021-03-29 18:41:55,224 epoch: 4, train loss: 34.008663177490234, val loss: 33.32666778564453
2021-03-29 18:42:15,499 epoch: 5, train loss: 33.74036407470703, val loss: 33.186927795410156
2021-03-29 18:42:35,939 epoch: 6, train loss: 33.52537155151367, val loss: 33.05973815917969
2021-03-29 18:42:56,336 epoch: 7, train loss: 33.32337188720703, val loss: 32.95464324951172


False


2021-03-29 18:45:51,820 0 genes in training set are missing from prediction set
2021-03-29 18:45:52,451 starting batch 1 of 3
2021-03-29 18:45:52,560 0 genes in training set are missing from prediction set
2021-03-29 18:45:58,136 starting batch 2 of 3
2021-03-29 18:45:58,283 0 genes in training set are missing from prediction set
2021-03-29 18:46:04,009 starting batch 3 of 3
2021-03-29 18:46:04,129 0 genes in training set are missing from prediction set


scRNAseq pdac_train_cesc_val (7923, 28756) (4277, 22928)


2021-03-29 18:46:19,596 input dataset shape: (12200, 22794)
2021-03-29 18:46:19,599 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:46:19,599 using validation key
2021-03-29 18:46:39,591 epoch: 1, train loss: 27.97328758239746, val loss: 48.00811004638672
2021-03-29 18:46:56,227 epoch: 2, train loss: 27.142900466918945, val loss: 46.364593505859375
2021-03-29 18:47:12,551 epoch: 3, train loss: 26.609094619750977, val loss: 45.67044448852539
2021-03-29 18:47:29,188 epoch: 4, train loss: 26.385591506958008, val loss: 45.47735595703125
2021-03-29 18:47:45,952 epoch: 5, train loss: 26.211856842041016, val loss: 45.23320007324219
2021-03-29 18:48:02,923 epoch: 6, train loss: 26.060646057128906, val loss: 45.015472412109375
2021-03-29 18:48:19,793 epoch: 7, train loss: 25.89735984802246, val loss: 44.8372650146484

False


2021-03-29 18:50:47,359 0 genes in training set are missing from prediction set
2021-03-29 18:50:47,713 starting batch 1 of 2
2021-03-29 18:50:47,856 0 genes in training set are missing from prediction set
2021-03-29 18:50:54,126 starting batch 2 of 2
2021-03-29 18:50:54,262 0 genes in training set are missing from prediction set


scRNAseq pdac_train_hnscc_val (7923, 28756) (5203, 26929)


2021-03-29 18:51:12,401 input dataset shape: (13126, 26359)
2021-03-29 18:51:12,403 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:51:12,403 using validation key
2021-03-29 18:51:34,744 epoch: 1, train loss: 30.66773796081543, val loss: 19.07364273071289
2021-03-29 18:51:54,275 epoch: 2, train loss: 29.577966690063477, val loss: 18.864315032958984
2021-03-29 18:52:13,680 epoch: 3, train loss: 29.175716400146484, val loss: 18.730510711669922
2021-03-29 18:52:33,182 epoch: 4, train loss: 28.927330017089844, val loss: 18.657121658325195
2021-03-29 18:52:53,610 epoch: 5, train loss: 28.714731216430664, val loss: 18.56796646118164
2021-03-29 18:53:13,682 epoch: 6, train loss: 28.542953491210938, val loss: 18.513996124267578
2021-03-29 18:53:33,650 epoch: 7, train loss: 28.38966941833496, val loss: 18.46519088745

False


2021-03-29 18:56:25,639 0 genes in training set are missing from prediction set
2021-03-29 18:56:26,268 starting batch 1 of 3
2021-03-29 18:56:26,379 0 genes in training set are missing from prediction set
2021-03-29 18:56:32,400 starting batch 2 of 3
2021-03-29 18:56:32,543 0 genes in training set are missing from prediction set
2021-03-29 18:56:39,031 starting batch 3 of 3
2021-03-29 18:56:39,130 0 genes in training set are missing from prediction set


scRNAseq pdac_train_melanoma_val (7923, 28756) (3532, 23452)


2021-03-29 18:56:49,526 input dataset shape: (11455, 21722)
2021-03-29 18:56:49,528 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 18:56:49,529 using validation key
2021-03-29 18:57:07,747 epoch: 1, train loss: 36.906211853027344, val loss: 24.35124969482422
2021-03-29 18:57:23,688 epoch: 2, train loss: 35.63584899902344, val loss: 24.05741310119629
2021-03-29 18:57:39,767 epoch: 3, train loss: 34.97053909301758, val loss: 23.788114547729492
2021-03-29 18:57:55,496 epoch: 4, train loss: 34.716094970703125, val loss: 23.687488555908203
2021-03-29 18:58:11,603 epoch: 5, train loss: 34.47416305541992, val loss: 23.61444091796875
2021-03-29 18:58:27,500 epoch: 6, train loss: 34.226131439208984, val loss: 23.587196350097656
2021-03-29 18:58:43,225 epoch: 7, train loss: 34.023128509521484, val loss: 23.5276031494140

False


2021-03-29 19:01:02,321 0 genes in training set are missing from prediction set
2021-03-29 19:01:02,849 starting batch 1 of 2
2021-03-29 19:01:02,997 0 genes in training set are missing from prediction set
2021-03-29 19:01:07,415 starting batch 2 of 2
2021-03-29 19:01:07,537 0 genes in training set are missing from prediction set


scRNAseq pdac_train_myeloma_val (7923, 28756) (3312, 24020)


2021-03-29 19:01:18,928 input dataset shape: (11235, 19561)
2021-03-29 19:01:18,930 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete', 'Treg', 'Tuft']
2021-03-29 19:01:18,931 using validation key
2021-03-29 19:01:36,307 epoch: 1, train loss: 37.418235778808594, val loss: 16.29100227355957
2021-03-29 19:01:51,425 epoch: 2, train loss: 36.141841888427734, val loss: 16.134843826293945
2021-03-29 19:02:06,400 epoch: 3, train loss: 35.4984016418457, val loss: 16.05000877380371
2021-03-29 19:02:21,536 epoch: 4, train loss: 35.22568130493164, val loss: 15.981456756591797
2021-03-29 19:02:36,571 epoch: 5, train loss: 34.97743606567383, val loss: 15.913684844970703
2021-03-29 19:02:51,806 epoch: 6, train loss: 34.74728775024414, val loss: 15.881803512573242
2021-03-29 19:03:06,929 epoch: 7, train loss: 34.55387878417969, val loss: 15.829

False


2021-03-29 19:05:20,971 0 genes in training set are missing from prediction set
2021-03-29 19:05:21,461 starting batch 1 of 2
2021-03-29 19:05:21,600 0 genes in training set are missing from prediction set
2021-03-29 19:05:26,848 starting batch 2 of 2
2021-03-29 19:05:26,961 0 genes in training set are missing from prediction set


scRNAseq pdac_train_pbmc_val (7923, 28756) (886, 32738)


2021-03-29 19:05:36,122 input dataset shape: (8809, 18993)
2021-03-29 19:05:36,124 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Megakaryocyte', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 19:05:36,125 using validation key
2021-03-29 19:05:52,772 epoch: 1, train loss: 32.26777267456055, val loss: 12.131553649902344
2021-03-29 19:06:07,189 epoch: 2, train loss: 31.057533264160156, val loss: 11.991899490356445
2021-03-29 19:06:21,723 epoch: 3, train loss: 30.498069763183594, val loss: 11.906402587890625
2021-03-29 19:06:36,252 epoch: 4, train loss: 30.2569637298584, val loss: 11.842816352844238
2021-03-29 19:06:50,793 epoch: 5, train loss: 30.051855087280273, val loss: 11.851934432983398
2021-03-29 19:07:05,263 epoch: 6, train loss: 29.847761154174805, val loss: 11.810606002807617
2021-03-29 19:07:19,682 epoch: 7, train loss: 29.676923751831055, val los

False


2021-03-29 19:09:27,712 0 genes in training set are missing from prediction set
2021-03-29 19:09:28,216 starting batch 1 of 1
2021-03-29 19:09:28,336 0 genes in training set are missing from prediction set


scRNAseq pdac_train_pdac_val (7923, 28756) (7840, 28756)


2021-03-29 19:09:49,507 input dataset shape: (15763, 28756)
2021-03-29 19:09:49,509 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2021-03-29 19:09:49,510 using validation key
2021-03-29 19:10:13,294 epoch: 1, train loss: 32.066062927246094, val loss: 31.87655258178711
2021-03-29 19:10:33,395 epoch: 2, train loss: 30.996540069580078, val loss: 30.72779083251953
2021-03-29 19:10:53,109 epoch: 3, train loss: 30.39577865600586, val loss: 30.12775421142578
2021-03-29 19:11:13,042 epoch: 4, train loss: 30.177995681762695, val loss: 29.970703125
2021-03-29 19:11:33,144 epoch: 5, train loss: 29.961402893066406, val loss: 29.849388122558594
2021-03-29 19:11:53,363 epoch: 6, train loss: 29.74871253967285, val loss: 29.67755889892578
2021-03-29 19:12:13,982 epoch: 7, train loss: 29.570425033569336, val loss: 29.536752700805664
202

True


2021-03-29 19:15:13,849 0 genes in training set are missing from prediction set
2021-03-29 19:15:14,361 starting batch 1 of 4
2021-03-29 19:15:14,480 0 genes in training set are missing from prediction set
2021-03-29 19:15:21,479 starting batch 2 of 4
2021-03-29 19:15:21,649 0 genes in training set are missing from prediction set
2021-03-29 19:15:27,738 starting batch 3 of 4
2021-03-29 19:15:27,898 0 genes in training set are missing from prediction set
2021-03-29 19:15:33,833 starting batch 4 of 4
2021-03-29 19:15:33,941 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity_train_brca_gene_activity_val (3576, 19891) (3519, 19891)


2021-03-29 19:15:40,442 input dataset shape: (7095, 19891)
2021-03-29 19:15:40,444 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:15:40,445 using validation key
2021-03-29 19:15:50,296 epoch: 1, train loss: 47.73538589477539, val loss: 61.94577407836914
2021-03-29 19:15:58,259 epoch: 2, train loss: 47.58394241333008, val loss: 61.87312698364258
2021-03-29 19:16:06,240 epoch: 3, train loss: 47.221527099609375, val loss: 61.39146423339844
2021-03-29 19:16:14,314 epoch: 4, train loss: 46.90915298461914, val loss: 61.056007385253906
2021-03-29 19:16:22,296 epoch: 5, train loss: 46.61198806762695, val loss: 60.82291793823242
2021-03-29 19:16:30,201 epoch: 6, train loss: 46.337188720703125, val loss: 60.5770149230957
2021-03-29 19:16:38,065 epoch: 7, train loss: 46.119659423828125, val loss: 60.437625885009766
2021-03-29 19:16:45,860 epoch: 8, train loss: 45.95061492919922, val loss: 60.36602020263672
2021-0

True


2021-03-29 19:17:47,632 0 genes in training set are missing from prediction set
2021-03-29 19:17:47,992 starting batch 1 of 2
2021-03-29 19:17:48,161 0 genes in training set are missing from prediction set
2021-03-29 19:17:57,306 starting batch 2 of 2
2021-03-29 19:17:57,412 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity_train_brca_motif_val (3576, 19891) (3519, 633)
failed
snATACseq brca_gene_activity_train_ccrcc_gene_activity_val (3576, 19891) (3000, 19843)


2021-03-29 19:18:07,116 input dataset shape: (6576, 19815)
2021-03-29 19:18:07,118 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:18:07,118 using validation key
2021-03-29 19:18:16,306 epoch: 1, train loss: 54.697166442871094, val loss: 31.780956268310547
2021-03-29 19:18:23,789 epoch: 2, train loss: 54.492706298828125, val loss: 31.68103790283203
2021-03-29 19:18:31,207 epoch: 3, train loss: 54.018699645996094, val loss: 31.600730895996094
2021-03-29 19:18:38,642 epoch: 4, train loss: 53.729007720947266, val loss: 31.53035545349121
2021-03-29 19:18:46,072 epoch: 5, train loss: 53.28997802734375, val loss: 31.427757263183594
2021-03-29 19:18:53,463 epoch: 6, train loss: 52.969703674316406, val loss: 31.360794067382812
2021-03-29 19:19:00,929 epoch: 7, train loss: 52.7608757019043, val loss: 31.357662200927734
2021-03-29 19:19:08,352 epoch: 8, train loss: 52.53130340576172, val loss: 31.33

False


2021-03-29 19:20:06,274 0 genes in training set are missing from prediction set
2021-03-29 19:20:06,784 starting batch 1 of 2
2021-03-29 19:20:06,932 0 genes in training set are missing from prediction set
2021-03-29 19:20:15,106 starting batch 2 of 2
2021-03-29 19:20:15,186 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity_train_ccrcc_motif_val (3576, 19891) (3000, 633)
failed
snATACseq brca_gene_activity_train_gbm_gene_activity_val (3576, 19891) (2875, 19891)


2021-03-29 19:20:22,283 input dataset shape: (6451, 19891)
2021-03-29 19:20:22,285 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:20:22,286 using validation key
2021-03-29 19:20:31,324 epoch: 1, train loss: 50.70884323120117, val loss: 31.03231430053711
2021-03-29 19:20:38,765 epoch: 2, train loss: 50.566001892089844, val loss: 30.96866226196289
2021-03-29 19:20:46,250 epoch: 3, train loss: 50.07678985595703, val loss: 30.80815887451172
2021-03-29 19:20:53,761 epoch: 4, train loss: 49.882755279541016, val loss: 30.76535415649414
2021-03-29 19:21:01,236 epoch: 5, train loss: 49.567440032958984, val loss: 30.711570739746094
2021-03-29 19:21:08,748 epoch: 6, train loss: 49.25484085083008, val loss: 30.64720916748047
2021-03-29 19:21:16,152 epoch: 7, train loss: 49.031715393066406, val loss: 30.600191116333008
2021-03-29 19:21:23,637 epoch: 8, train loss: 48.78040

False


2021-03-29 19:22:21,281 0 genes in training set are missing from prediction set
2021-03-29 19:22:21,794 starting batch 1 of 2
2021-03-29 19:22:21,952 0 genes in training set are missing from prediction set
2021-03-29 19:22:30,236 starting batch 2 of 2
2021-03-29 19:22:30,316 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity_train_gbm_motif_val (3576, 19891) (2884, 633)
failed
snATACseq brca_motif_train_brca_gene_activity_val (3576, 633) (3519, 19891)
failed


2021-03-29 19:22:32,883 input dataset shape: (7095, 633)
2021-03-29 19:22:32,885 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:22:32,885 using validation key


snATACseq brca_motif_train_brca_motif_val (3576, 633) (3519, 633)


2021-03-29 19:22:35,604 epoch: 1, train loss: 9.879059791564941, val loss: 11.757186889648438
2021-03-29 19:22:37,332 epoch: 2, train loss: 8.567483901977539, val loss: 9.13492202758789
2021-03-29 19:22:39,095 epoch: 3, train loss: 7.771732807159424, val loss: 8.271858215332031
2021-03-29 19:22:40,815 epoch: 4, train loss: 7.3849287033081055, val loss: 7.748751640319824
2021-03-29 19:22:42,588 epoch: 5, train loss: 7.075709819793701, val loss: 7.345121383666992
2021-03-29 19:22:44,325 epoch: 6, train loss: 6.790960788726807, val loss: 7.041391372680664
2021-03-29 19:22:46,044 epoch: 7, train loss: 6.5323405265808105, val loss: 6.786346435546875
2021-03-29 19:22:47,791 epoch: 8, train loss: 6.3205790519714355, val loss: 6.586673736572266
2021-03-29 19:22:49,565 epoch: 9, train loss: 6.138900279998779, val loss: 6.406862735748291
2021-03-29 19:22:51,296 epoch: 10, train loss: 5.983206272125244, val loss: 6.257016181945801
2021-03-29 19:22:53,013 epoch: 11, train loss: 5.853577613830566, 

True


2021-03-29 19:23:02,734 0 genes in training set are missing from prediction set
2021-03-29 19:23:02,879 starting batch 1 of 2
2021-03-29 19:23:02,952 0 genes in training set are missing from prediction set
2021-03-29 19:23:03,119 starting batch 2 of 2
2021-03-29 19:23:03,192 0 genes in training set are missing from prediction set


snATACseq brca_motif_train_ccrcc_gene_activity_val (3576, 633) (3000, 19843)
failed
snATACseq brca_motif_train_ccrcc_motif_val (3576, 633) (3000, 633)


2021-03-29 19:23:04,125 input dataset shape: (6576, 633)
2021-03-29 19:23:04,127 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:23:04,129 using validation key
2021-03-29 19:23:06,791 epoch: 1, train loss: 9.89361572265625, val loss: 9.20771312713623
2021-03-29 19:23:08,426 epoch: 2, train loss: 8.547643661499023, val loss: 7.662665843963623
2021-03-29 19:23:10,043 epoch: 3, train loss: 7.64323091506958, val loss: 7.172702312469482
2021-03-29 19:23:11,662 epoch: 4, train loss: 7.230648517608643, val loss: 6.912325859069824
2021-03-29 19:23:13,269 epoch: 5, train loss: 6.939518451690674, val loss: 6.718210220336914
2021-03-29 19:23:14,903 epoch: 6, train loss: 6.691863536834717, val loss: 6.540562629699707
2021-03-29 19:23:16,502 epoch: 7, train loss: 6.440009593963623, val loss: 6.3503098487854
2021-03-29 19:23:18,122 epoch: 8, train loss: 6.206626892089844, val loss: 6.136599540710449
202

False


2021-03-29 19:23:31,625 0 genes in training set are missing from prediction set
2021-03-29 19:23:31,770 starting batch 1 of 2
2021-03-29 19:23:31,842 0 genes in training set are missing from prediction set
2021-03-29 19:23:32,014 starting batch 2 of 2
2021-03-29 19:23:32,082 0 genes in training set are missing from prediction set


snATACseq brca_motif_train_gbm_gene_activity_val (3576, 633) (2875, 19891)
failed
snATACseq brca_motif_train_gbm_motif_val (3576, 633) (2884, 633)


2021-03-29 19:23:32,909 input dataset shape: (6460, 633)
2021-03-29 19:23:32,911 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:23:32,911 using validation key
2021-03-29 19:23:35,404 epoch: 1, train loss: 9.202855110168457, val loss: 9.982168197631836
2021-03-29 19:23:36,959 epoch: 2, train loss: 7.945933818817139, val loss: 9.528047561645508
2021-03-29 19:23:38,543 epoch: 3, train loss: 7.214249134063721, val loss: 8.885322570800781
2021-03-29 19:23:40,106 epoch: 4, train loss: 6.852761745452881, val loss: 8.489568710327148
2021-03-29 19:23:41,711 epoch: 5, train loss: 6.594566822052002, val loss: 8.119216918945312
2021-03-29 19:23:43,275 epoch: 6, train loss: 6.354292869567871, val loss: 7.885705947875977
2021-03-29 19:23:44,842 epoch: 7, train loss: 6.124858379364014, val loss: 7.577522277832031
2021-03-29 19:23:46,413 epoch: 8, train loss: 5.91239690780639

False


2021-03-29 19:24:00,584 0 genes in training set are missing from prediction set
2021-03-29 19:24:00,724 starting batch 1 of 2
2021-03-29 19:24:00,785 0 genes in training set are missing from prediction set
2021-03-29 19:24:00,939 starting batch 2 of 2
2021-03-29 19:24:01,003 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity_train_brca_gene_activity_val (3000, 19843) (3519, 19891)


2021-03-29 19:24:06,738 input dataset shape: (6519, 19815)
2021-03-29 19:24:06,740 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:24:06,740 using validation key
2021-03-29 19:24:15,014 epoch: 1, train loss: 36.60536575317383, val loss: 70.65399932861328
2021-03-29 19:24:21,619 epoch: 2, train loss: 36.4796028137207, val loss: 70.50679016113281
2021-03-29 19:24:28,252 epoch: 3, train loss: 36.400264739990234, val loss: 70.30193328857422
2021-03-29 19:24:34,879 epoch: 4, train loss: 36.11872100830078, val loss: 69.97862243652344
2021-03-29 19:24:41,481 epoch: 5, train loss: 35.96126937866211, val loss: 69.60958862304688
2021-03-29 19:24:48,117 epoch: 6, train loss: 35.7824592590332, val loss: 69.28912353515625
2021-03-29 19:24:54,741 epoch: 7, train loss: 35.633121490478516, val loss: 68.99930572509766
2021-03-29 19:25:01,328 epoch: 8, train loss: 35.50425338745117, val loss: 68.84236145019

False


2021-03-29 19:25:52,290 0 genes in training set are missing from prediction set
2021-03-29 19:25:52,725 starting batch 1 of 2
2021-03-29 19:25:52,882 0 genes in training set are missing from prediction set
2021-03-29 19:26:02,214 starting batch 2 of 2
2021-03-29 19:26:02,285 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity_train_brca_motif_val (3000, 19843) (3519, 633)
failed
snATACseq ccrcc_gene_activity_train_ccrcc_gene_activity_val (3000, 19843) (3000, 19843)


2021-03-29 19:26:10,577 input dataset shape: (6000, 19843)
2021-03-29 19:26:10,579 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2021-03-29 19:26:10,580 using validation key
2021-03-29 19:26:18,681 epoch: 1, train loss: 46.147056579589844, val loss: 39.37275695800781
2021-03-29 19:26:25,170 epoch: 2, train loss: 46.07628631591797, val loss: 39.32566833496094
2021-03-29 19:26:31,609 epoch: 3, train loss: 45.83302688598633, val loss: 39.217010498046875
2021-03-29 19:26:38,027 epoch: 4, train loss: 45.493614196777344, val loss: 39.1090202331543
2021-03-29 19:26:44,536 epoch: 5, train loss: 45.34722137451172, val loss: 39.07698059082031
2021-03-29 19:26:50,987 epoch: 6, train loss: 45.11257553100586, val loss: 38.97314453125
2021-03-29 19:26:57,355 epoch: 7, train loss: 44.89604949951172, val loss: 38.848289489746094
2021-03-29 19:27:03,821 epoch: 8, train loss: 44.828269958496094, val loss: 38.845550537109375
2021-03-29 19:27:10,316 e

True


2021-03-29 19:27:54,579 0 genes in training set are missing from prediction set
2021-03-29 19:27:55,091 starting batch 1 of 2
2021-03-29 19:27:55,235 0 genes in training set are missing from prediction set
2021-03-29 19:28:02,959 starting batch 2 of 2
2021-03-29 19:28:03,039 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity_train_ccrcc_motif_val (3000, 19843) (3000, 633)
failed
snATACseq ccrcc_gene_activity_train_gbm_gene_activity_val (3000, 19843) (2875, 19891)


2021-03-29 19:28:09,721 input dataset shape: (5875, 19815)
2021-03-29 19:28:09,723 possible cell types: ['B cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:28:09,724 using validation key
2021-03-29 19:28:17,623 epoch: 1, train loss: 40.41608428955078, val loss: 36.47700119018555
2021-03-29 19:28:23,870 epoch: 2, train loss: 40.266143798828125, val loss: 36.400672912597656
2021-03-29 19:28:30,153 epoch: 3, train loss: 40.1932373046875, val loss: 36.3300895690918
2021-03-29 19:28:36,513 epoch: 4, train loss: 39.95098114013672, val loss: 36.20122528076172
2021-03-29 19:28:42,892 epoch: 5, train loss: 39.733150482177734, val loss: 36.145233154296875
2021-03-29 19:28:49,180 epoch: 6, train loss: 39.56770324707031, val loss: 36.032344818115234
2021-03-29 19:28:55,481 epoch: 7, train loss: 39.38338851928711, val loss: 35.97527313232422
2021-03-29 19:29:01,722 epoch: 8, train loss: 39.2247200012207, 

False


2021-03-29 19:29:50,292 0 genes in training set are missing from prediction set
2021-03-29 19:29:50,641 starting batch 1 of 2
2021-03-29 19:29:50,775 0 genes in training set are missing from prediction set
2021-03-29 19:29:59,239 starting batch 2 of 2
2021-03-29 19:29:59,313 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity_train_gbm_motif_val (3000, 19843) (2884, 633)
failed
snATACseq ccrcc_motif_train_brca_gene_activity_val (3000, 633) (3519, 19891)
failed


2021-03-29 19:30:01,931 input dataset shape: (6519, 633)
2021-03-29 19:30:01,933 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2021-03-29 19:30:01,934 using validation key


snATACseq ccrcc_motif_train_brca_motif_val (3000, 633) (3519, 633)


2021-03-29 19:30:04,165 epoch: 1, train loss: 9.062320709228516, val loss: 14.129063606262207
2021-03-29 19:30:05,572 epoch: 2, train loss: 7.363679885864258, val loss: 11.082761764526367
2021-03-29 19:30:06,960 epoch: 3, train loss: 6.873844146728516, val loss: 10.518375396728516
2021-03-29 19:30:08,335 epoch: 4, train loss: 6.611337184906006, val loss: 10.182625770568848
2021-03-29 19:30:09,712 epoch: 5, train loss: 6.339197158813477, val loss: 9.737269401550293
2021-03-29 19:30:11,095 epoch: 6, train loss: 6.124884605407715, val loss: 9.393327713012695
2021-03-29 19:30:12,457 epoch: 7, train loss: 5.94439172744751, val loss: 9.07731819152832
2021-03-29 19:30:13,847 epoch: 8, train loss: 5.776812553405762, val loss: 8.78186321258545
2021-03-29 19:30:15,225 epoch: 9, train loss: 5.613015174865723, val loss: 8.527746200561523
2021-03-29 19:30:16,578 epoch: 10, train loss: 5.470077037811279, val loss: 8.337451934814453
2021-03-29 19:30:17,982 epoch: 11, train loss: 5.350594997406006, va

False


2021-03-29 19:30:25,428 0 genes in training set are missing from prediction set
2021-03-29 19:30:25,572 starting batch 1 of 2
2021-03-29 19:30:25,631 0 genes in training set are missing from prediction set
2021-03-29 19:30:25,796 starting batch 2 of 2
2021-03-29 19:30:25,852 0 genes in training set are missing from prediction set


snATACseq ccrcc_motif_train_ccrcc_gene_activity_val (3000, 633) (3000, 19843)
failed


2021-03-29 19:30:26,653 input dataset shape: (6000, 633)
2021-03-29 19:30:26,655 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2021-03-29 19:30:26,656 using validation key


snATACseq ccrcc_motif_train_ccrcc_motif_val (3000, 633) (3000, 633)


2021-03-29 19:30:28,976 epoch: 1, train loss: 9.228902816772461, val loss: 9.815589904785156
2021-03-29 19:30:30,444 epoch: 2, train loss: 7.738187789916992, val loss: 8.117961883544922
2021-03-29 19:30:31,915 epoch: 3, train loss: 7.150430202484131, val loss: 7.645640850067139
2021-03-29 19:30:33,398 epoch: 4, train loss: 6.748775959014893, val loss: 7.021673202514648
2021-03-29 19:30:34,878 epoch: 5, train loss: 6.431192398071289, val loss: 6.555154800415039
2021-03-29 19:30:36,354 epoch: 6, train loss: 6.18854284286499, val loss: 6.263713836669922
2021-03-29 19:30:37,825 epoch: 7, train loss: 5.992577075958252, val loss: 6.073598861694336
2021-03-29 19:30:39,289 epoch: 8, train loss: 5.801839828491211, val loss: 5.892019271850586
2021-03-29 19:30:40,745 epoch: 9, train loss: 5.6277265548706055, val loss: 5.745684623718262
2021-03-29 19:30:42,210 epoch: 10, train loss: 5.486968517303467, val loss: 5.626704216003418
2021-03-29 19:30:43,672 epoch: 11, train loss: 5.358108997344971, val

True


2021-03-29 19:30:51,839 0 genes in training set are missing from prediction set
2021-03-29 19:30:51,983 starting batch 1 of 2
2021-03-29 19:30:52,047 0 genes in training set are missing from prediction set
2021-03-29 19:30:52,221 starting batch 2 of 2
2021-03-29 19:30:52,286 0 genes in training set are missing from prediction set


snATACseq ccrcc_motif_train_gbm_gene_activity_val (3000, 633) (2875, 19891)
failed
snATACseq ccrcc_motif_train_gbm_motif_val (3000, 633) (2884, 633)


2021-03-29 19:30:53,046 input dataset shape: (5884, 633)
2021-03-29 19:30:53,048 possible cell types: ['B cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:30:53,049 using validation key
2021-03-29 19:30:55,350 epoch: 1, train loss: 8.240008354187012, val loss: 10.730701446533203
2021-03-29 19:30:56,692 epoch: 2, train loss: 7.018274784088135, val loss: 10.241304397583008
2021-03-29 19:30:58,045 epoch: 3, train loss: 6.491064071655273, val loss: 9.794795989990234
2021-03-29 19:30:59,419 epoch: 4, train loss: 6.236212730407715, val loss: 9.601320266723633
2021-03-29 19:31:00,759 epoch: 5, train loss: 6.051827907562256, val loss: 9.32052230834961
2021-03-29 19:31:02,131 epoch: 6, train loss: 5.895059585571289, val loss: 9.107036590576172
2021-03-29 19:31:03,479 epoch: 7, train loss: 5.737613201141357, val loss: 8.856704711914062
2021-03-29 19:31:04,813 epoch: 8, train loss: 5.572751998901367, val

False


2021-03-29 19:31:16,205 0 genes in training set are missing from prediction set
2021-03-29 19:31:16,344 starting batch 1 of 2
2021-03-29 19:31:16,398 0 genes in training set are missing from prediction set
2021-03-29 19:31:16,558 starting batch 2 of 2
2021-03-29 19:31:16,613 0 genes in training set are missing from prediction set


snATACseq gbm_gene_activity_train_brca_gene_activity_val (3390, 19891) (3519, 19891)


2021-03-29 19:31:22,275 input dataset shape: (6909, 19891)
2021-03-29 19:31:22,277 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:31:22,278 using validation key
2021-03-29 19:31:31,154 epoch: 1, train loss: 42.82967758178711, val loss: 64.04055786132812
2021-03-29 19:31:38,414 epoch: 2, train loss: 42.689693450927734, val loss: 63.924400329589844
2021-03-29 19:31:45,607 epoch: 3, train loss: 42.316436767578125, val loss: 63.526981353759766
2021-03-29 19:31:52,742 epoch: 4, train loss: 42.05997848510742, val loss: 63.323856353759766
2021-03-29 19:31:59,917 epoch: 5, train loss: 41.83797073364258, val loss: 63.23320007324219
2021-03-29 19:32:07,162 epoch: 6, train loss: 41.57749557495117, val loss: 63.05408477783203
2021-03-29 19:32:14,395 epoch: 7, train loss: 41.376312255859375, val loss: 62.8991813659668
2021-03-29 19:32:21,583 epoch: 8, train loss: 41.201480

False


2021-03-29 19:33:17,323 0 genes in training set are missing from prediction set
2021-03-29 19:33:17,847 starting batch 1 of 2
2021-03-29 19:33:18,034 0 genes in training set are missing from prediction set
2021-03-29 19:33:27,595 starting batch 2 of 2
2021-03-29 19:33:27,685 0 genes in training set are missing from prediction set


snATACseq gbm_gene_activity_train_brca_motif_val (3390, 19891) (3519, 633)
failed
snATACseq gbm_gene_activity_train_ccrcc_gene_activity_val (3390, 19891) (3000, 19843)


2021-03-29 19:33:36,791 input dataset shape: (6390, 19815)
2021-03-29 19:33:36,793 possible cell types: ['B cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:33:36,794 using validation key
2021-03-29 19:33:46,448 epoch: 1, train loss: 50.80326843261719, val loss: 34.03038024902344
2021-03-29 19:33:53,608 epoch: 2, train loss: 50.58797836303711, val loss: 33.946754455566406
2021-03-29 19:34:00,802 epoch: 3, train loss: 50.04458999633789, val loss: 33.79859161376953
2021-03-29 19:34:07,973 epoch: 4, train loss: 49.82723617553711, val loss: 33.80309295654297
2021-03-29 19:34:15,088 epoch: 5, train loss: 49.50973892211914, val loss: 33.70692443847656
2021-03-29 19:34:22,165 epoch: 6, train loss: 49.15999984741211, val loss: 33.63518524169922
2021-03-29 19:34:29,195 epoch: 7, train loss: 48.893978118896484, val loss: 33.597721099853516
2021-03-29 19:34:36,266 epoch: 8, train loss: 48.70989227294922,

False


2021-03-29 19:35:30,979 0 genes in training set are missing from prediction set
2021-03-29 19:35:31,358 starting batch 1 of 2
2021-03-29 19:35:31,496 0 genes in training set are missing from prediction set
2021-03-29 19:35:39,570 starting batch 2 of 2
2021-03-29 19:35:39,648 0 genes in training set are missing from prediction set


snATACseq gbm_gene_activity_train_ccrcc_motif_val (3390, 19891) (3000, 633)
failed
snATACseq gbm_gene_activity_train_gbm_gene_activity_val (3390, 19891) (2875, 19891)


2021-03-29 19:35:45,984 input dataset shape: (6265, 19891)
2021-03-29 19:35:45,986 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:35:45,987 using validation key
2021-03-29 19:35:54,960 epoch: 1, train loss: 46.644935607910156, val loss: 32.53227996826172
2021-03-29 19:36:02,082 epoch: 2, train loss: 46.45624923706055, val loss: 32.34149932861328
2021-03-29 19:36:09,252 epoch: 3, train loss: 46.004913330078125, val loss: 32.18382263183594
2021-03-29 19:36:16,370 epoch: 4, train loss: 45.77650833129883, val loss: 32.11737823486328
2021-03-29 19:36:23,538 epoch: 5, train loss: 45.50642013549805, val loss: 31.974498748779297
2021-03-29 19:36:30,678 epoch: 6, train loss: 45.21806716918945, val loss: 31.89142608642578
2021-03-29 19:36:37,845 epoch: 7, train loss: 44.969417572021484, val loss: 31.789505004882812
2021-03-29 19:36:45,001 epoch: 8, train loss: 44.78163146972656, val loss: 31.7616214752

True


2021-03-29 19:37:41,185 0 genes in training set are missing from prediction set
2021-03-29 19:37:41,691 starting batch 1 of 2
2021-03-29 19:37:41,843 0 genes in training set are missing from prediction set
2021-03-29 19:37:50,071 starting batch 2 of 2
2021-03-29 19:37:50,159 0 genes in training set are missing from prediction set


snATACseq gbm_gene_activity_train_gbm_motif_val (3390, 19891) (2884, 633)
failed
snATACseq gbm_motif_train_brca_gene_activity_val (3381, 633) (3519, 19891)
failed


2021-03-29 19:37:52,750 input dataset shape: (6900, 633)
2021-03-29 19:37:52,752 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:37:52,753 using validation key


snATACseq gbm_motif_train_brca_motif_val (3381, 633) (3519, 633)


2021-03-29 19:37:55,221 epoch: 1, train loss: 9.472312927246094, val loss: 12.052694320678711
2021-03-29 19:37:56,756 epoch: 2, train loss: 7.677891731262207, val loss: 8.985247611999512
2021-03-29 19:37:58,296 epoch: 3, train loss: 7.127804279327393, val loss: 8.447019577026367
2021-03-29 19:37:59,845 epoch: 4, train loss: 6.740544319152832, val loss: 8.12708568572998
2021-03-29 19:38:01,363 epoch: 5, train loss: 6.388997554779053, val loss: 7.7296223640441895
2021-03-29 19:38:02,891 epoch: 6, train loss: 6.135955333709717, val loss: 7.467185020446777
2021-03-29 19:38:04,405 epoch: 7, train loss: 5.929812908172607, val loss: 7.1704912185668945
2021-03-29 19:38:05,946 epoch: 8, train loss: 5.7465009689331055, val loss: 6.9373955726623535
2021-03-29 19:38:07,485 epoch: 9, train loss: 5.598974704742432, val loss: 6.762215614318848
2021-03-29 19:38:08,991 epoch: 10, train loss: 5.466940402984619, val loss: 6.622872829437256
2021-03-29 19:38:10,562 epoch: 11, train loss: 5.359436988830566,

False


2021-03-29 19:38:18,935 0 genes in training set are missing from prediction set
2021-03-29 19:38:19,074 starting batch 1 of 2
2021-03-29 19:38:19,139 0 genes in training set are missing from prediction set
2021-03-29 19:38:19,303 starting batch 2 of 2
2021-03-29 19:38:19,366 0 genes in training set are missing from prediction set


snATACseq gbm_motif_train_ccrcc_gene_activity_val (3381, 633) (3000, 19843)
failed
snATACseq gbm_motif_train_ccrcc_motif_val (3381, 633) (3000, 633)


2021-03-29 19:38:20,329 input dataset shape: (6381, 633)
2021-03-29 19:38:20,332 possible cell types: ['B cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:38:20,333 using validation key
2021-03-29 19:38:22,926 epoch: 1, train loss: 9.069759368896484, val loss: 7.731391906738281
2021-03-29 19:38:24,454 epoch: 2, train loss: 7.6767683029174805, val loss: 6.839583396911621
2021-03-29 19:38:25,997 epoch: 3, train loss: 7.092930793762207, val loss: 6.5353875160217285
2021-03-29 19:38:27,523 epoch: 4, train loss: 6.668084621429443, val loss: 6.201239109039307
2021-03-29 19:38:29,077 epoch: 5, train loss: 6.26027250289917, val loss: 5.895917892456055
2021-03-29 19:38:30,608 epoch: 6, train loss: 5.979154586791992, val loss: 5.690249443054199
2021-03-29 19:38:32,135 epoch: 7, train loss: 5.779459476470947, val loss: 5.549208641052246
2021-03-29 19:38:33,684 epoch: 8, train loss: 5.6075263023376465, va

False


2021-03-29 19:38:46,594 0 genes in training set are missing from prediction set
2021-03-29 19:38:46,742 starting batch 1 of 2
2021-03-29 19:38:46,807 0 genes in training set are missing from prediction set
2021-03-29 19:38:46,982 starting batch 2 of 2
2021-03-29 19:38:47,045 0 genes in training set are missing from prediction set


snATACseq gbm_motif_train_gbm_gene_activity_val (3381, 633) (2875, 19891)
failed
snATACseq gbm_motif_train_gbm_motif_val (3381, 633) (2884, 633)


2021-03-29 19:38:47,754 input dataset shape: (6265, 633)
2021-03-29 19:38:47,756 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 19:38:47,757 using validation key
2021-03-29 19:38:50,328 epoch: 1, train loss: 9.266752243041992, val loss: 8.322961807250977
2021-03-29 19:38:51,921 epoch: 2, train loss: 7.646029949188232, val loss: 7.193948268890381
2021-03-29 19:38:53,509 epoch: 3, train loss: 7.0615057945251465, val loss: 6.713696479797363
2021-03-29 19:38:55,099 epoch: 4, train loss: 6.682766437530518, val loss: 6.4473748207092285
2021-03-29 19:38:56,701 epoch: 5, train loss: 6.319805145263672, val loss: 6.157022953033447
2021-03-29 19:38:58,321 epoch: 6, train loss: 6.051654815673828, val loss: 5.919748306274414
2021-03-29 19:38:59,910 epoch: 7, train loss: 5.83771276473999, val loss: 5.739931583404541
2021-03-29 19:39:01,497 epoch: 8, train loss: 5.674844264984131, val loss: 5.623539924621582
2

True


2021-03-29 19:39:15,135 0 genes in training set are missing from prediction set
2021-03-29 19:39:15,279 starting batch 1 of 2
2021-03-29 19:39:15,347 0 genes in training set are missing from prediction set
2021-03-29 19:39:15,503 starting batch 2 of 2
2021-03-29 19:39:15,568 0 genes in training set are missing from prediction set


snRNAseq brca_train_brca_val (5254, 29175) (4891, 29175)


2021-03-29 19:39:26,171 input dataset shape: (10145, 29175)
2021-03-29 19:39:26,173 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 19:39:26,174 using validation key
2021-03-29 19:39:43,698 epoch: 1, train loss: 42.07243728637695, val loss: 49.87724304199219
2021-03-29 19:39:58,767 epoch: 2, train loss: 41.449989318847656, val loss: 48.92787170410156
2021-03-29 19:40:13,702 epoch: 3, train loss: 40.808555603027344, val loss: 48.17041015625
2021-03-29 19:40:28,638 epoch: 4, train loss: 40.30082321166992, val loss: 47.885276794433594
2021-03-29 19:40:43,448 epoch: 5, train loss: 40.0721321105957, val loss: 47.71094512939453
2021-03-29 19:40:58,608 epoch: 6, train loss: 39.87150955200195, val loss: 47.57929229736328
2021-03-29 19:41:13,619 epoch: 7, train loss: 39.71698760986328, val loss: 47.45453643798828
2021-03-29 19:41:28,776 epoch: 8, train loss: 39.

True


2021-03-29 19:43:26,921 0 genes in training set are missing from prediction set
2021-03-29 19:43:27,617 starting batch 1 of 2
2021-03-29 19:43:27,757 0 genes in training set are missing from prediction set
2021-03-29 19:43:34,133 starting batch 2 of 2
2021-03-29 19:43:34,258 0 genes in training set are missing from prediction set


snRNAseq brca_train_ccrcc_val (5254, 29175) (4525, 33538)


2021-03-29 19:43:51,315 input dataset shape: (9779, 29175)
2021-03-29 19:43:51,317 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 19:43:51,318 using validation key
2021-03-29 19:44:08,353 epoch: 1, train loss: 41.076332092285156, val loss: 54.77976608276367
2021-03-29 19:44:22,897 epoch: 2, train loss: 40.70975875854492, val loss: 54.16603088378906
2021-03-29 19:44:37,433 epoch: 3, train loss: 40.151275634765625, val loss: 53.60441970825195
2021-03-29 19:44:51,719 epoch: 4, train loss: 39.6126594543457, val loss: 53.29629135131836
2021-03-29 19:45:06,312 epoch: 5, train loss: 39.29072570800781, val loss: 53.26311111450195
2021-03-29 19:45:20,810 epoch: 6, train loss: 39.077030181884766, val loss: 53.211708068847656
2021-03-29 19:45:35,430 epoch: 7, train loss: 39.0013313293457, val loss: 53.17617416381836
2021-03-29 19:45:49,826 epoch: 8,

False


2021-03-29 19:47:43,865 0 genes in training set are missing from prediction set
2021-03-29 19:47:44,546 starting batch 1 of 2
2021-03-29 19:47:44,677 0 genes in training set are missing from prediction set
2021-03-29 19:47:51,356 starting batch 2 of 2
2021-03-29 19:47:51,479 0 genes in training set are missing from prediction set


snRNAseq brca_train_gbm_val (5254, 29175) (3554, 29748)


2021-03-29 19:48:06,236 input dataset shape: (8808, 28713)
2021-03-29 19:48:06,238 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2021-03-29 19:48:06,239 using validation key
2021-03-29 19:48:23,202 epoch: 1, train loss: 43.71079635620117, val loss: 43.03593444824219
2021-03-29 19:48:37,372 epoch: 2, train loss: 42.99290084838867, val loss: 42.676963806152344
2021-03-29 19:48:51,613 epoch: 3, train loss: 42.39483642578125, val loss: 42.526283264160156
2021-03-29 19:49:05,790 epoch: 4, train loss: 41.90628433227539, val loss: 42.440460205078125
2021-03-29 19:49:20,211 epoch: 5, train loss: 41.58183288574219, val loss: 42.40070724487305
2021-03-29 19:49:35,348 epoch: 6, train loss: 41.424888610839844, val loss: 42.35060501098633
2021-03-29 19:49:49,703 epoch: 7, train loss: 41.24309158325195, val loss: 42.318611

False


2021-03-29 19:51:54,397 0 genes in training set are missing from prediction set
2021-03-29 19:51:55,079 starting batch 1 of 2
2021-03-29 19:51:55,216 0 genes in training set are missing from prediction set
2021-03-29 19:52:01,673 starting batch 2 of 2
2021-03-29 19:52:01,778 0 genes in training set are missing from prediction set


snRNAseq ccrcc_train_brca_val (4747, 33538) (4891, 29175)


2021-03-29 19:52:14,989 input dataset shape: (9638, 29175)
2021-03-29 19:52:14,991 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 19:52:14,992 using validation key
2021-03-29 19:52:31,119 epoch: 1, train loss: 36.343116760253906, val loss: 50.280357360839844
2021-03-29 19:52:44,407 epoch: 2, train loss: 35.616615295410156, val loss: 50.03634262084961
2021-03-29 19:52:57,640 epoch: 3, train loss: 34.86154556274414, val loss: 49.84489440917969
2021-03-29 19:53:11,041 epoch: 4, train loss: 34.619022369384766, val loss: 49.733802795410156
2021-03-29 19:53:24,363 epoch: 5, train loss: 34.41267395019531, val loss: 49.680885314941406
2021-03-29 19:53:37,706 epoch: 6, train loss: 34.278724670410156, val loss: 49.626522064208984
2021-03-29 19:53:51,057 epoch: 7, train loss: 34.114501953125, val loss: 49.597957611083984
2021-03-29 19:54:04,490 epoc

False


2021-03-29 19:55:47,489 0 genes in training set are missing from prediction set
2021-03-29 19:55:48,177 starting batch 1 of 2
2021-03-29 19:55:48,294 0 genes in training set are missing from prediction set
2021-03-29 19:55:54,670 starting batch 2 of 2
2021-03-29 19:55:54,775 0 genes in training set are missing from prediction set


snRNAseq ccrcc_train_ccrcc_val (4747, 33538) (4525, 33538)


2021-03-29 19:56:11,939 input dataset shape: (9272, 33538)
2021-03-29 19:56:11,942 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2021-03-29 19:56:11,943 using validation key
2021-03-29 19:56:29,385 epoch: 1, train loss: 40.1624755859375, val loss: 59.25124740600586
2021-03-29 19:56:44,016 epoch: 2, train loss: 39.315059661865234, val loss: 57.771034240722656
2021-03-29 19:56:58,754 epoch: 3, train loss: 38.94365692138672, val loss: 56.860145568847656
2021-03-29 19:57:13,746 epoch: 4, train loss: 38.787803649902344, val loss: 56.48268508911133
2021-03-29 19:57:28,715 epoch: 5, train loss: 38.56843948364258, val loss: 56.19133758544922
2021-03-29 19:57:43,882 epoch: 6, train loss: 38.125797271728516, val loss: 55.834068298339844
2021-03-29 19:57:58,943 epoch: 7, train loss: 38.01652526855469, val loss: 55.56856918334961
2021-03-29 19:58:13,911 epoch: 8, train loss: 37.673839569091

True


2021-03-29 20:00:10,688 0 genes in training set are missing from prediction set
2021-03-29 20:00:11,452 starting batch 1 of 2
2021-03-29 20:00:11,539 0 genes in training set are missing from prediction set
2021-03-29 20:00:18,833 starting batch 2 of 2
2021-03-29 20:00:18,949 0 genes in training set are missing from prediction set


snRNAseq ccrcc_train_gbm_val (4747, 33538) (3554, 29748)


2021-03-29 20:00:33,936 input dataset shape: (8301, 29748)
2021-03-29 20:00:33,938 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2021-03-29 20:00:33,939 using validation key
2021-03-29 20:00:48,361 epoch: 1, train loss: 38.992069244384766, val loss: 44.54145050048828
2021-03-29 20:01:00,524 epoch: 2, train loss: 37.920745849609375, val loss: 44.413307189941406
2021-03-29 20:01:12,688 epoch: 3, train loss: 37.384944915771484, val loss: 44.29132080078125
2021-03-29 20:01:24,778 epoch: 4, train loss: 37.1844482421875, val loss: 44.19805908203125
2021-03-29 20:01:36,866 epoch: 5, train loss: 36.90092468261719, val loss: 44.134586334228516
2021-03-29 20:01:48,990 epoch: 6, train loss: 36.5786018371582, val loss: 44.07411193847656
2021-03-29 20:02:01,244 epoch: 7, train loss: 36.35905838012695, val loss: 44.02925109863281

False


2021-03-29 20:03:46,998 0 genes in training set are missing from prediction set
2021-03-29 20:03:47,689 starting batch 1 of 2
2021-03-29 20:03:47,807 0 genes in training set are missing from prediction set
2021-03-29 20:03:54,476 starting batch 2 of 2
2021-03-29 20:03:54,577 0 genes in training set are missing from prediction set


snRNAseq gbm_train_brca_val (3745, 29748) (4891, 29175)


2021-03-29 20:04:06,695 input dataset shape: (8636, 28713)
2021-03-29 20:04:06,697 possible cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2021-03-29 20:04:06,698 using validation key
2021-03-29 20:04:19,529 epoch: 1, train loss: 43.57672119140625, val loss: 53.090301513671875
2021-03-29 20:04:30,228 epoch: 2, train loss: 42.820167541503906, val loss: 52.846107482910156
2021-03-29 20:04:40,784 epoch: 3, train loss: 42.024444580078125, val loss: 52.67784881591797
2021-03-29 20:04:51,321 epoch: 4, train loss: 41.60187530517578, val loss: 52.45075607299805
2021-03-29 20:05:01,870 epoch: 5, train loss: 41.278968811035156, val loss: 52.275184631347656
2021-03-29 20:05:12,488 epoch: 6, train loss: 41.01976776123047, val loss: 52.217445373535156
2021-03-29 20:05:23,349 epoch: 7, train loss: 40.79929733276367, val loss: 52.153

False


2021-03-29 20:06:56,910 0 genes in training set are missing from prediction set
2021-03-29 20:06:57,579 starting batch 1 of 2
2021-03-29 20:06:57,678 0 genes in training set are missing from prediction set
2021-03-29 20:07:04,132 starting batch 2 of 2
2021-03-29 20:07:04,226 0 genes in training set are missing from prediction set


snRNAseq gbm_train_ccrcc_val (3745, 29748) (4525, 33538)


2021-03-29 20:07:20,642 input dataset shape: (8270, 29748)
2021-03-29 20:07:20,644 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'T cells', 'Treg']
2021-03-29 20:07:20,645 using validation key
2021-03-29 20:07:33,175 epoch: 1, train loss: 44.688148498535156, val loss: 59.05528259277344
2021-03-29 20:07:43,175 epoch: 2, train loss: 44.02775192260742, val loss: 58.758094787597656
2021-03-29 20:07:53,548 epoch: 3, train loss: 43.206512451171875, val loss: 58.37523651123047
2021-03-29 20:08:03,579 epoch: 4, train loss: 42.84215545654297, val loss: 57.98307800292969
2021-03-29 20:08:13,649 epoch: 5, train loss: 42.54443359375, val loss: 57.75287628173828
2021-03-29 20:08:24,121 epoch: 6, train loss: 42.35531997680664, val loss: 57.60797882080078
2021-03-29 20:08:34,558 epoch: 7, train loss: 42.10585021972656, val loss: 57.41252899169922
20

False


2021-03-29 20:10:04,883 0 genes in training set are missing from prediction set
2021-03-29 20:10:05,586 starting batch 1 of 2
2021-03-29 20:10:05,706 0 genes in training set are missing from prediction set
2021-03-29 20:10:12,586 starting batch 2 of 2
2021-03-29 20:10:12,659 0 genes in training set are missing from prediction set


snRNAseq gbm_train_gbm_val (3745, 29748) (3554, 29748)


2021-03-29 20:10:27,864 input dataset shape: (7299, 29748)
2021-03-29 20:10:27,865 possible cell types: ['B cell', 'Endothelial', 'Fibroblast', 'Malignant', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendrocytes', 'T cells']
2021-03-29 20:10:27,866 using validation key
2021-03-29 20:10:40,863 epoch: 1, train loss: 48.398319244384766, val loss: 47.211524963378906
2021-03-29 20:10:51,454 epoch: 2, train loss: 47.63067626953125, val loss: 46.73516082763672
2021-03-29 20:11:01,980 epoch: 3, train loss: 46.666664123535156, val loss: 46.19440841674805
2021-03-29 20:11:12,114 epoch: 4, train loss: 46.169090270996094, val loss: 45.935325622558594
2021-03-29 20:11:22,591 epoch: 5, train loss: 45.88922119140625, val loss: 45.82231140136719
2021-03-29 20:11:33,000 epoch: 6, train loss: 45.67880630493164, val loss: 45.72295379638672
2021-03-29 20:11:43,645 epoch: 7, train loss: 45.442447662353516, val loss: 45.66011047363281
2021-03-29 20:11:54,174 epoch: 8, train loss: 45.214324951171875, val loss:

True


2021-03-29 20:13:16,126 0 genes in training set are missing from prediction set
2021-03-29 20:13:16,820 starting batch 1 of 2
2021-03-29 20:13:16,953 0 genes in training set are missing from prediction set
2021-03-29 20:13:24,214 starting batch 2 of 2
2021-03-29 20:13:24,307 0 genes in training set are missing from prediction set


In [None]:
# run_workflow_for_cross_datatype(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# a = sc.read_h5ad(adata_map['snATACseq']['gbm']['train'])
# a

In [None]:
train, val = sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
module_dir = os.path.join(SANDBOX_DIR, 'temp_module')

In [None]:
train.obs['is_validation'] = [False] * train.shape[0]
val.obs['is_validation'] = [True] * val.shape[0]
combined = train.concatenate(val)
combined

In [None]:
train.shape, val.shape

In [None]:
np.count_nonzero(combined.obs['is_validation']), np.count_nonzero(~combined.obs['is_validation'])

In [None]:
# pds = PollockDataset(train, cell_type_key=CELL_TYPE_KEY,
#                      dataset_type='training')

In [None]:
val.shape

In [None]:
pds = PollockDataset(combined, cell_type_key=CELL_TYPE_KEY,
                     dataset_type='training', validation_key='is_validation')

In [None]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)

In [None]:
pm.fit(pds, epochs=2)

In [None]:
pm.save(pds, module_dir)

In [None]:
val.shape

In [None]:
preds = predict_from_anndata(val.copy(),
        '/home/estorrs/pollock/benchmarking/sandbox/temp_module', adata_batch_size=10000)
preds

In [None]:
df = pd.DataFrame.from_dict({
    'cell_id': preds.index.to_list(),
    'groundtruth': val.obs.loc[preds.index][CELL_TYPE_KEY].to_list(),
    'predicted': preds['predicted_cell_type'],
    'probability': preds['cell_type_probability']
})
df

##### scanpy ingest

In [None]:
def ingest_preprocess(adata):
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500)
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata)
    
    return adata

def run_scanpy_workflow(train, val, cell_type_key):
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names]
    val = val[:, var_names]
    
    groundtruth = val.obs[cell_type_key].to_list()

    sc.pp.pca(train)
    sc.pp.neighbors(train)
    sc.tl.umap(train)
    
    sc.tl.ingest(val, train, obs=cell_type_key)
    
    df = pd.DataFrame.from_dict({
        'cell_id': val.obs.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': val.obs[cell_type_key].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
    
    return df

In [None]:
run_workflow_for_datasets(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DISEASE_DIR)

In [None]:
# run_workflow_for_cross_datatype(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
train, val = ingest_preprocess(train), ingest_preprocess(val)

var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

sc.pp.pca(train)
sc.pp.neighbors(train)
sc.tl.umap(train)

In [None]:
sc.pl.umap(train, color='cell_type')

In [None]:
sc.tl.ingest(val, train, obs=CELL_TYPE_KEY)
val.uns[f'{CELL_TYPE_KEY}_colors'] = train.uns[f'{CELL_TYPE_KEY}_colors']

In [None]:
sc.pl.umap(val, color=[CELL_TYPE_KEY], wspace=0.5)


In [None]:
val

In [None]:
val.obs

##### ACTINN

In [6]:
def run_actinn_workflow(train, val, cell_type_key):
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts_df = pd.DataFrame(data=X.transpose(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts_df = pd.DataFrame(data=X.transpose(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
    
    train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
    val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
    train_counts_df.to_csv(train_counts_fp, sep='\t')
    val_counts_df.to_csv(val_counts_fp, sep='\t')
    
    train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
    train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
    val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

    subprocess.check_output(('python', ACTINN_FORMAT, '-i', train_counts_fp,
                            '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', val_counts_fp,
                            '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))
    # dont use probablity argument or it breaks
    subprocess.check_output(('python', ACTINN_PREDICT, '-trs', train_h5_fp,
                            '-trl', train_annotations_fp, '-ts', val_h5_fp))
    
    prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
    
    df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'predicted': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * prediction_df.shape[0]
    })
    
    df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
    df = df[['cell_id', 'cell_type', 'predicted', 'probability']]
    df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
    
    return df
    
    
    
    


In [7]:
ACTINN_FORMAT = '/home/estorrs/ACTINN/actinn_format.py'
ACTINN_PREDICT = '/home/estorrs/ACTINN/actinn_predict.py'

run_workflow_for_datasets(adata_map, run_actinn_workflow, 'actinn', RESULTS_DIR)

scRNAseq brca (6105, 27131) (5748, 27131)
scRNAseq cesc (4661, 22928) (4276, 22928)
scRNAseq hnscc (5287, 26929) (5201, 26929)
scRNAseq melanoma (4218, 23452) (3517, 23452)


Traceback (most recent call last):
  File "<ipython-input-5-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-5-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-6-8b9bf882e9f0>", line 21, in run_actinn_workflow
    '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 411, in check_output
    **kwargs).stdout
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', '/home/estorrs/pollock/benchmarking/sandbox/train_counts.txt', '-o', '/home/estorrs/pollock/benchmarking/sandbox/train', '-f', 'txt')' returned non-zero exit status 1.


None
scRNAseq myeloma (3617, 24020) (3312, 24020)


Traceback (most recent call last):
  File "<ipython-input-5-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-5-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-6-8b9bf882e9f0>", line 37, in run_actinn_workflow
    df = df[['cell_id', 'cell_type', 'predicted', 'probability']]
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/site-packages/pandas/core/frame.py", line 3030, in __getitem__
    indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/site-packages/pandas/core/indexing.py", line 1266, in _get_listlike_indexer
    self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/site-packages/pandas/core/indexing.py", line 1316, in _validate_read_indexer
    raise KeyError(f"{not_found} not in index")
Key

None
scRNAseq pdac (7940, 28756) (7823, 28756)
snATACseq brca_gene_activity (3576, 19891) (3519, 19891)
snATACseq brca_motif (3576, 633) (3519, 633)


Traceback (most recent call last):
  File "<ipython-input-5-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-5-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-6-8b9bf882e9f0>", line 26, in run_actinn_workflow
    '-trl', train_annotations_fp, '-ts', val_h5_fp))
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 411, in check_output
    **kwargs).stdout
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', '/home/estorrs/pollock/benchmarking/sandbox/train.h5', '-trl', '/home/estorrs/pollock/benchmarking/sandbox/train_annotations.txt', '-ts', '/home/estorrs/pollock/benchmarking/sandbox/val.h5')' returned non-zero exit status 1.


None
snATACseq ccrcc_gene_activity (3000, 19843) (3000, 19843)
snATACseq ccrcc_motif (3000, 633) (3000, 633)


Traceback (most recent call last):
  File "<ipython-input-5-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-5-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-6-8b9bf882e9f0>", line 26, in run_actinn_workflow
    '-trl', train_annotations_fp, '-ts', val_h5_fp))
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 411, in check_output
    **kwargs).stdout
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', '/home/estorrs/pollock/benchmarking/sandbox/train.h5', '-trl', '/home/estorrs/pollock/benchmarking/sandbox/train_annotations.txt', '-ts', '/home/estorrs/pollock/benchmarking/sandbox/val.h5')' returned non-zero exit status 1.


None
snATACseq gbm_gene_activity (3389, 19891) (2876, 19891)
snATACseq gbm_motif (3393, 633) (2872, 633)


Traceback (most recent call last):
  File "<ipython-input-5-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-5-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-6-8b9bf882e9f0>", line 26, in run_actinn_workflow
    '-trl', train_annotations_fp, '-ts', val_h5_fp))
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 411, in check_output
    **kwargs).stdout
  File "/home/estorrs/miniconda3/envs/actinn/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', '/home/estorrs/pollock/benchmarking/sandbox/train.h5', '-trl', '/home/estorrs/pollock/benchmarking/sandbox/train_annotations.txt', '-ts', '/home/estorrs/pollock/benchmarking/sandbox/val.h5')' returned non-zero exit status 1.


None
snRNAseq brca (5252, 29175) (4893, 29175)
snRNAseq ccrcc (4754, 33538) (4518, 33538)
snRNAseq gbm (3722, 29748) (3577, 29748)


In [11]:
run_workflow_for_cross_disease(adata_map, run_actinn_workflow, 'actinn', RESULTS_CROSS_DISEASE_DIR)

scRNAseq brca_train_brca_val (6080, 27131) (5773, 27131)
scRNAseq brca_train_cesc_val (6080, 27131) (4277, 22928)
scRNAseq brca_train_hnscc_val (6080, 27131) (5203, 26929)
scRNAseq brca_train_melanoma_val (6080, 27131) (3532, 23452)
failed
scRNAseq brca_train_myeloma_val (6080, 27131) (3312, 24020)
failed
scRNAseq brca_train_pbmc_val (6080, 27131) (886, 32738)
scRNAseq brca_train_pdac_val (6080, 27131) (7840, 28756)
scRNAseq cesc_train_brca_val (4660, 22928) (5773, 27131)
scRNAseq cesc_train_cesc_val (4660, 22928) (4277, 22928)
scRNAseq cesc_train_hnscc_val (4660, 22928) (5203, 26929)
scRNAseq cesc_train_melanoma_val (4660, 22928) (3532, 23452)
failed
scRNAseq cesc_train_myeloma_val (4660, 22928) (3312, 24020)
failed
scRNAseq cesc_train_pbmc_val (4660, 22928) (886, 32738)
scRNAseq cesc_train_pdac_val (4660, 22928) (7840, 28756)
scRNAseq hnscc_train_brca_val (5285, 26929) (5773, 27131)
scRNAseq hnscc_train_cesc_val (5285, 26929) (4277, 22928)
scRNAseq hnscc_train_hnscc_val (5285, 26929)

###### testing stuff

In [None]:
train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()

In [None]:
# train.obs['dataset'] = ['train'] * train.shape[0]
# val.obs['dataset'] = ['val'] * val.shape[0]
# combined = train.concatenate(val)
# combined

In [None]:
train_counts_df = pd.DataFrame(data=train.X.transpose().toarray(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
val_counts_df = pd.DataFrame(data=val.X.transpose().toarray(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
train_counts_df

In [None]:
train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
train_counts_df.to_csv(train_counts_fp, sep='\t')
val_counts_df.to_csv(val_counts_fp, sep='\t')

python actinn_format.py -i input_file -o output_prefix -f format

python actinn_format.py -i ./test_data/train_set.txt.gz -o train_set -f txt


In [None]:
train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', train_counts_fp,
                        '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', val_counts_fp,
                        '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
train.obs[[CELL_TYPE_KEY]]

python actinn_predict.py -trs training_set -trl training_label -ts test_set -lr learning_rat -ne num_epoch -ms minibatch_size -pc print_cost -op output_probability


-trs Path to the training set, must be HDF5 format with key "dge".

-trl Path to the training label (the cell types for the training set), must be tab separated text file with no column and row names.

-ts Path to test sets, must be HDF5 format with key "dge".

-lr Learning rate (default: 0.0001). We can increase the learning rate if the cost drops too slow, or decrease the learning rate if the cost drops super fast in the beginning and starts to fluctuate in later epochs.

-ne Number of epochs (default: 50). The number of epochs can be determined by looking at the cost after each epoch. If the cost starts to decrease very slowly after ceartain epoch, then the "ne" parameter should be set to that epoch number.

-ms Minibatch size (default: 128). This parameter can be set larger when training a large dataset.

-pc Print cost (default: True). Whether to print cost after each 5 epochs.

-op Output probabilities for each cell being the cell types in the training data (default: False).


In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp))

In [None]:
' '.join(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp,
                        '-op', 'True'))

In [None]:
prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
prediction_df

In [None]:
df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'prediction': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
df

In [None]:
val.obs

In [None]:
df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
df = df[['cell_id', 'cell_type', 'prediction', 'probability']]
df.columns = ['cell_id', 'groundtruth', 'prediction', 'probability']
df


##### Seurat

In [None]:
def run_seurat_transfer(train, val, cell_type_key):
    # save the input data for the seurat script
    train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                        os.path.join(SANDBOX_DIR, 'val_counts.txt'))
    train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                                os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

    ## prepare train and val count matrices
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                                columns=train.obs.index)
    train_counts.index.name = ''
    # for some reason SCTransform fails if the integer values are too high, so capping them here
    cap = pow(2, 14)
    train_counts.values[train_counts.values>cap] = cap
    train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)
    
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                                columns=val.obs.index)
    val_counts.index.name = ''
    val_counts.values[val_counts.values>cap] = cap
    val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

    train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
    val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)
    
    # actually run the script and read the results back in
    prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
    try:
        subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                            val_counts_fp, val_annotations_fp, prediction_fp))
    except subprocess.CalledProcessError as e:
        print(f'called process error', e)
        return pd.DataFrame()
    
    # format the predictions dataframe
    df = pd.read_csv(prediction_fp, sep='\t')
    df.index = [x.replace('.', '-') for x in df.index]
    # also remove that weird X thing seurat sometimes puts there if first char is _
    df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
    df = pd.merge(df, val.obs, left_index=True, right_index=True)
    df['cell_id'] = df.index.to_list()
    try:
        df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
        df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
        return df
    except KeyError as e:
        print(f'key error', e)
        return pd.DataFrame()

In [None]:
SEURAT_SCRIPT = '/home/estorrs/pollock/benchmarking/tools/run_seurat_workflow.R'
run_workflow_for_datasets(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DISEASE_DIR)

In [None]:
# run_workflow_for_cross_datatype(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()
train, val = sc.read_h5ad(adata_map['scRNAseq']['brca']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
pow(2, 14)

In [None]:
# save the input data for the seurat script
train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                    os.path.join(SANDBOX_DIR, 'val_counts.txt'))
train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                            os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

## prepare train and val count matrices
X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
train_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=train.var.index,
                            columns=train.obs.index)
train_counts.index.name = ''
# for some reason SCTransform fails if the integer values are too high, so capping them here
cap = pow(2, 14)
train_counts.values[train_counts.values>cap] = cap
train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)

X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
val_counts = pd.DataFrame(data=X.transpose().astype(np.int32), index=val.var.index,
                            columns=val.obs.index)
val_counts.index.name = ''
val_counts.values[val_counts.values>cap] = cap
val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)

In [None]:
train_counts

In [None]:
train_counts

In [None]:
type(train_counts.values), type(train_counts.values[0, 0])

In [None]:
vals = sorted(set(train_counts.values.flatten()))
vals

In [None]:
vals[:10], vals[-10:]

In [None]:
train_counts.values[train_counts.values>1000] = 1000

In [None]:
np.where(train_counts>1)

In [None]:
# actually run the script and read the results back in
prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                    val_counts_fp, val_annotations_fp, prediction_fp))

In [None]:
# format the predictions dataframe
df = pd.read_csv(prediction_fp, sep='\t')
df.index = [x.replace('.', '-') for x in df.index]
# also remove that weird X thing seurat sometimes puts there
df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
df = pd.merge(df, val.obs, left_index=True, right_index=True)
df['cell_id'] = df.index.to_list()
df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
df

In [None]:
val.obs

##### SingleCellNet

In [None]:
# !pip install git+https://github.com/pcahan1/PySingleCellNet/

In [15]:
import pySingleCellNet as pySCN

In [16]:
import scipy

In [17]:
def run_SingleCellNet(train, val, cell_type_key):
#     if 'sparse' in str(type(train.X)): train.X = train.X.toarray()
    if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
    if 'sparse' in str(type(val.X)): val.X = val.X.toarray()
    
    # save the input data for the seurat script
    cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes=100, nRand=100, nTrees=1000, nTopGenePairs=100,
            dLevel=cell_type_key, stratify=True, limitToHVG=True, )
    predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)
    
    df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)
    
    df = df[['cell_type', 'SCN_class']]
    df.columns = ['groundtruth', 'predicted']
    df['cell_id'] = df.index.to_list()
    df['probability'] = [np.nan] * df.shape[0]
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]
    

    return df

In [18]:
run_workflow_for_datasets(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_DIR)

scRNAseq brca (6105, 27131) (5748, 27131)
HVG
Matrix normalized
There are  1061  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1265 top gene pairs

Finished pair transforming the data

scRNAseq cesc (4661, 22928) (4276, 22928)
HVG
Matrix normalized
There are  984  classification genes

CD4 T cell
CD8 T cell
Endothelial
Epithelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
There are 1075 top gene pairs

Finished pair transforming the data

scRNAseq hnscc (5287, 26929) (5201, 26929)
HVG
Matrix normalized
There are  1077  classification genes

B cell
CD4 T cell
CD8 T cell
Endothelial
Erythrocyte
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1067 top gene pairs

Finished pair transforming the data

scRNAseq melanoma (4218, 23452) (3517, 23452)
HVG
Matrix normalized
There are  892  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Fibroblast
Malignant
Monocyte
NK
Pl

Traceback (most recent call last):
  File "<ipython-input-12-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-12-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-17-8387889a0627>", line 9, in run_SingleCellNet
    dLevel=cell_type_key, stratify=True, limitToHVG=True, )
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/pySingleCellNet/scn_train.py", line 51, in scn_train
    sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 435, in highly_variable_genes
    flavor=flavor,
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 215, in _highly_variable_genes_single_batch
    df['mean_bin'] = pd.cut(df['means'], bin

snATACseq ccrcc_gene_activity (3000, 19843) (3000, 19843)
HVG
Matrix normalized
There are  772  classification genes

Endothelial
Epithelial
Fibroblast
Malignant
Monocyte
T cells
There are 594 top gene pairs

Finished pair transforming the data

snATACseq ccrcc_motif (3000, 633) (3000, 633)
HVG
None


Traceback (most recent call last):
  File "<ipython-input-12-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-12-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-17-8387889a0627>", line 9, in run_SingleCellNet
    dLevel=cell_type_key, stratify=True, limitToHVG=True, )
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/pySingleCellNet/scn_train.py", line 51, in scn_train
    sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 435, in highly_variable_genes
    flavor=flavor,
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 215, in _highly_variable_genes_single_batch
    df['mean_bin'] = pd.cut(df['means'], bin

snATACseq gbm_gene_activity (3389, 19891) (2876, 19891)
HVG
Matrix normalized
There are  968  classification genes

B cell
Endothelial
Fibroblast
Malignant
Microglia
Neuron
Oligodendrocytes
T cells
There are 790 top gene pairs

Finished pair transforming the data

snATACseq gbm_motif (3393, 633) (2872, 633)
HVG
None


Traceback (most recent call last):
  File "<ipython-input-12-ce2e432e20b0>", line 14, in run_workflow_for_datasets
    train, val, directory)
  File "<ipython-input-12-ce2e432e20b0>", line 64, in run_workflow
    df = workflow(train, val, CELL_TYPE_KEY)
  File "<ipython-input-17-8387889a0627>", line 9, in run_SingleCellNet
    dLevel=cell_type_key, stratify=True, limitToHVG=True, )
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/pySingleCellNet/scn_train.py", line 51, in scn_train
    sc.pp.highly_variable_genes(adNorm, min_mean=0.0125, max_mean=4, min_disp=0.5)
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 435, in highly_variable_genes
    flavor=flavor,
  File "/home/estorrs/miniconda3/envs/pollock_dev/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py", line 215, in _highly_variable_genes_single_batch
    df['mean_bin'] = pd.cut(df['means'], bin

snRNAseq brca (5252, 29175) (4893, 29175)
HVG
Matrix normalized
There are  1164  classification genes

Adipocyte
B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1243 top gene pairs

Finished pair transforming the data

snRNAseq ccrcc (4754, 33538) (4518, 33538)
HVG
Matrix normalized
There are  1112  classification genes

CD4 T cell
CD8 T cell
Dendritic
Endothelial
Epithelial
Fibroblast
Malignant
Monocyte
NK
Plasma
Treg
There are 1058 top gene pairs

Finished pair transforming the data

snRNAseq gbm (3722, 29748) (3577, 29748)
HVG
Matrix normalized
There are  1003  classification genes

B cell
Endothelial
Fibroblast
Malignant
Microglia
Monocyte
Neuron
Oligodendrocytes
T cells
There are 890 top gene pairs

Finished pair transforming the data



In [26]:
run_workflow_for_cross_disease(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DISEASE_DIR)

scRNAseq brca_train_brca_val (6080, 27131) (5773, 27131)
HVG
Matrix normalized
There are  1077  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1263 top gene pairs

Finished pair transforming the data

scRNAseq brca_train_cesc_val (6080, 27131) (4277, 22928)
HVG
Matrix normalized
There are  1077  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1263 top gene pairs

Finished pair transforming the data

scRNAseq brca_train_hnscc_val (6080, 27131) (5203, 26929)
HVG
Matrix normalized
There are  1077  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Endothelial
Erythrocyte
Fibroblast
Malignant
Mast
Monocyte
NK
Plasma
Treg
There are 1263 top gene pairs

Finished pair transforming the data

scRNAseq brca_train_melanoma_val (6080, 27131) (3532, 23452)
HVG
Matrix normalized
There are  1077  cl

There are 1068 top gene pairs

Finished pair transforming the data

scRNAseq myeloma_train_cesc_val (3617, 24020) (4277, 22928)
HVG
Matrix normalized
There are  943  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Erythrocyte
Malignant
Mast
Monocyte
NK
Plasma
Platlete
There are 1068 top gene pairs

Finished pair transforming the data

scRNAseq myeloma_train_hnscc_val (3617, 24020) (5203, 26929)
HVG
Matrix normalized
There are  943  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Erythrocyte
Malignant
Mast
Monocyte
NK
Plasma
Platlete
There are 1068 top gene pairs

Finished pair transforming the data

scRNAseq myeloma_train_melanoma_val (3617, 24020) (3532, 23452)
HVG
Matrix normalized
There are  943  classification genes

B cell
CD4 T cell
CD8 T cell
Dendritic
Erythrocyte
Malignant
Mast
Monocyte
NK
Plasma
Platlete
There are 1068 top gene pairs

Finished pair transforming the data

scRNAseq myeloma_train_myeloma_val (3617, 24020) (3312, 24020)
HVG
Matrix normali

There are 594 top gene pairs

Finished pair transforming the data

snATACseq ccrcc_gene_activity_train_brca_motif_val (3000, 19843) (3519, 633)
HVG
Matrix normalized
There are  782  classification genes

Endothelial
Epithelial
Fibroblast
Malignant
Monocyte
T cells
There are 594 top gene pairs

Finished pair transforming the data

snATACseq ccrcc_gene_activity_train_ccrcc_gene_activity_val (3000, 19843) (3000, 19843)
HVG
Matrix normalized
There are  782  classification genes

Endothelial
Epithelial
Fibroblast
Malignant
Monocyte
T cells
There are 594 top gene pairs

Finished pair transforming the data

snATACseq ccrcc_gene_activity_train_ccrcc_motif_val (3000, 19843) (3000, 633)
HVG
Matrix normalized
There are  782  classification genes

Endothelial
Epithelial
Fibroblast
Malignant
Monocyte
T cells
There are 594 top gene pairs

Finished pair transforming the data

snATACseq ccrcc_gene_activity_train_gbm_gene_activity_val (3000, 19843) (2875, 19891)
HVG
Matrix normalized
There are  782  cl

In [None]:
# run_workflow_for_cross_datatype(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
train, val = (sc.read_h5ad(adata_map['snATACseq']['brca_motif']['train']),
              sc.read_h5ad(adata_map['snATACseq']['brca_motif']['val']))

In [None]:
if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
if 'sparse' in str(type(val.X)): val.X = val.X.toarray()

In [None]:
train.var

In [None]:
len(set(train.var.index))

In [None]:
cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes = 100, nRand = 100, nTrees = 1000 ,nTopGenePairs = 100,
            dLevel = "cell_type", stratify=True, limitToHVG=True, )

In [None]:
train.shape, val.shape

In [None]:
val.obs

In [None]:
tspRF

In [None]:
val.X

In [None]:
val.X = val.X.toarray()

In [None]:
predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)


In [None]:
predictions.obs

In [None]:
df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)

df = df[['cell_type', 'SCN_class']]
df.index.name = 'cell_id'
df.columns = ['groundtruth', 'predictions']
df['probability'] = [np.nan] * df.shape[0]
df

##### MARS

In [None]:
from args_parser import get_parser
from model.mars import MARS
from model.experiment_dataset import ExperimentDataset
from sklearn.utils.linear_assignment_ import linear_assignment

In [None]:
##pulled from evaluation.py and modified to remove class thing
def hungarian_match(y_true, y_pred):
    """Matches predicted labels to original using hungarian algorithm."""
    
    y_true = adjust_range(y_true)
    y_pred = adjust_range(y_pred)
    
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # Confusion matrix.
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    ind = linear_assignment(-w)
    d = {i:j for i, j in ind}
    y_pred = np.array([d[v] for v in y_pred])
    
    return y_true, y_pred


def adjust_range(y):
    """Assures that the range of indices if from 0 to n-1."""
    y = np.array(y, dtype=np.int64)
    val_set = set(y)
    mapping = {val:i for  i,val in enumerate(val_set)}
    y = np.array([mapping[val] for val in y], dtype=np.int64)
    return y
    
    
def run_mars_workflow(train, val, cell_type_key):
    params, unknown = get_parser().parse_known_args()
    params.device = 'cpu'
    
    if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
    if 'sparse' not in str(type(val.X)): val.X = scipy.sparse.csr_matrix(val.X)
        
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names]
    val = val[:, var_names]

    train.obs['dataset'] = 'train'
    val.obs['dataset'] = 'val'

    adata = anndata.concat((train, val))

    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.scale(adata, zero_center=True, max_value=10.)

    sc.pp.neighbors(adata, n_neighbors=30, use_rep='X')
    sc.pp.pca(adata, n_comps=50)
    
    train = adata[adata.obs['dataset'] == 'train',:]
    val = adata[adata.obs['dataset'] == 'val',:]

    train = train[train.obs.sort_values(cell_type_key).index]
    val = val[val.obs.sort_values(cell_type_key).index]
    
    class_to_int = {c:i for i, c in enumerate(sorted(set(train.obs[cell_type_key])))}
    int_to_class = {i:c for c, i in class_to_int.items()}
    
    y_train = np.array([class_to_int[c] for c in train.obs[cell_type_key]], dtype=np.int64)
    annotated = ExperimentDataset(train.X.toarray(), train.obs_names, train.var_names, 'train', y_train, )
    y_val = np.array([class_to_int[c] for c in val.obs[cell_type_key]], dtype=np.int64)
    unannotated = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val', y_val)
    
    pretrain_data = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val')
    n_clusters = len(np.unique(unannotated.y))
    mars = MARS(n_clusters, params, [annotated], unannotated, pretrain_data, hid_dim_1=1000, hid_dim_2=100)
    a, landmarks, scores = mars.train(evaluation_mode=True, save_all_embeddings=True)
    
    preds = a[a.obs['experiment']=='val'].copy()
    preds.obs.index = [x.replace('-val', '') for x in preds.obs.index]
    _, adjusted = hungarian_match(preds.obs['truth_labels'], preds.obs['MARS_labels'])
    preds.obs['adjusted_predicted'] = adjusted
    preds.obs['groundtruth'] = [val.obs.loc[x, cell_type_key] for x in preds.obs.index]
    preds.obs['predicted'] = [int_to_class[x] for x in preds.obs['adjusted_predicted']]

    df = preds.obs[['groundtruth', 'predicted']]
    df['cell_id'] = df.index.to_list()
    df['probability'] = np.nan
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]

    return df

In [None]:
run_workflow_for_datasets(adata_map, run_mars_workflow, 'mars', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_mars_workflow, 'mars', RESULTS_CROSS_DISEASE_DIR)

In [None]:
# run_workflow_for_cross_datatype(adata_map, run_mars_workflow, 'mars', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
from model.mars import MARS
from model.experiment_dataset import ExperimentDataset

In [None]:
train, val = (sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']),
              sc.read_h5ad(adata_map['scRNAseq']['pbmc']['val']))

In [None]:
if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
if 'sparse' not in str(type(val.X)): val.X = scipy.sparse.csr_matrix(val.X)


In [None]:
var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

train.obs['dataset'] = 'train'
val.obs['dataset'] = 'val'

combined = train.concat(val)

sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.scale(adata, zero_center=True)

sc.pp.neighbors(adata, n_neighbors=30, use_rep='X')
sc.pp.pca(adata, n_comps=50)

In [None]:
train = adata[adata.obs['dataset'] == 'train',:]
val = adata[adata.obs['dataset'] == 'val',:]

In [None]:
y_train = np.array(train.obs['cell_type'])
annotated = ExperimentDataset(train.X.toarray(), train.obs_names, train.var_names, 'train', y_train)

In [None]:
y_val = np.array(val.obs['cell_type'])
unannotated = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val', y_val)

In [None]:
pretrain_data = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val')

In [None]:

n_clusters = len(np.unique(unannnotated.y))

In [None]:
mars = MARS(n_clusters, params, [annotated], unannnotated, pretrain_data, hid_dim_1=1000, hid_dim_2=100)

In [None]:
# return both annotated and unannotated datasets with save_all_embeddings
adata, landmarks, scores = mars.train(evaluation_mode=True, save_all_embeddings=True) # evaluation mode

In [None]:
adata.obs