In [1]:
import json
import logging
import os
import re
from pathlib import Path

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import pollock.utils as utils
from pollock.explain import explain_adata

In [5]:
# Index the single-split benchmark datasets as
#   fmap[data type][disease][partition] -> filepath
# where the data type is the parent directory name, the disease is the
# filename prefix before the first underscore, and the partition is
# 'train' or 'val'.
data_dir = '/data/pollock/benchmarking/pollock_datasets/'
fps = [
    fp for fp in utils.listfiles(data_dir, regex=r'.h5ad')
    if '_train' in fp or '_val' in fp
]

fmap = {}
for fp in fps:
    dtype = fp.rsplit('/', 2)[-2]
    disease = fp.rsplit('/', 1)[-1].partition('_')[0]
    if '_train' in fp:
        partition = 'train'
    else:
        partition = 'val'

    # The dtype/disease levels are registered even when the file itself is
    # skipped by the filter below.
    fmap.setdefault(dtype, {}).setdefault(disease, {})

    # For snATACseq, files whose path mentions 'peaks' or 'motif' are ignored.
    if dtype != 'snATACseq' or ('peaks' not in fp and 'motif' not in fp):
        fmap[dtype][disease][partition] = fp
fmap

{'snRNAseq': {'ccrcc': {'train': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/ccrcc_train.h5ad',
   'val': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/ccrcc_val.h5ad'},
  'brca': {'train': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/brca_train.h5ad',
   'val': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/brca_val.h5ad'},
  'gbm': {'val': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/gbm_val.h5ad',
   'train': '/data/pollock/benchmarking/pollock_datasets/snRNAseq/gbm_train.h5ad'}},
 'snATACseq': {'brca': {'train': '/data/pollock/benchmarking/pollock_datasets/snATACseq/brca_gene_activity_train.h5ad',
   'val': '/data/pollock/benchmarking/pollock_datasets/snATACseq/brca_gene_activity_val.h5ad'},
  'ccrcc': {'train': '/data/pollock/benchmarking/pollock_datasets/snATACseq/ccrcc_gene_activity_train.h5ad',
   'val': '/data/pollock/benchmarking/pollock_datasets/snATACseq/ccrcc_gene_activity_val.h5ad'},
  'gbm': {'val': '/data/pollock/benchmarking/p

In [6]:
# Index the cross-validation datasets as
#   folds_fmap[data type][disease][fold][partition] -> filepath
# The fold is the second-to-last underscore-separated token of the filename
# (e.g. 'ccrcc_fold3_train.h5ad' -> 'fold3').
data_dir = '/data/pollock/benchmarking/pollock_datasets_with_folds/'
fps = [
    fp for fp in utils.listfiles(data_dir, regex=r'.h5ad')
    if '_train' in fp or '_val' in fp
]

folds_fmap = {}
for fp in fps:
    dtype = fp.rsplit('/', 2)[-2]
    disease = fp.rsplit('/', 1)[-1].partition('_')[0]
    if '_train' in fp:
        partition = 'train'
    else:
        partition = 'val'
    fold = fp.rsplit('/', 1)[-1].rsplit('_', 2)[-2]

    # All three nesting levels are registered even when the file itself is
    # skipped by the filter below.
    folds_fmap.setdefault(dtype, {}).setdefault(disease, {}).setdefault(fold, {})

    # For snATACseq, files whose path mentions 'peaks' or 'motif' are ignored.
    if dtype != 'snATACseq' or ('peaks' not in fp and 'motif' not in fp):
        folds_fmap[dtype][disease][fold][partition] = fp
folds_fmap

{'snRNAseq': {'ccrcc': {'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold3_val.h5ad'},
   'fold1': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold1_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold1_train.h5ad'},
   'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold2_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold0_train.h5ad'},
   'fold4': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold4_val.h5ad',
    'train': '/data/pol

In [7]:
def get_metrics(y_true_labels, y_pred_labels, overlapping_only=False):
    """Compute a classification report and confusion matrix for label predictions.

    Parameters
    ----------
    y_true_labels : iterable
        Ground-truth label for each observation (list, array, pandas Series, ...).
    y_pred_labels : iterable
        Predicted label for each observation, aligned with ``y_true_labels``.
    overlapping_only : bool, default False
        If True, observations whose true label never occurs among the
        predicted labels are dropped before the metrics are computed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``d``: output of ``sklearn.metrics.classification_report`` as a frame
        whose index is named 'metric'.
        ``cm``: output of ``sklearn.metrics.confusion_matrix`` labelled with the
        class names on both axes, index named 'cell_type'.

    Raises
    ------
    ValueError
        If ``overlapping_only`` is True and no observation survives the filter.
    """
    # Materialise both inputs as plain lists. This matters for pandas Series:
    # `label in series` tests the *index*, not the values, which would make
    # the overlap filter below silently discard every class.
    y_true_labels = list(y_true_labels)
    y_pred_labels = list(y_pred_labels)

    if overlapping_only:
        predicted = set(y_pred_labels)
        kept = [(t, p) for t, p in zip(y_true_labels, y_pred_labels) if t in predicted]
        if not kept:
            raise ValueError('no overlapping labels between y_true_labels and y_pred_labels')
        y_true_labels = [t for t, _ in kept]
        y_pred_labels = [p for _, p in kept]

    # Use the union of both label sets so a predicted label that never occurs
    # in the ground truth gets its own row/column instead of raising.
    classes = sorted(set(y_true_labels).union(y_pred_labels))
    class_to_idx = {c: i for i, c in enumerate(classes)}

    y_pred = [class_to_idx[c] for c in y_pred_labels]
    y_true = [class_to_idx[c] for c in y_true_labels]
    label_ids = list(range(len(classes)))

    d = classification_report(y_true, y_pred, labels=label_ids, target_names=classes,
                              output_dict=True)
    d = pd.DataFrame.from_dict(d)
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    cm = pd.DataFrame(data=cm, index=classes, columns=classes)

    d.index.name = 'metric'
    cm.index.name = 'cell_type'

    return d, cm

#### benchmarking pollock

###### train modules

In [8]:
# Start from pollock's default training arguments, but on a copy:
# binding `args` directly to utils.DEFAULT_TRAIN_ARGS would make every later
# `args[...] = ...` in this notebook rewrite the library's module-level defaults.
args = dict(utils.DEFAULT_TRAIN_ARGS)
args['use_cuda'] = True
args

{'lr': 0.0001,
 'epochs': 20,
 'batch_size': 64,
 'latent_dim': 64,
 'enc_out_dim': 128,
 'middle_dim': 512,
 'kl_scaler': 0.001,
 'clf_scaler': 1.0,
 'zinb_scaler': 0.5,
 'use_cuda': True,
 'cell_type_key': 'cell_type',
 'module_filepath': './new_module'}

In [9]:
# Output directories for trained modules: one for the single train/val split,
# one for the per-fold runs. The trailing '' component keeps the trailing slash.
module_dir = os.path.join('/data/pollock/benchmarking', 'modules', '')
fold_module_dir = os.path.join('/data/pollock/benchmarking', 'modules_folds', '')

In [10]:
# Train one module per (data type, disease, fold).
# `start` is the combination to resume from: everything before it is only
# printed, and training runs from that combination onwards. With
# `start = None`, training begins at the very first combination.
start = ('scRNAseq', 'myeloma', 'fold0')
valid = False
for dtype, d1 in folds_fmap.items():
    for disease, d2 in d1.items():
        for fold, d3 in d2.items():
            print(dtype, disease, fold)

            # Latch: once the resume point has been seen, stay enabled.
            valid = valid or start is None or start == (dtype, disease, fold)
            if not valid:
                continue

            train, val = (sc.read_h5ad(d3[split]) for split in ('train', 'val'))
            args['module_filepath'] = os.path.join(fold_module_dir, f'{dtype}_{disease}_{fold}')
            utils.train_and_save_model(train, val, args)

snRNAseq ccrcc fold3
snRNAseq ccrcc fold1
snRNAseq ccrcc fold2
snRNAseq ccrcc fold0
snRNAseq ccrcc fold4
snRNAseq brca fold2
snRNAseq brca fold1
snRNAseq brca fold3
snRNAseq brca fold0
snRNAseq brca fold4
snRNAseq gbm fold4
snRNAseq gbm fold3
snRNAseq gbm fold0
snRNAseq gbm fold2
snRNAseq gbm fold1
snATACseq ccrcc fold3
snATACseq ccrcc fold1
snATACseq ccrcc fold2
snATACseq ccrcc fold0
snATACseq ccrcc fold4
snATACseq brca fold2
snATACseq brca fold1
snATACseq brca fold3
snATACseq brca fold0
snATACseq brca fold4
snATACseq gbm fold4
snATACseq gbm fold3
snATACseq gbm fold0
snATACseq gbm fold2
snATACseq gbm fold1
scRNAseq hnscc fold0
scRNAseq hnscc fold4
scRNAseq hnscc fold1
scRNAseq hnscc fold3
scRNAseq hnscc fold2
scRNAseq cesc fold4
scRNAseq cesc fold3
scRNAseq cesc fold2
scRNAseq cesc fold0
scRNAseq cesc fold1
scRNAseq brca fold2
scRNAseq brca fold1
scRNAseq brca fold3
scRNAseq brca fold0
scRNAseq brca fold4
scRNAseq myeloma fold0


2022-03-24 11:46:14,556 beginning training
2022-03-24 11:46:14,557 creating dataloaders
2022-03-24 11:46:15,386 18620 genes overlap with model after filtering
2022-03-24 11:46:15,387 1218 genes missing from dataset after filtering
2022-03-24 11:46:16,386 creating model
2022-03-24 11:46:19,200 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-03-24 11:46:19,201 fitting model
2022-03-24 11:46:21,854 epoch: 0, train loss: 2.619, val loss: 2.615, zinb loss: 0.438, kl loss: 0.622, clf loss: 2.395, time: 1.93
2022-03-24 11:46:24,379 epoch: 1, train loss: 2.609, val loss: 2.598, zinb loss: 0.417, kl loss: 1.275, clf loss: 2.388, time: 1.73
2022-03-24 11:46:26,868 epoch: 2, train loss: 2.550, val loss: 2.446, zinb loss: 0.305, kl loss: 40.402, clf loss: 2.253, time: 1.69
2022-03-24 11:46:29,341 epoch: 3, train loss: 2.267, val loss: 2.096, z

scRNAseq myeloma fold1


2022-03-24 11:47:10,998 18583 genes overlap with model after filtering
2022-03-24 11:47:10,999 1374 genes missing from dataset after filtering
2022-03-24 11:47:11,981 creating model
2022-03-24 11:47:12,266 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-03-24 11:47:12,269 fitting model
2022-03-24 11:47:14,969 epoch: 0, train loss: 2.622, val loss: 2.618, zinb loss: 0.437, kl loss: 0.505, clf loss: 2.399, time: 1.89
2022-03-24 11:47:17,577 epoch: 1, train loss: 2.612, val loss: 2.599, zinb loss: 0.416, kl loss: 1.611, clf loss: 2.390, time: 1.81
2022-03-24 11:47:20,205 epoch: 2, train loss: 2.541, val loss: 2.446, zinb loss: 0.305, kl loss: 45.416, clf loss: 2.249, time: 1.82
2022-03-24 11:47:22,786 epoch: 3, train loss: 2.280, val loss: 2.183, zinb loss: 0.217, kl loss: 74.983, clf loss: 1.999, time: 1.81
2022-03-24 11:47:25,331 ep

scRNAseq myeloma fold3


2022-03-24 11:48:04,342 18628 genes overlap with model after filtering
2022-03-24 11:48:04,343 1465 genes missing from dataset after filtering
2022-03-24 11:48:05,338 creating model
2022-03-24 11:48:05,624 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-03-24 11:48:05,625 fitting model
2022-03-24 11:48:08,078 epoch: 0, train loss: 2.616, val loss: 2.611, zinb loss: 0.437, kl loss: 0.551, clf loss: 2.392, time: 1.66
2022-03-24 11:48:10,532 epoch: 1, train loss: 2.606, val loss: 2.593, zinb loss: 0.415, kl loss: 1.240, clf loss: 2.384, time: 1.64
2022-03-24 11:48:13,051 epoch: 2, train loss: 2.541, val loss: 2.418, zinb loss: 0.289, kl loss: 53.438, clf loss: 2.220, time: 1.72
2022-03-24 11:48:15,521 epoch: 3, train loss: 2.218, val loss: 2.088, zinb loss: 0.215, kl loss: 90.727, clf loss: 1.890, time: 1.65
2022-03-24 11:48:17,989 ep

scRNAseq myeloma fold4


2022-03-24 11:48:56,152 18710 genes overlap with model after filtering
2022-03-24 11:48:56,153 1292 genes missing from dataset after filtering
2022-03-24 11:48:57,153 creating model
2022-03-24 11:48:57,437 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-03-24 11:48:57,438 fitting model
2022-03-24 11:48:59,884 epoch: 0, train loss: 2.616, val loss: 2.613, zinb loss: 0.438, kl loss: 0.678, clf loss: 2.393, time: 1.65
2022-03-24 11:49:02,335 epoch: 1, train loss: 2.606, val loss: 2.595, zinb loss: 0.416, kl loss: 1.393, clf loss: 2.385, time: 1.65
2022-03-24 11:49:04,807 epoch: 2, train loss: 2.547, val loss: 2.433, zinb loss: 0.282, kl loss: 46.827, clf loss: 2.245, time: 1.65
2022-03-24 11:49:07,250 epoch: 3, train loss: 2.217, val loss: 2.107, zinb loss: 0.217, kl loss: 93.084, clf loss: 1.906, time: 1.65
2022-03-24 11:49:09,724 ep

scRNAseq myeloma fold2


2022-03-24 11:49:47,685 18683 genes overlap with model after filtering
2022-03-24 11:49:47,686 1360 genes missing from dataset after filtering
2022-03-24 11:49:48,689 creating model
2022-03-24 11:49:48,983 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-03-24 11:49:48,984 fitting model
2022-03-24 11:49:51,490 epoch: 0, train loss: 2.614, val loss: 2.611, zinb loss: 0.436, kl loss: 0.580, clf loss: 2.392, time: 1.68
2022-03-24 11:49:53,978 epoch: 1, train loss: 2.604, val loss: 2.593, zinb loss: 0.413, kl loss: 1.449, clf loss: 2.385, time: 1.69
2022-03-24 11:49:56,419 epoch: 2, train loss: 2.546, val loss: 2.450, zinb loss: 0.290, kl loss: 45.267, clf loss: 2.260, time: 1.65
2022-03-24 11:49:58,857 epoch: 3, train loss: 2.256, val loss: 2.104, zinb loss: 0.210, kl loss: 109.272, clf loss: 1.890, time: 1.64
2022-03-24 11:50:01,298 e

scRNAseq melanoma fold3


2022-03-24 11:50:39,543 19162 genes overlap with model after filtering
2022-03-24 11:50:39,544 1543 genes missing from dataset after filtering
2022-03-24 11:50:40,521 creating model
2022-03-24 11:50:40,819 training dataset size: 4404, validation dataset size: 3331, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-03-24 11:50:40,820 fitting model
2022-03-24 11:50:43,666 epoch: 0, train loss: 2.522, val loss: 2.517, zinb loss: 0.431, kl loss: 0.609, clf loss: 2.302, time: 2.05
2022-03-24 11:50:46,497 epoch: 1, train loss: 2.508, val loss: 2.490, zinb loss: 0.400, kl loss: 2.186, clf loss: 2.288, time: 2.03
2022-03-24 11:50:49,320 epoch: 2, train loss: 2.400, val loss: 2.280, zinb loss: 0.254, kl loss: 42.356, clf loss: 2.111, time: 2.02
2022-03-24 11:50:52,144 epoch: 3, train loss: 2.156, val loss: 2.037, zinb loss: 0.196, kl loss: 73.482, clf loss: 1.866, time: 2.02
2022-03-24 11:50:54,965 epoch: 4, train

scRNAseq melanoma fold0


2022-03-24 11:51:38,610 beginning training
2022-03-24 11:51:38,611 creating dataloaders
2022-03-24 11:51:39,581 19131 genes overlap with model after filtering
2022-03-24 11:51:39,582 1544 genes missing from dataset after filtering
2022-03-24 11:51:40,562 creating model
2022-03-24 11:51:40,804 training dataset size: 4404, validation dataset size: 3331, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-03-24 11:51:40,804 fitting model
2022-03-24 11:51:43,658 epoch: 0, train loss: 2.521, val loss: 2.519, zinb loss: 0.431, kl loss: 0.586, clf loss: 2.303, time: 2.05
2022-03-24 11:51:46,513 epoch: 1, train loss: 2.509, val loss: 2.495, zinb loss: 0.401, kl loss: 1.522, clf loss: 2.293, time: 2.05
2022-03-24 11:51:49,379 epoch: 2, train loss: 2.421, val loss: 2.266, zinb loss: 0.254, kl loss: 54.657, clf loss: 2.084, time: 2.05
2022-03-24 11:51:52,230 epoch: 3, train loss: 2.137, val loss: 2.008, zinb loss: 0.1

scRNAseq melanoma fold2


2022-03-24 11:52:39,544 19149 genes overlap with model after filtering
2022-03-24 11:52:39,545 1523 genes missing from dataset after filtering
2022-03-24 11:52:40,510 creating model
2022-03-24 11:52:40,755 training dataset size: 4404, validation dataset size: 3331, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-03-24 11:52:40,756 fitting model
2022-03-24 11:52:43,626 epoch: 0, train loss: 2.522, val loss: 2.518, zinb loss: 0.431, kl loss: 0.510, clf loss: 2.302, time: 2.07
2022-03-24 11:52:46,471 epoch: 1, train loss: 2.510, val loss: 2.497, zinb loss: 0.402, kl loss: 1.357, clf loss: 2.295, time: 2.04
2022-03-24 11:52:49,298 epoch: 2, train loss: 2.428, val loss: 2.356, zinb loss: 0.257, kl loss: 36.670, clf loss: 2.191, time: 2.03
2022-03-24 11:52:52,127 epoch: 3, train loss: 2.134, val loss: 1.973, zinb loss: 0.196, kl loss: 105.428, clf loss: 1.769, time: 2.03
2022-03-24 11:52:54,966 epoch: 4, trai

scRNAseq melanoma fold1


2022-03-24 11:53:38,975 19164 genes overlap with model after filtering
2022-03-24 11:53:38,976 1529 genes missing from dataset after filtering
2022-03-24 11:53:39,941 creating model
2022-03-24 11:53:40,181 training dataset size: 4404, validation dataset size: 3331, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-03-24 11:53:40,182 fitting model
2022-03-24 11:53:43,043 epoch: 0, train loss: 2.522, val loss: 2.516, zinb loss: 0.430, kl loss: 0.575, clf loss: 2.300, time: 2.05
2022-03-24 11:53:45,885 epoch: 1, train loss: 2.510, val loss: 2.492, zinb loss: 0.402, kl loss: 2.123, clf loss: 2.289, time: 2.04
2022-03-24 11:53:48,726 epoch: 2, train loss: 2.410, val loss: 2.239, zinb loss: 0.248, kl loss: 72.436, clf loss: 2.042, time: 2.03
2022-03-24 11:53:51,577 epoch: 3, train loss: 2.083, val loss: 1.943, zinb loss: 0.194, kl loss: 103.060, clf loss: 1.743, time: 2.04
2022-03-24 11:53:54,429 epoch: 4, trai

scRNAseq melanoma fold4


2022-03-24 11:54:38,707 19103 genes overlap with model after filtering
2022-03-24 11:54:38,708 1510 genes missing from dataset after filtering
2022-03-24 11:54:39,822 creating model
2022-03-24 11:54:40,110 training dataset size: 4404, validation dataset size: 3331, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-03-24 11:54:40,112 fitting model
2022-03-24 11:54:43,013 epoch: 0, train loss: 2.521, val loss: 2.516, zinb loss: 0.432, kl loss: 0.684, clf loss: 2.299, time: 2.06
2022-03-24 11:54:45,959 epoch: 1, train loss: 2.508, val loss: 2.492, zinb loss: 0.402, kl loss: 1.690, clf loss: 2.289, time: 2.14
2022-03-24 11:54:48,827 epoch: 2, train loss: 2.419, val loss: 2.242, zinb loss: 0.238, kl loss: 64.617, clf loss: 2.058, time: 2.06
2022-03-24 11:54:51,697 epoch: 3, train loss: 2.072, val loss: 1.984, zinb loss: 0.198, kl loss: 85.038, clf loss: 1.800, time: 2.06
2022-03-24 11:54:54,551 epoch: 4, train

scRNAseq pdac fold2


2022-03-24 11:55:38,279 beginning training
2022-03-24 11:55:38,280 creating dataloaders
2022-03-24 11:55:41,053 23822 genes overlap with model after filtering
2022-03-24 11:55:41,054 1199 genes missing from dataset after filtering
2022-03-24 11:55:44,738 creating model
2022-03-24 11:55:45,142 training dataset size: 8109, validation dataset size: 7654, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-03-24 11:55:45,143 fitting model
2022-03-24 11:55:51,141 epoch: 0, train loss: 3.062, val loss: 3.056, zinb loss: 0.450, kl loss: 0.725, clf loss: 2.831, time: 4.21
2022-03-24 11:55:57,122 epoch: 1, train loss: 3.036, val loss: 2.979, zinb loss: 0.358, kl loss: 18.239, clf loss: 2.782, time: 4.20
2022-03-24 11:56:03,200 epoch: 2, train loss: 2.833, val loss: 2.692, zinb loss: 0.231, kl loss: 91.125, clf loss: 2.486, time: 4.23
2022

scRNAseq pdac fold3


2022-03-24 11:57:46,671 beginning training
2022-03-24 11:57:46,672 creating dataloaders
2022-03-24 11:57:49,596 23837 genes overlap with model after filtering
2022-03-24 11:57:49,597 1215 genes missing from dataset after filtering
2022-03-24 11:57:53,248 creating model
2022-03-24 11:57:53,649 training dataset size: 8109, validation dataset size: 7654, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-03-24 11:57:53,650 fitting model
2022-03-24 11:57:59,634 epoch: 0, train loss: 3.063, val loss: 3.058, zinb loss: 0.450, kl loss: 0.672, clf loss: 2.832, time: 4.20
2022-03-24 11:58:05,646 epoch: 1, train loss: 3.038, val loss: 2.987, zinb loss: 0.361, kl loss: 16.525, clf loss: 2.790, time: 4.22
2022-03-24 11:58:11,634 epoch: 2, train loss: 2.866, val loss: 2.737, zinb loss: 0.233, kl loss: 81.185, clf loss: 2.539, time: 4.21
2022

scRNAseq pdac fold1


2022-03-24 11:59:55,045 beginning training
2022-03-24 11:59:55,046 creating dataloaders
2022-03-24 11:59:57,994 23823 genes overlap with model after filtering
2022-03-24 11:59:57,995 1164 genes missing from dataset after filtering
2022-03-24 12:00:01,705 creating model
2022-03-24 12:00:02,103 training dataset size: 8109, validation dataset size: 7654, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-03-24 12:00:02,104 fitting model
2022-03-24 12:00:08,191 epoch: 0, train loss: 3.063, val loss: 3.059, zinb loss: 0.451, kl loss: 0.667, clf loss: 2.833, time: 4.29
2022-03-24 12:00:14,219 epoch: 1, train loss: 3.041, val loss: 3.008, zinb loss: 0.367, kl loss: 7.386, clf loss: 2.817, time: 4.22
2022-03-24 12:00:20,282 epoch: 2, train loss: 2.871, val loss: 2.652, zinb loss: 0.231, kl loss: 132.773, clf loss: 2.403, time: 4.26
2022

scRNAseq pdac fold4


2022-03-24 12:02:04,203 beginning training
2022-03-24 12:02:04,203 creating dataloaders
2022-03-24 12:02:07,197 23781 genes overlap with model after filtering
2022-03-24 12:02:07,198 1197 genes missing from dataset after filtering
2022-03-24 12:02:10,944 creating model
2022-03-24 12:02:11,344 training dataset size: 8109, validation dataset size: 7654, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-03-24 12:02:11,345 fitting model
2022-03-24 12:02:17,798 epoch: 0, train loss: 3.061, val loss: 3.056, zinb loss: 0.450, kl loss: 0.786, clf loss: 2.830, time: 4.54
2022-03-24 12:02:23,805 epoch: 1, train loss: 3.031, val loss: 2.965, zinb loss: 0.353, kl loss: 26.650, clf loss: 2.761, time: 4.25
2022-03-24 12:02:29,813 epoch: 2, train loss: 2.817, val loss: 2.703, zinb loss: 0.232, kl loss: 77.398, clf loss: 2.509, time: 4.26
2022

scRNAseq pdac fold0


2022-03-24 12:04:14,483 beginning training
2022-03-24 12:04:14,484 creating dataloaders
2022-03-24 12:04:17,459 23764 genes overlap with model after filtering
2022-03-24 12:04:17,460 1177 genes missing from dataset after filtering
2022-03-24 12:04:21,205 creating model
2022-03-24 12:04:21,613 training dataset size: 8109, validation dataset size: 7654, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-03-24 12:04:21,614 fitting model
2022-03-24 12:04:27,712 epoch: 0, train loss: 3.063, val loss: 3.058, zinb loss: 0.452, kl loss: 0.666, clf loss: 2.831, time: 4.26
2022-03-24 12:04:33,781 epoch: 1, train loss: 3.036, val loss: 2.980, zinb loss: 0.358, kl loss: 17.401, clf loss: 2.783, time: 4.26
2022-03-24 12:04:39,831 epoch: 2, train loss: 2.833, val loss: 2.672, zinb loss: 0.233, kl loss: 98.845, clf loss: 2.457, time: 4.23
2022

In [18]:
# Train one module per (data type, disease) on the original single
# train/val split; each module is saved under module_dir as '<dtype>_<disease>'.
for dtype, d1 in fmap.items():
    for disease, d2 in d1.items():
        print(dtype, disease)
        train, val = (sc.read_h5ad(d2[split]) for split in ('train', 'val'))
        args['module_filepath'] = os.path.join(module_dir, f'{dtype}_{disease}')
        utils.train_and_save_model(train, val, args)

2022-01-13 08:58:29,784 beginning training
2022-01-13 08:58:29,785 creating dataloaders


scRNAseq cesc


2022-01-13 08:58:31,104 21146 genes overlap with model after filtering
2022-01-13 08:58:31,105 703 genes missing from dataset after filtering
2022-01-13 08:58:32,794 creating model
2022-01-13 08:58:33,110 training dataset size: 4661, validation dataset size: 4276, cell types: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2022-01-13 08:58:33,111 fitting model
2022-01-13 08:58:36,634 epoch: 0, train loss: 2.667, val loss: 2.662, zinb loss: 0.532, kl loss: 0.678, clf loss: 2.395, time: 2.42
2022-01-13 08:58:40,148 epoch: 1, train loss: 2.647, val loss: 2.618, zinb loss: 0.498, kl loss: 6.720, clf loss: 2.363, time: 2.41
2022-01-13 08:58:43,653 epoch: 2, train loss: 2.471, val loss: 2.294, zinb loss: 0.327, kl loss: 94.867, clf loss: 2.035, time: 2.41
2022-01-13 08:58:47,210 epoch: 3, train loss: 2.119, val loss: 2.002, zinb loss: 0.292, kl loss: 120.842, clf loss: 1.735, time: 2.53
2022-01-13 08:58:5

scRNAseq myeloma


2022-01-13 08:59:42,591 18733 genes overlap with model after filtering
2022-01-13 08:59:42,592 1290 genes missing from dataset after filtering
2022-01-13 08:59:43,569 creating model
2022-01-13 08:59:43,856 training dataset size: 3617, validation dataset size: 3312, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Platlete']
2022-01-13 08:59:43,857 fitting model
2022-01-13 08:59:46,457 epoch: 0, train loss: 2.618, val loss: 2.616, zinb loss: 0.436, kl loss: 0.553, clf loss: 2.397, time: 1.77
2022-01-13 08:59:49,039 epoch: 1, train loss: 2.607, val loss: 2.597, zinb loss: 0.414, kl loss: 1.209, clf loss: 2.389, time: 1.75
2022-01-13 08:59:51,614 epoch: 2, train loss: 2.538, val loss: 2.446, zinb loss: 0.288, kl loss: 38.246, clf loss: 2.264, time: 1.74
2022-01-13 08:59:54,185 epoch: 3, train loss: 2.207, val loss: 2.043, zinb loss: 0.211, kl loss: 110.539, clf loss: 1.827, time: 1.74
2022-01-13 08:59:56,756 e

scRNAseq brca


2022-01-13 09:00:37,476 22285 genes overlap with model after filtering
2022-01-13 09:00:37,477 1268 genes missing from dataset after filtering
2022-01-13 09:00:39,510 creating model
2022-01-13 09:00:39,830 training dataset size: 6105, validation dataset size: 5748, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-01-13 09:00:39,831 fitting model
2022-01-13 09:00:44,430 epoch: 0, train loss: 2.784, val loss: 2.780, zinb loss: 0.428, kl loss: 0.745, clf loss: 2.565, time: 3.24
2022-01-13 09:00:49,005 epoch: 1, train loss: 2.768, val loss: 2.747, zinb loss: 0.381, kl loss: 3.507, clf loss: 2.553, time: 3.22
2022-01-13 09:00:53,606 epoch: 2, train loss: 2.633, val loss: 2.466, zinb loss: 0.232, kl loss: 80.924, clf loss: 2.269, time: 3.24
2022-01-13 09:00:58,186 epoch: 3, train loss: 2.328, val loss: 2.267, zinb loss: 0.214, kl loss: 98.384, clf loss: 2.062, time: 3.22
2

scRNAseq hnscc


2022-01-13 09:02:13,367 21915 genes overlap with model after filtering
2022-01-13 09:02:13,368 1008 genes missing from dataset after filtering
2022-01-13 09:02:15,239 creating model
2022-01-13 09:02:15,549 training dataset size: 5287, validation dataset size: 5201, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-01-13 09:02:15,550 fitting model
2022-01-13 09:02:19,527 epoch: 0, train loss: 2.639, val loss: 2.635, zinb loss: 0.474, kl loss: 0.582, clf loss: 2.398, time: 2.74
2022-01-13 09:02:23,503 epoch: 1, train loss: 2.621, val loss: 2.596, zinb loss: 0.432, kl loss: 4.969, clf loss: 2.375, time: 2.74
2022-01-13 09:02:27,474 epoch: 2, train loss: 2.461, val loss: 2.303, zinb loss: 0.269, kl loss: 86.265, clf loss: 2.082, time: 2.74
2022-01-13 09:02:31,448 epoch: 3, train loss: 2.167, val loss: 2.092, zinb loss: 0.243, kl loss: 93.755, clf loss: 1.877, time: 2.74
2022-01-13 09:02:35,428 epoc

scRNAseq pdac


2022-01-13 09:03:37,638 23831 genes overlap with model after filtering
2022-01-13 09:03:37,639 1188 genes missing from dataset after filtering
2022-01-13 09:03:40,725 creating model
2022-01-13 09:03:41,080 training dataset size: 7940, validation dataset size: 7823, cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2022-01-13 09:03:41,083 fitting model
2022-01-13 09:03:47,241 epoch: 0, train loss: 3.062, val loss: 3.056, zinb loss: 0.449, kl loss: 0.730, clf loss: 2.831, time: 4.33
2022-01-13 09:03:53,351 epoch: 1, train loss: 3.039, val loss: 3.000, zinb loss: 0.361, kl loss: 9.095, clf loss: 2.811, time: 4.28
2022-01-13 09:03:59,445 epoch: 2, train loss: 2.866, val loss: 2.699, zinb loss: 0.231, kl loss: 105.157, clf loss: 2.479, time: 4.29
2022-01-13 09:04:05,533 epoch: 3, train loss: 2.548, val loss: 2.469, zinb loss: 0.218, kl l

scRNAseq melanoma


2022-01-13 09:05:44,611 19219 genes overlap with model after filtering
2022-01-13 09:05:44,612 1385 genes missing from dataset after filtering
2022-01-13 09:05:45,630 creating model
2022-01-13 09:05:45,925 training dataset size: 4218, validation dataset size: 3517, cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2022-01-13 09:05:45,926 fitting model
2022-01-13 09:05:48,870 epoch: 0, train loss: 2.522, val loss: 2.519, zinb loss: 0.432, kl loss: 0.608, clf loss: 2.302, time: 2.06
2022-01-13 09:05:51,806 epoch: 1, train loss: 2.508, val loss: 2.488, zinb loss: 0.404, kl loss: 2.807, clf loss: 2.283, time: 2.06
2022-01-13 09:05:54,739 epoch: 2, train loss: 2.397, val loss: 2.254, zinb loss: 0.264, kl loss: 52.587, clf loss: 2.069, time: 2.05
2022-01-13 09:05:57,678 epoch: 3, train loss: 2.126, val loss: 2.013, zinb loss: 0.199, kl loss: 90.237, clf loss: 1.823, time: 2.06
2022-01-13 09:06:00,619 epoch: 4, train

In [84]:
# Train a module on the BRCA pan-immune train/val split and save it under
# the panimmune directory.
train, val = map(sc.read_h5ad, (
    '/data/pollock/panimmune/data/brca-panimmune_train.h5ad',
    '/data/pollock/panimmune/data/brca-panimmune_val.h5ad',
))
args['module_filepath'] = '/data/pollock/panimmune/brca_panimmune'
utils.train_and_save_model(train, val, args)

2022-01-13 12:26:24,370 beginning training
2022-01-13 12:26:24,371 creating dataloaders
2022-01-13 12:26:26,886 22507 genes overlap with model after filtering
2022-01-13 12:26:26,887 1442 genes missing from dataset after filtering
2022-01-13 12:26:29,271 creating model
2022-01-13 12:26:29,632 training dataset size: 7866, validation dataset size: 6303, cell types: ['B cell', 'CD4 T cell', 'CD4 T cell - activated', 'CD4 T cell - exhausted', 'CD8 T cell - CTL', 'CD8 T cell - exhausted', 'CD8 T cell - preexhausted', 'CD8 T cell - proliferating', 'Mast', 'Monocyte/Macrophage', 'NK', 'NKT', 'Plasma', 'Treg', 'cDC1', 'cDC2', 'pDC']
2022-01-13 12:26:29,633 fitting model
2022-01-13 12:26:35,498 epoch: 0, train loss: 3.069, val loss: 3.061, zinb loss: 0.457, kl loss: 0.610, clf loss: 2.832, time: 4.28
2022-01-13 12:26:41,382 epoch: 1, train loss: 3.050, val loss: 3.018, zinb loss: 0.380, kl loss: 3.009, clf loss: 2.825, time: 4.30
2022-01-13 12:26:47,245 epoch: 2, train loss: 2.930, val loss: 2.

###### generalized modules (train/val combined across diseases for each data type)

In [20]:
# Train one "generalized" module per data type by pooling the train and val
# sets of every disease. Cells annotated with a label in `exclude` are dropped.
exclude = ['Epithelial']
for dtype, d1 in fmap.items():
    print(dtype)

    # Stack the per-disease datasets row-wise, tagging each cell with its disease.
    train = val = None
    for disease, d2 in d1.items():
        t = sc.read_h5ad(d2['train'])
        v = sc.read_h5ad(d2['val'])
        t.obs['disease'] = disease
        v.obs['disease'] = disease
        train = t if train is None else anndata.concat((train, t), axis=0)
        val = v if val is None else anndata.concat((val, v), axis=0)

    # snRNAseq carries a conflicting 'T cells' annotation; drop those cells.
    if dtype == 'snRNAseq':
        train, val = (a[a.obs['cell_type'] != 'T cells'] for a in (train, val))

    # Drop the excluded cell types.
    train, val = (a[~a.obs['cell_type'].isin(exclude)] for a in (train, val))

    # Concatenating diseases can leave duplicate observation names.
    train.obs_names_make_unique()
    val.obs_names_make_unique()

    args['module_filepath'] = os.path.join(module_dir, f'{dtype}_generalized')
    utils.train_and_save_model(train, val, args)


snRNAseq


2022-01-19 09:40:01,375 beginning training
2022-01-19 09:40:01,376 creating dataloaders
2022-01-19 09:40:05,713 27237 genes overlap with model after filtering
2022-01-19 09:40:05,714 513 genes missing from dataset after filtering
2022-01-19 09:40:11,085 creating model
2022-01-19 09:40:11,498 training dataset size: 12728, validation dataset size: 11988, cell types: ['Adipocyte', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'NK', 'Neuron', 'Oligodendrocytes', 'Plasma', 'Treg']
2022-01-19 09:40:11,499 fitting model
2022-01-19 09:40:22,249 epoch: 0, train loss: 2.986, val loss: 2.979, zinb loss: 0.417, kl loss: 0.975, clf loss: 2.769, time: 7.85
2022-01-19 09:40:31,847 epoch: 1, train loss: 2.923, val loss: 2.777, zinb loss: 0.264, kl loss: 70.818, clf loss: 2.574, time: 6.88
2022-01-19 09:40:41,456 epoch: 2, train loss: 2.636, val loss: 2.488, zinb loss: 0.244, kl loss: 120.319, clf loss: 2.246, time: 6.90
20

snATACseq


Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
2022-01-19 09:43:32,571 beginning training
2022-01-19 09:43:32,571 creating dataloaders
2022-01-19 09:43:36,429 19750 genes overlap with model after filtering
2022-01-19 09:43:36,430 12 genes missing from dataset after filtering
2022-01-19 09:43:42,797 creating model
2022-01-19 09:43:43,085 training dataset size: 9465, validation dataset size: 8895, cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Microglia', 'Monocyte', 'Neuron', 'Oligodendr

scRNAseq


Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
2022-01-19 09:46:07,039 beginning training
2022-01-19 09:46:07,040 creating dataloaders
2022-01-19 09:46:13,707 17536 genes overlap with mod

###### predict validation set

In [12]:
# Root output directory: the prediction cells below write their .h5ad results
# here, either directly or into the 'interdataset_with_folds' and
# 'cross_disease' subfolders.
result_dir = '/data/pollock/benchmarking/results/pollock/'

inter-dataset with folds

In [13]:
# For every (data type, disease, fold) in folds_fmap: load the model saved
# under fold_module_dir for that combination, predict the fold's validation
# split, and write the annotated result as an .h5ad file.
fold_result_dir = os.path.join(result_dir, 'interdataset_with_folds')
# Make sure the output folder exists before the slow prediction loop starts,
# so a missing directory cannot fail the write after the work is done.
os.makedirs(fold_result_dir, exist_ok=True)

for dtype, d1 in folds_fmap.items():
    for disease, d2 in d1.items():
        for fold, d3 in d2.items():
            print(dtype, disease, fold)
            model = utils.load_model(os.path.join(fold_module_dir, f'{dtype}_{disease}_{fold}'))
            val = sc.read_h5ad(d3['val'])
            adata = utils.predict_adata(model, val)
            adata.write_h5ad(os.path.join(fold_result_dir, f'{dtype}_{disease}_{fold}.h5ad'))

snRNAseq ccrcc fold3


2022-03-24 12:11:27,946 24602 genes overlap with model after filtering
2022-03-24 12:11:27,947 1513 genes missing from dataset after filtering
2022-03-24 12:11:30,152 starting prediction of 4255 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq ccrcc fold1


2022-03-24 12:12:01,078 24613 genes overlap with model after filtering
2022-03-24 12:12:01,079 1496 genes missing from dataset after filtering
2022-03-24 12:12:02,867 starting prediction of 4255 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq ccrcc fold2


2022-03-24 12:12:18,468 24633 genes overlap with model after filtering
2022-03-24 12:12:18,469 1406 genes missing from dataset after filtering
2022-03-24 12:12:20,301 starting prediction of 4255 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq ccrcc fold0


2022-03-24 12:12:36,832 24598 genes overlap with model after filtering
2022-03-24 12:12:36,833 1535 genes missing from dataset after filtering
2022-03-24 12:12:38,752 starting prediction of 4255 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq ccrcc fold4


2022-03-24 12:12:54,154 24626 genes overlap with model after filtering
2022-03-24 12:12:54,155 1508 genes missing from dataset after filtering
2022-03-24 12:12:56,277 starting prediction of 4255 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca fold2


2022-03-24 12:13:12,449 25314 genes overlap with model after filtering
2022-03-24 12:13:12,450 1192 genes missing from dataset after filtering
2022-03-24 12:13:14,695 starting prediction of 4731 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca fold1


2022-03-24 12:13:32,132 25304 genes overlap with model after filtering
2022-03-24 12:13:32,133 1237 genes missing from dataset after filtering
2022-03-24 12:13:34,349 starting prediction of 4731 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca fold3


2022-03-24 12:13:51,657 25340 genes overlap with model after filtering
2022-03-24 12:13:51,658 1209 genes missing from dataset after filtering
2022-03-24 12:13:54,012 starting prediction of 4731 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca fold0


2022-03-24 12:14:11,570 25343 genes overlap with model after filtering
2022-03-24 12:14:11,571 1195 genes missing from dataset after filtering
2022-03-24 12:14:13,829 starting prediction of 4731 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca fold4


2022-03-24 12:14:31,013 25346 genes overlap with model after filtering
2022-03-24 12:14:31,013 1217 genes missing from dataset after filtering
2022-03-24 12:14:33,240 starting prediction of 4731 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm fold4


2022-03-24 12:14:49,849 24927 genes overlap with model after filtering
2022-03-24 12:14:49,851 1326 genes missing from dataset after filtering
2022-03-24 12:14:51,451 starting prediction of 3261 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm fold3


2022-03-24 12:15:21,659 24983 genes overlap with model after filtering
2022-03-24 12:15:21,660 1404 genes missing from dataset after filtering
2022-03-24 12:15:23,242 starting prediction of 3261 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm fold0


2022-03-24 12:15:48,431 24962 genes overlap with model after filtering
2022-03-24 12:15:48,432 1361 genes missing from dataset after filtering
2022-03-24 12:15:49,999 starting prediction of 3261 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm fold2


2022-03-24 12:16:15,275 24988 genes overlap with model after filtering
2022-03-24 12:16:15,276 1392 genes missing from dataset after filtering
2022-03-24 12:16:16,855 starting prediction of 3261 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm fold1


2022-03-24 12:16:42,506 24951 genes overlap with model after filtering
2022-03-24 12:16:42,507 1407 genes missing from dataset after filtering
2022-03-24 12:16:44,100 starting prediction of 3261 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc fold3


2022-03-24 12:17:09,252 19692 genes overlap with model after filtering
2022-03-24 12:17:09,253 35 genes missing from dataset after filtering
2022-03-24 12:17:11,422 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc fold1


2022-03-24 12:17:33,218 19680 genes overlap with model after filtering
2022-03-24 12:17:33,218 38 genes missing from dataset after filtering
2022-03-24 12:17:35,551 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc fold2


2022-03-24 12:17:57,774 19683 genes overlap with model after filtering
2022-03-24 12:17:57,775 38 genes missing from dataset after filtering
2022-03-24 12:18:00,010 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc fold0


2022-03-24 12:18:22,087 19682 genes overlap with model after filtering
2022-03-24 12:18:22,088 37 genes missing from dataset after filtering
2022-03-24 12:18:24,264 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc fold4


2022-03-24 12:18:46,599 19685 genes overlap with model after filtering
2022-03-24 12:18:46,600 47 genes missing from dataset after filtering
2022-03-24 12:18:48,921 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca fold2


2022-03-24 12:19:11,902 19694 genes overlap with model after filtering
2022-03-24 12:19:11,903 21 genes missing from dataset after filtering
2022-03-24 12:19:15,286 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca fold1


2022-03-24 12:19:44,209 19694 genes overlap with model after filtering
2022-03-24 12:19:44,210 24 genes missing from dataset after filtering
2022-03-24 12:19:47,546 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca fold3


2022-03-24 12:20:15,424 19698 genes overlap with model after filtering
2022-03-24 12:20:15,425 25 genes missing from dataset after filtering
2022-03-24 12:20:18,797 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca fold0


2022-03-24 12:20:46,958 19696 genes overlap with model after filtering
2022-03-24 12:20:46,959 22 genes missing from dataset after filtering
2022-03-24 12:20:50,260 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca fold4


2022-03-24 12:21:18,295 19689 genes overlap with model after filtering
2022-03-24 12:21:18,296 24 genes missing from dataset after filtering
2022-03-24 12:21:21,611 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm fold4


2022-03-24 12:21:49,852 19680 genes overlap with model after filtering
2022-03-24 12:21:49,853 47 genes missing from dataset after filtering
2022-03-24 12:21:52,314 starting prediction of 2820 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm fold3


2022-03-24 12:22:12,973 19686 genes overlap with model after filtering
2022-03-24 12:22:12,974 33 genes missing from dataset after filtering
2022-03-24 12:22:15,409 starting prediction of 2820 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm fold0


2022-03-24 12:22:36,000 19692 genes overlap with model after filtering
2022-03-24 12:22:36,002 35 genes missing from dataset after filtering
2022-03-24 12:22:38,523 starting prediction of 2820 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm fold2


2022-03-24 12:22:59,385 19692 genes overlap with model after filtering
2022-03-24 12:22:59,386 32 genes missing from dataset after filtering
2022-03-24 12:23:01,830 starting prediction of 2820 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm fold1


2022-03-24 12:23:22,464 19694 genes overlap with model after filtering
2022-03-24 12:23:22,465 31 genes missing from dataset after filtering
2022-03-24 12:23:25,119 starting prediction of 2820 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc fold0


2022-03-24 12:23:45,790 21815 genes overlap with model after filtering
2022-03-24 12:23:45,791 1132 genes missing from dataset after filtering
2022-03-24 12:23:47,910 starting prediction of 5098 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc fold4


2022-03-24 12:24:05,716 21953 genes overlap with model after filtering
2022-03-24 12:24:05,717 1017 genes missing from dataset after filtering
2022-03-24 12:24:07,758 starting prediction of 5098 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc fold1


2022-03-24 12:24:25,741 21876 genes overlap with model after filtering
2022-03-24 12:24:25,742 1029 genes missing from dataset after filtering
2022-03-24 12:24:27,839 starting prediction of 5098 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc fold3


2022-03-24 12:24:45,174 21868 genes overlap with model after filtering
2022-03-24 12:24:45,175 1038 genes missing from dataset after filtering
2022-03-24 12:24:47,321 starting prediction of 5098 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc fold2


2022-03-24 12:25:05,163 21879 genes overlap with model after filtering
2022-03-24 12:25:05,164 944 genes missing from dataset after filtering
2022-03-24 12:25:07,226 starting prediction of 5098 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc fold4


2022-03-24 12:25:25,315 21175 genes overlap with model after filtering
2022-03-24 12:25:25,316 652 genes missing from dataset after filtering
2022-03-24 12:25:27,262 starting prediction of 4274 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc fold3


2022-03-24 12:25:43,038 21072 genes overlap with model after filtering
2022-03-24 12:25:43,039 706 genes missing from dataset after filtering
2022-03-24 12:25:44,891 starting prediction of 4274 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc fold2


2022-03-24 12:26:00,969 21174 genes overlap with model after filtering
2022-03-24 12:26:00,970 664 genes missing from dataset after filtering
2022-03-24 12:26:02,805 starting prediction of 4274 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc fold0


2022-03-24 12:26:18,211 21151 genes overlap with model after filtering
2022-03-24 12:26:18,211 645 genes missing from dataset after filtering
2022-03-24 12:26:20,075 starting prediction of 4274 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc fold1


2022-03-24 12:26:35,723 21184 genes overlap with model after filtering
2022-03-24 12:26:35,724 667 genes missing from dataset after filtering
2022-03-24 12:26:37,418 starting prediction of 4274 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca fold2


2022-03-24 12:26:53,006 22276 genes overlap with model after filtering
2022-03-24 12:26:53,007 1435 genes missing from dataset after filtering
2022-03-24 12:26:54,996 starting prediction of 5468 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca fold1


2022-03-24 12:27:13,160 22307 genes overlap with model after filtering
2022-03-24 12:27:13,161 1398 genes missing from dataset after filtering
2022-03-24 12:27:15,168 starting prediction of 5468 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca fold3


2022-03-24 12:27:33,503 22275 genes overlap with model after filtering
2022-03-24 12:27:33,504 1441 genes missing from dataset after filtering
2022-03-24 12:27:35,547 starting prediction of 5468 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca fold0


2022-03-24 12:27:53,665 22254 genes overlap with model after filtering
2022-03-24 12:27:53,666 1370 genes missing from dataset after filtering
2022-03-24 12:27:55,801 starting prediction of 5468 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca fold4


2022-03-24 12:28:14,396 22221 genes overlap with model after filtering
2022-03-24 12:28:14,397 1367 genes missing from dataset after filtering
2022-03-24 12:28:16,587 starting prediction of 5468 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma fold0


2022-03-24 12:28:34,878 18620 genes overlap with model after filtering
2022-03-24 12:28:34,879 1218 genes missing from dataset after filtering
2022-03-24 12:28:35,977 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma fold1


2022-03-24 12:29:01,172 18583 genes overlap with model after filtering
2022-03-24 12:29:01,173 1374 genes missing from dataset after filtering
2022-03-24 12:29:02,333 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma fold3


2022-03-24 12:29:27,423 18628 genes overlap with model after filtering
2022-03-24 12:29:27,424 1465 genes missing from dataset after filtering
2022-03-24 12:29:28,555 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma fold4


2022-03-24 12:29:54,330 18710 genes overlap with model after filtering
2022-03-24 12:29:54,330 1292 genes missing from dataset after filtering
2022-03-24 12:29:55,492 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma fold2


2022-03-24 12:30:20,493 18683 genes overlap with model after filtering
2022-03-24 12:30:20,493 1360 genes missing from dataset after filtering
2022-03-24 12:30:21,608 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma fold3


2022-03-24 12:30:46,344 19162 genes overlap with model after filtering
2022-03-24 12:30:46,345 1543 genes missing from dataset after filtering
2022-03-24 12:30:47,495 starting prediction of 3331 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma fold0


2022-03-24 12:31:13,599 19131 genes overlap with model after filtering
2022-03-24 12:31:13,599 1544 genes missing from dataset after filtering
2022-03-24 12:31:14,765 starting prediction of 3331 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma fold2


2022-03-24 12:31:40,565 19149 genes overlap with model after filtering
2022-03-24 12:31:40,566 1523 genes missing from dataset after filtering
2022-03-24 12:31:41,707 starting prediction of 3331 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma fold1


2022-03-24 12:32:07,347 19164 genes overlap with model after filtering
2022-03-24 12:32:07,348 1529 genes missing from dataset after filtering
2022-03-24 12:32:08,575 starting prediction of 3331 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma fold4


2022-03-24 12:32:34,640 19103 genes overlap with model after filtering
2022-03-24 12:32:34,641 1510 genes missing from dataset after filtering
2022-03-24 12:32:35,789 starting prediction of 3331 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac fold2


2022-03-24 12:33:01,563 23822 genes overlap with model after filtering
2022-03-24 12:33:01,563 1199 genes missing from dataset after filtering
2022-03-24 12:33:06,237 starting prediction of 7654 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac fold3


2022-03-24 12:33:31,220 23837 genes overlap with model after filtering
2022-03-24 12:33:31,222 1215 genes missing from dataset after filtering
2022-03-24 12:33:34,948 starting prediction of 7654 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac fold1


2022-03-24 12:33:58,990 23823 genes overlap with model after filtering
2022-03-24 12:33:58,991 1164 genes missing from dataset after filtering
2022-03-24 12:34:05,766 starting prediction of 7654 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac fold4


2022-03-24 12:34:30,697 23781 genes overlap with model after filtering
2022-03-24 12:34:30,698 1197 genes missing from dataset after filtering
2022-03-24 12:34:37,732 starting prediction of 7654 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac fold0


2022-03-24 12:35:02,361 23764 genes overlap with model after filtering
2022-03-24 12:35:02,362 1177 genes missing from dataset after filtering
2022-03-24 12:35:09,030 starting prediction of 7654 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


inter-dataset

In [19]:
# For every (data type, disease) pair in fmap: load the model stored under
# module_dir for that pair, predict the same disease's validation split, and
# write the annotated result into result_dir.
for dtype, d1 in fmap.items():
    for disease in d1:
        d2 = d1[disease]
        print(dtype, disease)

        # model folder and output file share the same "<dtype>_<disease>" stem
        name = f'{dtype}_{disease}'
        model_path = os.path.join(module_dir, name)
        out_path = os.path.join(result_dir, name + '.h5ad')

        model = utils.load_model(model_path)
        val = sc.read_h5ad(d2['val'])
        adata = utils.predict_adata(model, val)
        adata.write_h5ad(out_path)

snRNAseq ccrcc


2022-01-13 09:07:00,902 24684 genes overlap with model after filtering
2022-01-13 09:07:00,903 1318 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca


2022-01-13 09:07:20,075 25360 genes overlap with model after filtering
2022-01-13 09:07:20,076 1183 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm


2022-01-13 09:07:41,131 25009 genes overlap with model after filtering
2022-01-13 09:07:41,132 1142 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca


2022-01-13 09:08:11,196 19699 genes overlap with model after filtering
2022-01-13 09:08:11,198 17 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc


2022-01-13 09:08:41,566 19677 genes overlap with model after filtering
2022-01-13 09:08:41,567 40 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm


2022-01-13 09:09:05,184 19698 genes overlap with model after filtering
2022-01-13 09:09:05,184 30 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc


2022-01-13 09:09:27,738 21146 genes overlap with model after filtering
2022-01-13 09:09:27,739 703 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma


2022-01-13 09:09:44,356 18733 genes overlap with model after filtering
2022-01-13 09:09:44,357 1290 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca


2022-01-13 09:10:09,843 22285 genes overlap with model after filtering
2022-01-13 09:10:09,844 1268 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc


2022-01-13 09:10:30,151 21915 genes overlap with model after filtering
2022-01-13 09:10:30,152 1008 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac


2022-01-13 09:10:49,282 23831 genes overlap with model after filtering
2022-01-13 09:10:49,283 1188 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma


2022-01-13 09:11:16,123 19219 genes overlap with model after filtering
2022-01-13 09:11:16,124 1385 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


intra-dataset

In [44]:
# Cross-disease evaluation: within each data type, the model stored for one
# disease is applied to the validation split of every disease of that data
# type (including its own), and each result is written as
# "<dtype>_<model disease>_<validation disease>.h5ad".
cross_result_dir = os.path.join(result_dir, 'cross_disease')
# Make sure the output folder exists before the slow prediction loop starts,
# so a missing directory cannot fail the write after the work is done.
os.makedirs(cross_result_dir, exist_ok=True)

for dtype, d1 in fmap.items():
    for disease, d2 in d1.items():
        # load once per source disease and reuse for every target below
        model = utils.load_model(os.path.join(module_dir, f'{dtype}_{disease}'))

        for disease_2, disease_2_d in d1.items():
            print(dtype, disease, disease_2)
            val = sc.read_h5ad(disease_2_d['val'])
            adata = utils.predict_adata(model, val)
            adata.write_h5ad(os.path.join(cross_result_dir, f'{dtype}_{disease}_{disease_2}.h5ad'))

2022-01-13 09:27:18,491 24684 genes overlap with model after filtering
2022-01-13 09:27:18,492 1318 genes missing from dataset after filtering


snRNAseq ccrcc ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:27:36,100 24698 genes overlap with model after filtering
2022-01-13 09:27:36,101 1304 genes missing from dataset after filtering


snRNAseq ccrcc brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:27:54,894 24492 genes overlap with model after filtering
2022-01-13 09:27:54,895 1510 genes missing from dataset after filtering


snRNAseq ccrcc gbm


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:28:24,609 24725 genes overlap with model after filtering
2022-01-13 09:28:24,610 1818 genes missing from dataset after filtering


snRNAseq brca ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:28:41,915 25360 genes overlap with model after filtering
2022-01-13 09:28:41,916 1183 genes missing from dataset after filtering


snRNAseq brca brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:29:00,564 24864 genes overlap with model after filtering
2022-01-13 09:29:00,564 1679 genes missing from dataset after filtering


snRNAseq brca gbm


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:29:31,395 24495 genes overlap with model after filtering
2022-01-13 09:29:31,396 1656 genes missing from dataset after filtering


snRNAseq gbm ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:29:49,896 24744 genes overlap with model after filtering
2022-01-13 09:29:49,896 1407 genes missing from dataset after filtering


snRNAseq gbm brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:30:09,512 25009 genes overlap with model after filtering
2022-01-13 09:30:09,513 1142 genes missing from dataset after filtering


snRNAseq gbm gbm


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca brca


2022-01-13 09:30:39,009 19699 genes overlap with model after filtering
2022-01-13 09:30:39,010 17 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:31:08,954 19652 genes overlap with model after filtering
2022-01-13 09:31:08,955 64 genes missing from dataset after filtering


snATACseq brca ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca gbm


2022-01-13 09:31:31,727 19678 genes overlap with model after filtering
2022-01-13 09:31:31,728 38 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc brca


2022-01-13 09:31:55,133 19664 genes overlap with model after filtering
2022-01-13 09:31:55,134 53 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:32:24,552 19677 genes overlap with model after filtering
2022-01-13 09:32:24,553 40 genes missing from dataset after filtering


snATACseq ccrcc ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:32:47,854 19667 genes overlap with model after filtering
2022-01-13 09:32:47,855 50 genes missing from dataset after filtering


snATACseq ccrcc gbm


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm brca


2022-01-13 09:33:10,374 19689 genes overlap with model after filtering
2022-01-13 09:33:10,375 39 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:33:40,713 19666 genes overlap with model after filtering
2022-01-13 09:33:40,713 62 genes missing from dataset after filtering


snATACseq gbm ccrcc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm gbm


2022-01-13 09:34:03,516 19698 genes overlap with model after filtering
2022-01-13 09:34:03,516 30 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:34:26,856 21146 genes overlap with model after filtering
2022-01-13 09:34:26,857 703 genes missing from dataset after filtering


scRNAseq cesc cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:34:42,809 16059 genes overlap with model after filtering
2022-01-13 09:34:42,810 5790 genes missing from dataset after filtering


scRNAseq cesc myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:35:08,896 20837 genes overlap with model after filtering
2022-01-13 09:35:08,897 1012 genes missing from dataset after filtering


scRNAseq cesc brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:35:28,664 20880 genes overlap with model after filtering
2022-01-13 09:35:28,665 969 genes missing from dataset after filtering


scRNAseq cesc hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc pdac


2022-01-13 09:35:48,380 21297 genes overlap with model after filtering
2022-01-13 09:35:48,381 552 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:36:13,844 17922 genes overlap with model after filtering
2022-01-13 09:36:13,845 3927 genes missing from dataset after filtering


scRNAseq cesc melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:36:42,961 16162 genes overlap with model after filtering
2022-01-13 09:36:42,962 3861 genes missing from dataset after filtering


scRNAseq myeloma cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:36:59,929 18733 genes overlap with model after filtering
2022-01-13 09:36:59,930 1290 genes missing from dataset after filtering


scRNAseq myeloma myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:37:25,412 16452 genes overlap with model after filtering
2022-01-13 09:37:25,413 3571 genes missing from dataset after filtering


scRNAseq myeloma brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:37:45,091 16448 genes overlap with model after filtering
2022-01-13 09:37:45,092 3575 genes missing from dataset after filtering


scRNAseq myeloma hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma pdac


2022-01-13 09:38:04,911 16546 genes overlap with model after filtering
2022-01-13 09:38:04,912 3477 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:38:32,005 15407 genes overlap with model after filtering
2022-01-13 09:38:32,006 4616 genes missing from dataset after filtering


scRNAseq myeloma melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:39:00,406 20766 genes overlap with model after filtering
2022-01-13 09:39:00,407 2787 genes missing from dataset after filtering


scRNAseq brca cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:39:16,921 16314 genes overlap with model after filtering
2022-01-13 09:39:16,921 7239 genes missing from dataset after filtering


scRNAseq brca myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:39:42,373 22285 genes overlap with model after filtering
2022-01-13 09:39:42,374 1268 genes missing from dataset after filtering


scRNAseq brca brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:40:03,888 21719 genes overlap with model after filtering
2022-01-13 09:40:03,888 1834 genes missing from dataset after filtering


scRNAseq brca hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca pdac


2022-01-13 09:40:22,946 22690 genes overlap with model after filtering
2022-01-13 09:40:22,947 863 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:40:48,817 18471 genes overlap with model after filtering
2022-01-13 09:40:48,818 5082 genes missing from dataset after filtering


scRNAseq brca melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:41:16,764 20762 genes overlap with model after filtering
2022-01-13 09:41:16,765 2161 genes missing from dataset after filtering


scRNAseq hnscc cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:41:34,367 16282 genes overlap with model after filtering
2022-01-13 09:41:34,367 6641 genes missing from dataset after filtering


scRNAseq hnscc myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:42:00,825 21641 genes overlap with model after filtering
2022-01-13 09:42:00,826 1282 genes missing from dataset after filtering


scRNAseq hnscc brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:42:20,868 21915 genes overlap with model after filtering
2022-01-13 09:42:20,869 1008 genes missing from dataset after filtering


scRNAseq hnscc hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc pdac


2022-01-13 09:42:38,967 22148 genes overlap with model after filtering
2022-01-13 09:42:38,968 775 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:43:07,607 18375 genes overlap with model after filtering
2022-01-13 09:43:07,608 4548 genes missing from dataset after filtering


scRNAseq hnscc melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:43:36,316 21169 genes overlap with model after filtering
2022-01-13 09:43:36,317 3850 genes missing from dataset after filtering


scRNAseq pdac cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:43:53,852 16401 genes overlap with model after filtering
2022-01-13 09:43:53,853 8618 genes missing from dataset after filtering


scRNAseq pdac myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:44:19,257 22661 genes overlap with model after filtering
2022-01-13 09:44:19,258 2358 genes missing from dataset after filtering


scRNAseq pdac brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:44:40,235 22141 genes overlap with model after filtering
2022-01-13 09:44:40,236 2878 genes missing from dataset after filtering


scRNAseq pdac hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac pdac


2022-01-13 09:44:59,606 23831 genes overlap with model after filtering
2022-01-13 09:44:59,606 1188 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:45:26,716 18673 genes overlap with model after filtering
2022-01-13 09:45:26,716 6346 genes missing from dataset after filtering


scRNAseq pdac melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:45:56,059 18112 genes overlap with model after filtering
2022-01-13 09:45:56,060 2492 genes missing from dataset after filtering


scRNAseq melanoma cesc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:46:12,621 15417 genes overlap with model after filtering
2022-01-13 09:46:12,622 5187 genes missing from dataset after filtering


scRNAseq melanoma myeloma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:46:38,592 18727 genes overlap with model after filtering
2022-01-13 09:46:38,593 1877 genes missing from dataset after filtering


scRNAseq melanoma brca


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:47:00,025 18671 genes overlap with model after filtering
2022-01-13 09:47:00,025 1933 genes missing from dataset after filtering


scRNAseq melanoma hnscc


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma pdac


2022-01-13 09:47:20,194 19022 genes overlap with model after filtering
2022-01-13 09:47:20,195 1582 genes missing from dataset after filtering
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical
2022-01-13 09:47:46,240 19219 genes overlap with model after filtering
2022-01-13 09:47:46,240 1385 genes missing from dataset after filtering


scRNAseq melanoma melanoma


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


generalized

In [22]:
# Predict every validation set with the generalized model of its data type and
# write the annotated result to <result_dir>/generalized/<dtype>_<disease>.h5ad.
for dtype, d1 in fmap.items():
    # The model path depends only on dtype, so load it once per data type
    # rather than once per disease.
    # NOTE(review): assumes utils.predict_adata does not modify the model — confirm.
    model = utils.load_model(os.path.join(module_dir, f'{dtype}_generalized'))
    for disease, d2 in d1.items():
        print(dtype, disease)
        val = sc.read_h5ad(d2['val'])
        adata = utils.predict_adata(model, val)
        adata.write_h5ad(os.path.join(result_dir, 'generalized', f'{dtype}_{disease}.h5ad'))

snRNAseq ccrcc


2022-01-19 09:59:03,352 25451 genes overlap with model after filtering
2022-01-19 09:59:03,353 2299 genes missing from dataset after filtering
2022-01-19 09:59:05,429 starting prediction of 4518 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq brca


2022-01-19 09:59:23,367 25884 genes overlap with model after filtering
2022-01-19 09:59:23,368 1866 genes missing from dataset after filtering
2022-01-19 09:59:25,567 starting prediction of 4893 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snRNAseq gbm


2022-01-19 09:59:42,905 25685 genes overlap with model after filtering
2022-01-19 09:59:42,906 2065 genes missing from dataset after filtering
2022-01-19 09:59:44,577 starting prediction of 3577 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq brca


2022-01-19 10:00:18,551 19714 genes overlap with model after filtering
2022-01-19 10:00:18,552 48 genes missing from dataset after filtering
2022-01-19 10:00:21,312 starting prediction of 3519 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq ccrcc


2022-01-19 10:00:48,533 19676 genes overlap with model after filtering
2022-01-19 10:00:48,534 86 genes missing from dataset after filtering
2022-01-19 10:00:50,441 starting prediction of 3000 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


snATACseq gbm


2022-01-19 10:01:12,548 19713 genes overlap with model after filtering
2022-01-19 10:01:12,548 49 genes missing from dataset after filtering
2022-01-19 10:01:14,659 starting prediction of 2876 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq cesc


2022-01-19 10:01:36,015 17302 genes overlap with model after filtering
2022-01-19 10:01:36,016 236 genes missing from dataset after filtering
2022-01-19 10:01:37,504 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq myeloma


2022-01-19 10:01:51,982 15469 genes overlap with model after filtering
2022-01-19 10:01:51,983 2069 genes missing from dataset after filtering
2022-01-19 10:01:52,934 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq brca


2022-01-19 10:02:17,800 17383 genes overlap with model after filtering
2022-01-19 10:02:17,801 155 genes missing from dataset after filtering
2022-01-19 10:02:19,713 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq hnscc


2022-01-19 10:02:39,810 17373 genes overlap with model after filtering
2022-01-19 10:02:39,812 165 genes missing from dataset after filtering
2022-01-19 10:02:41,371 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq pdac


2022-01-19 10:02:58,209 17488 genes overlap with model after filtering
2022-01-19 10:02:58,210 50 genes missing from dataset after filtering
2022-01-19 10:03:00,890 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


scRNAseq melanoma


2022-01-19 10:03:24,054 16748 genes overlap with model after filtering
2022-01-19 10:03:24,055 790 genes missing from dataset after filtering
2022-01-19 10:03:24,971 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


In [23]:
# hca dataset: predict it with the generalized scRNAseq model and save the
# annotated result next to the other generalized outputs.
hca_model_fp = os.path.join(module_dir, 'scRNAseq_generalized')
hca_input_fp = '/data/pollock/benchmarking/generalized_datasets/scRNAseq_hca.h5ad'
hca_output_fp = os.path.join(result_dir, 'generalized', f'scRNAseq_hca.h5ad')

model = utils.load_model(hca_model_fp)
val = sc.read_h5ad(hca_input_fp)
adata = utils.predict_adata(model, val)
adata.write_h5ad(hca_output_fp)

2022-01-19 10:03:55,051 17205 genes overlap with model after filtering
2022-01-19 10:03:55,052 333 genes missing from dataset after filtering
2022-01-19 10:04:14,013 starting prediction of 69421 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


dropout

In [79]:
# Collect the dropout benchmark files in sorted order and display the list.
# Note: this rebinds `fps`, which an earlier cell also assigns.
dropout_dir = '/data/pollock/benchmarking/dropout_datasets/'
fps = sorted(utils.listfiles(dropout_dir))
fps

['/data/pollock/benchmarking/dropout_datasets/brca_0.00.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.20.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.40.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.60.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.80.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.90.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/brca_0.95.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.00.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.20.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.40.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.60.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.80.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.90.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/cesc_0.95.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/hnscc_0.00.h5ad',
 '/data/pollock/benchmarking/dropout_datasets/hnscc_0.

In [80]:
# File names have the form <disease>_<rate>.h5ad. Each file is predicted with
# the scRNAseq model of its disease and saved to <result_dir>/dropout/.
for fp in fps:
    fname = fp.split('/')[-1]
    root = fname.replace('.h5ad', '')
    # Exactly two '_'-separated fields are expected; anything else raises here.
    disease, rate = root.split('_')
    print(disease, rate)

    model_fp = os.path.join(module_dir, f'scRNAseq_{disease}')
    model = utils.load_model(model_fp)
    val = sc.read_h5ad(fp)
    adata = utils.predict_adata(model, val)

    out_fp = os.path.join(result_dir, 'dropout', f'{disease}_{rate}.h5ad')
    adata.write_h5ad(out_fp)

brca 0.00


2022-01-13 11:36:27,992 22285 genes overlap with model after filtering
2022-01-13 11:36:27,993 1268 genes missing from dataset after filtering
2022-01-13 11:36:29,992 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.20


2022-01-13 11:36:50,125 22005 genes overlap with model after filtering
2022-01-13 11:36:50,126 1548 genes missing from dataset after filtering
2022-01-13 11:36:52,020 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.40


2022-01-13 11:37:09,274 21660 genes overlap with model after filtering
2022-01-13 11:37:09,275 1893 genes missing from dataset after filtering
2022-01-13 11:37:11,013 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.60


2022-01-13 11:37:28,874 20969 genes overlap with model after filtering
2022-01-13 11:37:28,875 2584 genes missing from dataset after filtering
2022-01-13 11:37:30,382 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.80


2022-01-13 11:37:50,437 19689 genes overlap with model after filtering
2022-01-13 11:37:50,438 3864 genes missing from dataset after filtering
2022-01-13 11:37:51,822 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.90


2022-01-13 11:38:09,603 18200 genes overlap with model after filtering
2022-01-13 11:38:09,604 5353 genes missing from dataset after filtering
2022-01-13 11:38:10,983 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca 0.95


2022-01-13 11:38:30,367 16527 genes overlap with model after filtering
2022-01-13 11:38:30,367 7026 genes missing from dataset after filtering
2022-01-13 11:38:31,829 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.00


2022-01-13 11:38:49,056 21146 genes overlap with model after filtering
2022-01-13 11:38:49,056 703 genes missing from dataset after filtering
2022-01-13 11:38:50,624 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.20


2022-01-13 11:39:06,994 20956 genes overlap with model after filtering
2022-01-13 11:39:06,995 893 genes missing from dataset after filtering
2022-01-13 11:39:08,460 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.40


2022-01-13 11:39:24,211 20686 genes overlap with model after filtering
2022-01-13 11:39:24,212 1163 genes missing from dataset after filtering
2022-01-13 11:39:25,476 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.60


2022-01-13 11:39:40,006 20143 genes overlap with model after filtering
2022-01-13 11:39:40,007 1706 genes missing from dataset after filtering
2022-01-13 11:39:41,105 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.80


2022-01-13 11:39:57,385 19124 genes overlap with model after filtering
2022-01-13 11:39:57,386 2725 genes missing from dataset after filtering
2022-01-13 11:39:58,395 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.90


2022-01-13 11:40:13,229 17895 genes overlap with model after filtering
2022-01-13 11:40:13,230 3954 genes missing from dataset after filtering
2022-01-13 11:40:14,185 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


cesc 0.95


2022-01-13 11:40:29,125 16547 genes overlap with model after filtering
2022-01-13 11:40:29,126 5302 genes missing from dataset after filtering
2022-01-13 11:40:30,099 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.00


2022-01-13 11:40:44,872 21915 genes overlap with model after filtering
2022-01-13 11:40:44,873 1008 genes missing from dataset after filtering
2022-01-13 11:40:46,684 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.20


2022-01-13 11:41:04,589 21683 genes overlap with model after filtering
2022-01-13 11:41:04,590 1240 genes missing from dataset after filtering
2022-01-13 11:41:06,240 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.40


2022-01-13 11:41:23,479 21390 genes overlap with model after filtering
2022-01-13 11:41:23,480 1533 genes missing from dataset after filtering
2022-01-13 11:41:24,954 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.60


2022-01-13 11:41:42,299 20807 genes overlap with model after filtering
2022-01-13 11:41:42,300 2116 genes missing from dataset after filtering
2022-01-13 11:41:43,688 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.80


2022-01-13 11:42:01,803 19604 genes overlap with model after filtering
2022-01-13 11:42:01,804 3319 genes missing from dataset after filtering
2022-01-13 11:42:03,066 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.90


2022-01-13 11:42:20,313 18206 genes overlap with model after filtering
2022-01-13 11:42:20,314 4717 genes missing from dataset after filtering
2022-01-13 11:42:21,472 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc 0.95


2022-01-13 11:42:39,333 16537 genes overlap with model after filtering
2022-01-13 11:42:39,333 6386 genes missing from dataset after filtering
2022-01-13 11:42:40,581 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.00


2022-01-13 11:42:57,564 19219 genes overlap with model after filtering
2022-01-13 11:42:57,565 1385 genes missing from dataset after filtering
2022-01-13 11:42:58,625 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.20


2022-01-13 11:43:25,726 18923 genes overlap with model after filtering
2022-01-13 11:43:25,727 1681 genes missing from dataset after filtering
2022-01-13 11:43:26,681 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.40


2022-01-13 11:43:54,520 18497 genes overlap with model after filtering
2022-01-13 11:43:54,521 2107 genes missing from dataset after filtering
2022-01-13 11:43:55,402 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.60


2022-01-13 11:44:21,993 17796 genes overlap with model after filtering
2022-01-13 11:44:21,994 2808 genes missing from dataset after filtering
2022-01-13 11:44:22,817 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.80


2022-01-13 11:44:50,663 16483 genes overlap with model after filtering
2022-01-13 11:44:50,664 4121 genes missing from dataset after filtering
2022-01-13 11:44:51,412 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.90


2022-01-13 11:45:18,560 14997 genes overlap with model after filtering
2022-01-13 11:45:18,561 5607 genes missing from dataset after filtering
2022-01-13 11:45:19,350 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma 0.95


2022-01-13 11:45:47,104 13395 genes overlap with model after filtering
2022-01-13 11:45:47,105 7209 genes missing from dataset after filtering
2022-01-13 11:45:47,887 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.00


2022-01-13 11:46:15,496 18733 genes overlap with model after filtering
2022-01-13 11:46:15,497 1290 genes missing from dataset after filtering
2022-01-13 11:46:16,498 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.20


2022-01-13 11:46:41,649 18441 genes overlap with model after filtering
2022-01-13 11:46:41,650 1582 genes missing from dataset after filtering
2022-01-13 11:46:42,540 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.40


2022-01-13 11:47:07,436 18059 genes overlap with model after filtering
2022-01-13 11:47:07,437 1964 genes missing from dataset after filtering
2022-01-13 11:47:08,267 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.60


2022-01-13 11:47:33,577 17431 genes overlap with model after filtering
2022-01-13 11:47:33,578 2592 genes missing from dataset after filtering
2022-01-13 11:47:34,360 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.80


2022-01-13 11:47:59,508 16217 genes overlap with model after filtering
2022-01-13 11:47:59,509 3806 genes missing from dataset after filtering
2022-01-13 11:48:00,224 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.90


2022-01-13 11:48:25,413 14878 genes overlap with model after filtering
2022-01-13 11:48:25,414 5145 genes missing from dataset after filtering
2022-01-13 11:48:26,154 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma 0.95


2022-01-13 11:48:50,289 13509 genes overlap with model after filtering
2022-01-13 11:48:50,290 6514 genes missing from dataset after filtering
2022-01-13 11:48:51,001 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.00


2022-01-13 11:49:17,470 23831 genes overlap with model after filtering
2022-01-13 11:49:17,471 1188 genes missing from dataset after filtering
2022-01-13 11:49:20,630 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.20


2022-01-13 11:49:44,569 23595 genes overlap with model after filtering
2022-01-13 11:49:44,569 1424 genes missing from dataset after filtering
2022-01-13 11:49:47,176 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.40


2022-01-13 11:50:13,745 23232 genes overlap with model after filtering
2022-01-13 11:50:13,745 1787 genes missing from dataset after filtering
2022-01-13 11:50:16,150 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.60


2022-01-13 11:50:41,109 22706 genes overlap with model after filtering
2022-01-13 11:50:41,109 2313 genes missing from dataset after filtering
2022-01-13 11:50:43,267 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.80


2022-01-13 11:51:08,898 21415 genes overlap with model after filtering
2022-01-13 11:51:08,899 3604 genes missing from dataset after filtering
2022-01-13 11:51:10,859 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.90


2022-01-13 11:51:35,187 19977 genes overlap with model after filtering
2022-01-13 11:51:35,188 5042 genes missing from dataset after filtering
2022-01-13 11:51:37,096 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac 0.95


2022-01-13 11:52:01,398 18381 genes overlap with model after filtering
2022-01-13 11:52:01,399 6638 genes missing from dataset after filtering
2022-01-13 11:52:03,364 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


panimmune

In [86]:
# Predict every scRNAseq validation set with the brca panimmune model and
# write the annotated result to /data/pollock/panimmune/results/<disease>.h5ad.
# The same model is used for all diseases, so it is loaded once before the
# loop instead of on every iteration.
# NOTE(review): assumes utils.predict_adata does not modify the model — confirm.
model = utils.load_model('/data/pollock/panimmune/brca_panimmune')
for disease, d2 in fmap['scRNAseq'].items():
    print(disease)
    val = sc.read_h5ad(d2['val'])
    adata = utils.predict_adata(model, val)
    adata.write_h5ad(f'/data/pollock/panimmune/results/{disease}.h5ad')

cesc


2022-01-13 12:31:59,418 20721 genes overlap with model after filtering
2022-01-13 12:31:59,419 3228 genes missing from dataset after filtering
2022-01-13 12:32:01,175 starting prediction of 4276 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


myeloma


2022-01-13 12:32:17,369 16340 genes overlap with model after filtering
2022-01-13 12:32:17,370 7609 genes missing from dataset after filtering
2022-01-13 12:32:18,662 starting prediction of 3312 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


brca


2022-01-13 12:32:43,381 22433 genes overlap with model after filtering
2022-01-13 12:32:43,382 1516 genes missing from dataset after filtering
2022-01-13 12:32:45,395 starting prediction of 5748 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


hnscc


2022-01-13 12:33:06,093 21699 genes overlap with model after filtering
2022-01-13 12:33:06,093 2250 genes missing from dataset after filtering
2022-01-13 12:33:07,982 starting prediction of 5201 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


pdac


2022-01-13 12:33:24,937 22875 genes overlap with model after filtering
2022-01-13 12:33:24,938 1074 genes missing from dataset after filtering
2022-01-13 12:33:27,918 starting prediction of 7823 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


melanoma


2022-01-13 12:33:54,734 18497 genes overlap with model after filtering
2022-01-13 12:33:54,735 5452 genes missing from dataset after filtering
2022-01-13 12:33:55,947 starting prediction of 3517 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


In [26]:
# Score the pan-immune validation set with the pan-immune model and persist
# the predictions next to the other pan-immune results.
model = utils.load_model('/data/pollock/panimmune/brca_panimmune')
val = sc.read_h5ad('/data/pollock/panimmune/data/brca-panimmune_val.h5ad')
adata = utils.predict_adata(model, val)
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
adata.write_h5ad('/data/pollock/panimmune/results/brca-panimmune_val.h5ad')

2022-01-19 10:50:56,739 22507 genes overlap with model after filtering
2022-01-19 10:50:56,739 1442 genes missing from dataset after filtering
2022-01-19 10:50:59,256 starting prediction of 6303 cells
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_cell_type' as categorical


###### get performance

In [14]:
# Compute classification metrics for every (data type, disease, fold)
# prediction file under <result_dir>/interdataset_with_folds.
for dtype, d1 in folds_fmap.items():
    for disease, d2 in d1.items():
        for fold in d2:
            print(dtype, disease, fold)
            adata = sc.read_h5ad(os.path.join(result_dir, 'interdataset_with_folds', f'{dtype}_{disease}_{fold}.h5ad'))
            y_pred = adata.obs['predicted_cell_type'].to_list()
            y_true = adata.obs['cell_type'].to_list()
            d, cm = get_metrics(y_true, y_pred)

            # The fold is part of the output name. Previously every fold of a
            # (dtype, disease) pair wrote to the same two files, so only the
            # last fold iterated was left on disk.
            d.to_csv(os.path.join(result_dir, 'interdataset_with_folds', f'{dtype}_{disease}_{fold}_clf_report.txt'), sep='\t')
            cm.to_csv(os.path.join(result_dir, 'interdataset_with_folds', f'{dtype}_{disease}_{fold}_confusion_matrix.txt'), sep='\t')

snRNAseq ccrcc fold3
snRNAseq ccrcc fold1
snRNAseq ccrcc fold2
snRNAseq ccrcc fold0
snRNAseq ccrcc fold4
snRNAseq brca fold2
snRNAseq brca fold1
snRNAseq brca fold3
snRNAseq brca fold0
snRNAseq brca fold4
snRNAseq gbm fold4
snRNAseq gbm fold3
snRNAseq gbm fold0
snRNAseq gbm fold2
snRNAseq gbm fold1
snATACseq ccrcc fold3
snATACseq ccrcc fold1
snATACseq ccrcc fold2
snATACseq ccrcc fold0
snATACseq ccrcc fold4
snATACseq brca fold2
snATACseq brca fold1
snATACseq brca fold3
snATACseq brca fold0
snATACseq brca fold4
snATACseq gbm fold4
snATACseq gbm fold3
snATACseq gbm fold0
snATACseq gbm fold2
snATACseq gbm fold1
scRNAseq hnscc fold0
scRNAseq hnscc fold4
scRNAseq hnscc fold1
scRNAseq hnscc fold3
scRNAseq hnscc fold2
scRNAseq cesc fold4
scRNAseq cesc fold3
scRNAseq cesc fold2
scRNAseq cesc fold0
scRNAseq cesc fold1
scRNAseq brca fold2
scRNAseq brca fold1
scRNAseq brca fold3
scRNAseq brca fold0
scRNAseq brca fold4
scRNAseq myeloma fold0
scRNAseq myeloma fold1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq myeloma fold3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq myeloma fold4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq myeloma fold2
scRNAseq melanoma fold3
scRNAseq melanoma fold0
scRNAseq melanoma fold2
scRNAseq melanoma fold1
scRNAseq melanoma fold4
scRNAseq pdac fold2
scRNAseq pdac fold3
scRNAseq pdac fold1
scRNAseq pdac fold4
scRNAseq pdac fold0


In [41]:
# Score the per-dataset predictions stored as <result_dir>/<dtype>_<disease>.h5ad
# and write the two tables returned by get_metrics next to each prediction file.
dataset_keys = [(dtype, disease) for dtype, by_disease in fmap.items() for disease in by_disease]
for dtype, disease in dataset_keys:
    print(dtype, disease)
    # Common path stem shared by the prediction file and both report files.
    stem = os.path.join(result_dir, f'{dtype}_{disease}')
    adata = sc.read_h5ad(stem + '.h5ad')
    y_true = adata.obs['cell_type'].to_list()
    y_pred = adata.obs['predicted_cell_type'].to_list()
    d, cm = get_metrics(y_true, y_pred)

    d.to_csv(stem + '_clf_report.txt', sep='\t')
    cm.to_csv(stem + '_confusion_matrix.txt', sep='\t')

snRNAseq ccrcc
snRNAseq brca
snRNAseq gbm
snATACseq brca
snATACseq ccrcc
snATACseq gbm
scRNAseq cesc
scRNAseq myeloma
scRNAseq brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq hnscc
scRNAseq pdac
scRNAseq melanoma


In [53]:
# Score the cross-disease predictions: within each data type, every ordered
# pair of diseases (a disease paired with itself included) has its own
# prediction file under <result_dir>/cross_disease.
for dtype, d1 in fmap.items():
    disease_pairs = [(first, second) for first in d1 for second in d1]
    for disease, disease_2 in disease_pairs:
        print(dtype, disease, disease_2)
        # Common path stem shared by the prediction file and both report files.
        stem = os.path.join(result_dir, 'cross_disease', f'{dtype}_{disease}_{disease_2}')
        adata = sc.read_h5ad(stem + '.h5ad')
        y_true = adata.obs['cell_type'].to_list()
        y_pred = adata.obs['predicted_cell_type'].to_list()
        # NOTE(review): overlapping_only presumably limits scoring to labels
        # present in both datasets -- confirm against get_metrics.
        d, cm = get_metrics(y_true, y_pred, overlapping_only=True)

        d.to_csv(stem + '_clf_report.txt', sep='\t')
        cm.to_csv(stem + '_confusion_matrix.txt', sep='\t')

snRNAseq ccrcc ccrcc
snRNAseq ccrcc brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq ccrcc gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq brca ccrcc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq brca brca
snRNAseq brca gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq gbm ccrcc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq gbm brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq gbm gbm
snATACseq brca brca
snATACseq brca ccrcc
snATACseq brca gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq ccrcc brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq ccrcc ccrcc
snATACseq ccrcc gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq gbm brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq gbm ccrcc
snATACseq gbm gbm
scRNAseq cesc cesc
scRNAseq cesc myeloma
scRNAseq cesc brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq cesc hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq cesc pdac
scRNAseq cesc melanoma
scRNAseq myeloma cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq myeloma myeloma
scRNAseq myeloma brca
scRNAseq myeloma hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq myeloma pdac
scRNAseq myeloma melanoma
scRNAseq brca cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq brca myeloma
scRNAseq brca brca
scRNAseq brca hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq brca pdac
scRNAseq brca melanoma
scRNAseq hnscc cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq hnscc myeloma
scRNAseq hnscc brca
scRNAseq hnscc hnscc
scRNAseq hnscc pdac
scRNAseq hnscc melanoma
scRNAseq pdac cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq pdac myeloma
scRNAseq pdac brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq pdac hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq pdac pdac
scRNAseq pdac melanoma
scRNAseq melanoma cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq melanoma myeloma
scRNAseq melanoma brca
scRNAseq melanoma hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq melanoma pdac
scRNAseq melanoma melanoma


In [24]:
# Score the predictions stored under <result_dir>/generalized, one file per
# (data type, disease), and write the get_metrics tables alongside them.
for dtype in fmap:
    for disease in fmap[dtype]:
        print(dtype, disease)
        # Common path stem shared by the prediction file and both report files.
        stem = os.path.join(result_dir, 'generalized', f'{dtype}_{disease}')
        adata = sc.read_h5ad(stem + '.h5ad')
        y_true = adata.obs['cell_type'].to_list()
        y_pred = adata.obs['predicted_cell_type'].to_list()
        d, cm = get_metrics(y_true, y_pred, overlapping_only=True)

        d.to_csv(stem + '_clf_report.txt', sep='\t')
        cm.to_csv(stem + '_confusion_matrix.txt', sep='\t')

snRNAseq ccrcc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snRNAseq gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


snATACseq ccrcc
snATACseq gbm


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq cesc
scRNAseq myeloma


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


scRNAseq pdac
scRNAseq melanoma


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Score the scRNAseq_hca predictions stored under <result_dir>/generalized.
# The path stem is built once and reused for the input and both outputs; the
# file names are plain literals because they contain no placeholders.
hca_stem = os.path.join(result_dir, 'generalized', 'scRNAseq_hca')
adata = sc.read_h5ad(hca_stem + '.h5ad')
y_pred = adata.obs['predicted_cell_type'].to_list()
y_true = adata.obs['cell_type'].to_list()
d, cm = get_metrics(y_true, y_pred, overlapping_only=True)

d.to_csv(hca_stem + '_clf_report.txt', sep='\t')
cm.to_csv(hca_stem + '_confusion_matrix.txt', sep='\t')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
# Score the dropout experiment: one prediction file per (disease, dropout rate),
# named <disease>_<rate>.h5ad.
# Only .h5ad files are listed. The reports written below may land in this same
# directory, and without the filter a re-run would try to parse and load those
# .txt files as prediction inputs.
fps = sorted(utils.listfiles('/data/pollock/benchmarking/results/pollock/dropout', regex=r'\.h5ad$'))
for fp in fps:
    root = fp.split('/')[-1].replace('.h5ad', '')
    # Split on the last underscore only, so a disease name containing '_' still parses.
    disease, rate = root.rsplit('_', 1)
    print(disease, rate)
    adata = sc.read_h5ad(fp)
    y_pred = adata.obs['predicted_cell_type'].to_list()
    y_true = adata.obs['cell_type'].to_list()
    d, cm = get_metrics(y_true, y_pred)
    d.to_csv(os.path.join(result_dir, 'dropout', f'{disease}_{rate}_clf_report.txt'), sep='\t')
    cm.to_csv(os.path.join(result_dir, 'dropout', f'{disease}_{rate}_confusion_matrix.txt'), sep='\t')

brca 0.00
brca 0.20
brca 0.40
brca 0.60
brca 0.80
brca 0.90
brca 0.95
cesc 0.00
cesc 0.20
cesc 0.40
cesc 0.60
cesc 0.80
cesc 0.90
cesc 0.95
hnscc 0.00
hnscc 0.20
hnscc 0.40
hnscc 0.60
hnscc 0.80
hnscc 0.90
hnscc 0.95
melanoma 0.00
melanoma 0.20
melanoma 0.40
melanoma 0.60
melanoma 0.80
melanoma 0.90
melanoma 0.95
myeloma 0.00


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


myeloma 0.20
myeloma 0.40


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


myeloma 0.60
myeloma 0.80


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


myeloma 0.90
myeloma 0.95


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


pdac 0.00
pdac 0.20
pdac 0.40
pdac 0.60
pdac 0.80
pdac 0.90
pdac 0.95


In [29]:
# Score every pan-immune prediction file and write the get_metrics tables
# into the same results directory, named after the prediction file.
panimmune_results = '/data/pollock/panimmune/results'
fps = sorted(utils.listfiles(panimmune_results, regex='.h5ad$'))
for fp in fps:
    # File name without directory or extension, e.g. 'brca' or 'brca-panimmune_val'.
    disease = os.path.basename(fp).replace('.h5ad', '')
    print(disease)
    adata = sc.read_h5ad(fp)
    y_true = adata.obs['cell_type'].to_list()
    y_pred = adata.obs['predicted_cell_type'].to_list()
    d, cm = get_metrics(y_true, y_pred, overlapping_only=True)
    d.to_csv(os.path.join(panimmune_results, f'{disease}_clf_report.txt'), sep='\t')
    cm.to_csv(os.path.join(panimmune_results, f'{disease}_confusion_matrix.txt'), sep='\t')

brca-panimmune_val
brca


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


cesc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


hnscc


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


melanoma


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


myeloma
pdac


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### explain 

In [60]:
# Inputs for the explain section: the pan-immune model and its validation set.
model = utils.load_model('/data/pollock/panimmune/brca_panimmune')
# Alternative model directory, kept for reference (disabled):
# model = utils.load_model('/data/pollock/panimmune/brca_panimmune_no_zinb/')
val = sc.read_h5ad('/data/pollock/panimmune/data/brca-panimmune_val.h5ad')

In [61]:
# Pick the cells used as the explanation background. The second value returned
# by get_splits is discarded. Per the recorded outputs this yields 10 cells for
# each of the 17 cell types (170 ids in total).
background_ids, _ = utils.get_splits(val, 'cell_type', 10, oversample=False, split=1.)
len(background_ids)

170

In [62]:
# NOTE(review): this import lives mid-notebook; consider moving it to the
# import cell at the top so a fresh-kernel run does not depend on this cell.
from collections import Counter
# Sanity check: number of selected background cells per cell type, most common first.
Counter(val[background_ids].obs['cell_type']).most_common()

[('Monocyte/Macrophage', 10),
 ('Treg', 10),
 ('CD8 T cell - preexhausted', 10),
 ('CD4 T cell', 10),
 ('CD8 T cell - CTL', 10),
 ('CD8 T cell - exhausted', 10),
 ('NKT', 10),
 ('CD4 T cell - activated', 10),
 ('NK', 10),
 ('cDC2', 10),
 ('B cell', 10),
 ('Plasma', 10),
 ('cDC1', 10),
 ('Mast', 10),
 ('CD4 T cell - exhausted', 10),
 ('pDC', 10),
 ('CD8 T cell - proliferating', 10)]

In [63]:
# Subset of `val` restricted to the selected background cells.
background = val[background_ids]

In [64]:
# Pick the cells to explain, requesting 500 per cell type. The second value
# returned by get_splits is discarded. Per the recorded outputs, cell types
# with fewer than 500 cells contribute fewer (e.g. 54 for cDC1).
explain_ids, _ = utils.get_splits(val, 'cell_type', 500, oversample=False, split=1.)
len(explain_ids)

6303

In [65]:
# Sanity check: number of cells selected for explanation per cell type, most common first.
Counter(val[explain_ids].obs['cell_type']).most_common()

[('Monocyte/Macrophage', 500),
 ('Treg', 500),
 ('CD8 T cell - preexhausted', 500),
 ('CD4 T cell', 500),
 ('CD8 T cell - exhausted', 500),
 ('NKT', 500),
 ('CD4 T cell - activated', 500),
 ('cDC2', 500),
 ('B cell', 500),
 ('Plasma', 500),
 ('CD4 T cell - exhausted', 500),
 ('Mast', 255),
 ('CD8 T cell - CTL', 205),
 ('NK', 106),
 ('CD8 T cell - proliferating', 106),
 ('pDC', 77),
 ('cDC1', 54)]

In [66]:
# Subset of `val` restricted to the cells selected for explanation; the
# displayed value is its shape.
explain = val[explain_ids]
explain.shape

(6303, 27131)

In [67]:
# Run explain_adata once per cell type, using that cell type's cells from
# `explain` and the shared `background`, and collect the returned tables in
# `explain_dfs` keyed by cell type.
# NOTE(review): in the recorded output background.shape is (170, 27131) before
# the first call and (170, 15420) afterwards, which suggests explain_adata
# modifies `background` in place -- confirm this is intended.
cell_types = sorted(set(explain.obs['cell_type']))
explain_dfs = {}
for cell_type in cell_types:
    is_cell_type = explain.obs['cell_type'] == cell_type
    f = explain[is_cell_type]

    print(cell_type, f.shape, background.shape)
    explain_dfs[cell_type] = explain_adata(model, f, background, cell_type, method='integrated')

Trying to set attribute `.var` of view, copying.
2022-01-19 11:46:53,423 16301 genes overlap with model after filtering
2022-01-19 11:46:53,424 7648 genes missing from dataset after filtering


B cell (500, 27131) (170, 27131)


Trying to set attribute `.var` of view, copying.
2022-01-19 11:46:53,794 15375 genes overlap with model after filtering
2022-01-19 11:46:53,795 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:04,045 15957 genes overlap with model after filtering
2022-01-19 11:47:04,046 7992 genes missing from dataset after filtering


CD4 T cell (500, 27131) (170, 15420)


2022-01-19 11:47:04,319 15375 genes overlap with model after filtering
2022-01-19 11:47:04,320 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:13,248 15273 genes overlap with model after filtering
2022-01-19 11:47:13,248 8676 genes missing from dataset after filtering


CD4 T cell - activated (500, 27131) (170, 15420)


2022-01-19 11:47:13,524 15375 genes overlap with model after filtering
2022-01-19 11:47:13,525 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:23,632 16185 genes overlap with model after filtering
2022-01-19 11:47:23,632 7764 genes missing from dataset after filtering


CD4 T cell - exhausted (500, 27131) (170, 15420)


2022-01-19 11:47:23,920 15375 genes overlap with model after filtering
2022-01-19 11:47:23,921 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:33,211 13489 genes overlap with model after filtering
2022-01-19 11:47:33,212 10460 genes missing from dataset after filtering


CD8 T cell - CTL (205, 27131) (170, 15420)


2022-01-19 11:47:33,387 15375 genes overlap with model after filtering
2022-01-19 11:47:33,387 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:36,691 16169 genes overlap with model after filtering
2022-01-19 11:47:36,691 7780 genes missing from dataset after filtering


CD8 T cell - exhausted (500, 27131) (170, 15420)


2022-01-19 11:47:36,964 15375 genes overlap with model after filtering
2022-01-19 11:47:36,965 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:46,647 15683 genes overlap with model after filtering
2022-01-19 11:47:46,648 8266 genes missing from dataset after filtering


CD8 T cell - preexhausted (500, 27131) (170, 15420)


2022-01-19 11:47:46,920 15375 genes overlap with model after filtering
2022-01-19 11:47:46,921 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:56,883 14091 genes overlap with model after filtering
2022-01-19 11:47:56,883 9858 genes missing from dataset after filtering
2022-01-19 11:47:57,030 15375 genes overlap with model after filtering
2022-01-19 11:47:57,031 8574 genes missing from dataset after filtering


CD8 T cell - proliferating (106, 27131) (170, 15420)


Trying to set attribute `.var` of view, copying.
2022-01-19 11:47:58,720 14497 genes overlap with model after filtering
2022-01-19 11:47:58,721 9452 genes missing from dataset after filtering


Mast (255, 27131) (170, 15420)


2022-01-19 11:47:58,908 15375 genes overlap with model after filtering
2022-01-19 11:47:58,909 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:03,562 18093 genes overlap with model after filtering
2022-01-19 11:48:03,563 5856 genes missing from dataset after filtering


Monocyte/Macrophage (500, 27131) (170, 15420)


2022-01-19 11:48:03,877 15375 genes overlap with model after filtering
2022-01-19 11:48:03,878 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:13,577 12215 genes overlap with model after filtering
2022-01-19 11:48:13,577 11734 genes missing from dataset after filtering
2022-01-19 11:48:13,708 15375 genes overlap with model after filtering
2022-01-19 11:48:13,709 8574 genes missing from dataset after filtering


NK (106, 27131) (170, 15420)


Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:15,584 16535 genes overlap with model after filtering
2022-01-19 11:48:15,584 7414 genes missing from dataset after filtering


NKT (500, 27131) (170, 15420)


2022-01-19 11:48:15,866 15375 genes overlap with model after filtering
2022-01-19 11:48:15,867 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:25,500 17351 genes overlap with model after filtering
2022-01-19 11:48:25,501 6598 genes missing from dataset after filtering


Plasma (500, 27131) (170, 15420)


2022-01-19 11:48:25,759 15375 genes overlap with model after filtering
2022-01-19 11:48:25,760 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:34,815 16003 genes overlap with model after filtering
2022-01-19 11:48:34,816 7946 genes missing from dataset after filtering


Treg (500, 27131) (170, 15420)


2022-01-19 11:48:35,140 15375 genes overlap with model after filtering
2022-01-19 11:48:35,141 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:45,560 12781 genes overlap with model after filtering
2022-01-19 11:48:45,560 11168 genes missing from dataset after filtering
2022-01-19 11:48:45,671 15375 genes overlap with model after filtering
2022-01-19 11:48:45,672 8574 genes missing from dataset after filtering


cDC1 (54, 27131) (170, 15420)


Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:46,624 18212 genes overlap with model after filtering
2022-01-19 11:48:46,624 5737 genes missing from dataset after filtering


cDC2 (500, 27131) (170, 15420)


2022-01-19 11:48:46,951 15375 genes overlap with model after filtering
2022-01-19 11:48:46,951 8574 genes missing from dataset after filtering
Trying to set attribute `.var` of view, copying.
2022-01-19 11:48:57,011 13097 genes overlap with model after filtering
2022-01-19 11:48:57,011 10852 genes missing from dataset after filtering
2022-01-19 11:48:57,136 15375 genes overlap with model after filtering
2022-01-19 11:48:57,137 8574 genes missing from dataset after filtering


pDC (77, 27131) (170, 15420)


In [68]:
# Stack the per-cell-type tables from `explain_dfs` into a single frame, adding
# a 'cell_type' column that records which table each row came from.
# - `assign` labels a copy, so the frames stored in `explain_dfs` are not
#   modified and the cell is safe to re-run.
# - A single concat over all frames replaces growing the result one frame at
#   a time, which re-copied the accumulated rows on every iteration.
# `explain_df` stays None when there is nothing to stack, as before.
explain_df = (
    pd.concat([df.assign(cell_type=k) for k, df in explain_dfs.items()], axis=0)
    if explain_dfs
    else None
)
explain_df

Unnamed: 0,AL627309.1,AL627309.3,AL627309.4,AL669831.2,AL669831.5,FAM87B,LINC00115,FAM41C,AL645608.3,AL645608.1,...,AC011043.1,AL592183.1,AC007325.4,AC007325.2,AL354822.1,AC004556.1,AC233755.2,AC233755.1,AC240274.1,cell_type
HT065B1_S1H1_AGGGCCTCAAATTGCC-1,-0.0,1.403170e-06,-0.0,-0.0,-0.000011,0.000003,-0.000004,-0.000006,0.0,-0.0,...,6.613822e-14,0.000001,-7.385790e-14,-0.0,-6.693868e-06,-3.290153e-05,-0.0,-0.000002,0.000009,B cell
HT077B1_S1H3_TTCTTGACAGCGGTCT-1,-0.0,-9.009753e-04,-0.0,0.0,-0.000023,0.000016,0.000007,-0.000013,0.0,-0.0,...,-3.801396e-12,-0.000052,-6.576878e-13,-0.0,-2.108621e-05,-6.721157e-05,-0.0,-0.000022,0.000026,B cell
HT077B1_S1H1_GCCGTGAGTGGATCAG-1,0.0,8.585659e-07,-0.0,0.0,-0.000005,0.000010,-0.000002,-0.000010,0.0,-0.0,...,-1.370780e-12,-0.000001,-2.680380e-13,0.0,-1.950330e-05,-4.171603e-05,-0.0,-0.000013,0.000018,B cell
HT105B1_S1H1_TTCCTAAGTGTAGCAG-1,-0.0,6.672578e-06,-0.0,-0.0,-0.000017,0.000012,0.000009,-0.000012,0.0,-0.0,...,-1.753536e-12,-0.000029,-3.543450e-12,-0.0,-2.367532e-05,-3.624754e-05,-0.0,-0.000010,0.000020,B cell
HT067B1_S1H2_GCTGCAGAGTTACGTC-1,-0.0,2.470283e-06,0.0,0.0,-0.000002,0.000006,-0.000007,-0.000013,0.0,-0.0,...,-1.659743e-12,0.000456,-5.032971e-13,-0.0,-1.438353e-05,-3.111664e-05,0.0,-0.000010,0.000010,B cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT171B1_S1H8_AAGAACAGTACGCGTC-1,-0.0,0.000000e+00,-0.0,0.0,0.000008,-0.000000,-0.000016,0.000021,0.0,0.0,...,1.990896e-12,0.000016,-1.422592e-05,0.0,3.783021e-12,2.769595e-06,0.0,-0.000000,0.000000,pDC
HT105B1_N1K1_CAGCCAGAGATCCAAA-1,-0.0,0.000000e+00,-0.0,0.0,0.000003,0.000000,-0.000013,0.000009,0.0,0.0,...,1.099713e-12,0.000022,-2.002442e-05,0.0,3.098553e-12,4.109980e-06,0.0,-0.000000,0.000000,pDC
HT065B1_S1H7_ACAGCCGAGCGTATAA-1,-0.0,-0.000000e+00,0.0,0.0,0.000016,-0.000000,-0.000014,0.000034,0.0,0.0,...,1.120236e-12,0.000045,-1.661529e-05,0.0,6.668974e-12,-1.991830e-05,0.0,-0.000000,0.000000,pDC
HT171B1_S1H8_TGTTCATTCATTGTTC-1,-0.0,-0.000000e+00,-0.0,0.0,-0.000010,-0.000000,-0.000008,0.000019,0.0,0.0,...,4.069212e-13,0.000023,-1.286675e-05,0.0,3.695494e-12,-3.767527e-06,0.0,-0.000000,0.000000,pDC


In [80]:
# Persist the stacked explain table as a tab-separated file (index included).
explain_df.to_csv('/data/pollock/panimmune/results/explain_integrated.txt', sep='\t')