In [1]:
from pathlib import Path
from collections import Counter
import os
import re
import random
import subprocess

import anndata
import scanpy as sc
import pandas as pd
import numpy as np
import scipy

import mgitools.os_helpers as os_helpers

In [2]:
# !pip install git+https://github.com/estorrs/mgitools
# !pip install tensorflow==2.1.0

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [5]:
# !pip install -e /home/estorrs/pollock/
import pollock
from pollock.models.model import PollockDataset, PollockModel, load_from_directory, predict_from_anndata

In [6]:
# !conda install -y scanpy

In [7]:
# !pip install git+https://github.com/estorrs/mgitools

In [20]:
# Name of the `adata.obs` column that is read as the cell type label.
CELL_TYPE_KEY = 'cell_type'
# Per-cell-type target size handed to the train/validation splitter.
N_PER_CELL_TYPE = 500

# Input data root and the output roots of the three benchmarking modes.
DATA_DIR = '/home/estorrs/pollock/benchmarking/data/11302020_harmonized/teir_1/'
RESULTS_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1/'
RESULTS_CROSS_DISEASE_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1_cross_disease'
RESULTS_CROSS_DTYPE_DIR = '/home/estorrs/pollock/benchmarking/results/11302020_teir1_cross_datatype'
SANDBOX_DIR = '/home/estorrs/pollock/benchmarking/sandbox'

# Make sure every results directory exists before any workflow writes to it.
for results_dir in (RESULTS_DIR, RESULTS_CROSS_DISEASE_DIR, RESULTS_CROSS_DTYPE_DIR):
    Path(results_dir).mkdir(parents=True, exist_ok=True)

##### create training and validation datasets

Only run this section if the `*_train.h5ad` / `*_val.h5ad` files have not been created yet.

In [None]:
def cap_list(ls, n=100, split=.8, oversample=True):
    """
    Draw a random, size-capped selection of items from a pool.

    Parameters
    ----------
    ls : list
        Pool of items to draw from.
    n : int
        Target number of items to return.
    split : float
        Fraction of the pool that may be drawn from. The usable pool size is
        ``int(len(ls) * split)``.
    oversample : bool
        Whether small pools are padded up to ``n`` items by sampling with
        replacement.

    Returns
    -------
    list
        - a pool of 0 or 1 items is returned unchanged (the same list object)
        - if the usable pool size is greater than ``n``: ``n`` distinct items
          sampled without replacement from the whole pool
        - otherwise, if ``oversample``: ``n`` items drawn with replacement
          from a random subset of the usable pool size (duplicates expected)
        - otherwise: a random subset of the usable pool size
    """
    # nothing to sample from a pool of 0 or 1 items
    if len(ls) <= 1:
        return ls

    cap = int(len(ls) * split)
    if cap > n:
        return random.sample(ls, n)

    if oversample:
        # A small `split` can round the usable pool size down to zero, and
        # random.choices raises IndexError on an empty population. Keep at
        # least one item so the oversampling step always has something to draw.
        pool = random.sample(ls, max(cap, 1))
        return random.choices(pool, k=n)

    return random.sample(ls, cap)

def balancedish_training_generator(adata, cell_type_key, n_per_cell_type, oversample=True, split=.8):
    """
    Split `adata` into a roughly class-balanced training set and a validation set.

    Cells are grouped by `adata.obs[cell_type_key]`, each group is passed
    through `cap_list` (with `n_per_cell_type`, `oversample` and `split`), and
    the selected cells become the training set. Every cell that was not
    selected goes to the validation set.

    Returns
    -------
    (train_adata, val_adata) : both obtained by row-indexing `adata`.
    """
    # bucket the cell ids by their annotated cell type
    ids_by_type = {}
    for cell_id, cell_type in zip(adata.obs.index, adata.obs[cell_type_key]):
        ids_by_type.setdefault(cell_type, []).append(cell_id)

    # cap (and optionally oversample) each bucket
    selected = {}
    for cell_type, ids in ids_by_type.items():
        selected[cell_type] = cap_list(ids, n_per_cell_type, oversample=oversample, split=split)

    train_ids = np.asarray([cell_id for ids in selected.values() for cell_id in ids])

    # NOTE(review): membership is resolved through a boolean mask, so an id that
    # was drawn several times by oversampling still selects its row only once.
    all_idxs = np.arange(adata.shape[0])
    is_train = np.isin(np.asarray(adata.obs.index), train_ids)
    train_idxs = all_idxs[is_train]
    # everything that is not a training row is a validation row
    val_idxs = np.delete(all_idxs, train_idxs)

    return adata[train_idxs, :], adata[val_idxs, :]

# def create_train_val_datasets(adata, cell_type_key, oversample=True):
#     counts = Counter(adata.obs[cell_type_key])
#     min_count = counts.most_common()[-1][1]
#     n_per_cell_type = max(min_count, )
#     train_adata, val_adata = balancedish_training_generator(adata, cell_type_key,
#                                                             n_per_cell_type, oversample=oversample)
#     return train_adata, val_adata

In [None]:
# Map data type (parent directory name) -> disease (file stem) -> raw .h5ad path.
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
fp_map = {fp.split('/')[-2]:{} for fp in fps}
for fp in fps:
    # Skip the split files written by a previous run of the next cell. They are
    # named `<disease>_train.h5ad` / `<disease>_val.h5ad`, so the suffix has to
    # be tested on the end of the path; otherwise a re-run would split the
    # splits again and write `*_train_train.h5ad` etc.
    if fp.endswith(('_train.h5ad', '_val.h5ad')):
        continue
    dtype = fp.split('/')[-2]
    disease = fp.split('/')[-1].replace('.h5ad', '')
    fp_map[dtype][disease] = fp
fp_map

In [None]:
# Write a `_train.h5ad` and a `_val.h5ad` file next to every raw dataset.
for dtype, d in fp_map.items():
    for disease, fp in d.items():
        print(dtype, disease)
        adata = sc.read_h5ad(fp)
        # every dataset must carry the cell type annotation
        if CELL_TYPE_KEY not in adata.obs:
            raise RuntimeError(f'{CELL_TYPE_KEY} not in {fp}')

        train_adata, val_adata = balancedish_training_generator(
            adata, CELL_TYPE_KEY, N_PER_CELL_TYPE)
        # resample validation data to make dataset smaller while keeping rare cell types
        val_adata, _ = balancedish_training_generator(
            val_adata, CELL_TYPE_KEY, 500, oversample=False, split=1.)

        train_fp = fp.replace('.h5ad', '_train.h5ad')
        val_fp = fp.replace('.h5ad', '_val.h5ad')
        train_adata.write_h5ad(train_fp)
        val_adata.write_h5ad(val_fp)
        

##### load in training and validation datasets

In [9]:
# Map data type (parent directory name) -> disease -> {'train': path, 'val': path}.
fps = sorted(os_helpers.listfiles(DATA_DIR, regex='.h5ad$'))
adata_map = {fp.split('/')[-2]:{} for fp in fps}
# Only files named `<disease>_train.h5ad` / `<disease>_val.h5ad` are split files.
# The pattern is applied to the file name alone, so a directory name that happens
# to contain "train.h5ad" or "val.h5ad" cannot be mistaken for a split file.
split_pattern = re.compile(r'^(.*)_(train|val)\.h5ad$')
for fp in fps:
    dtype = fp.split('/')[-2]
    name_match = split_pattern.match(fp.split('/')[-1])
    if name_match is None:
        # raw (unsplit) dataset or unrelated file
        continue
    disease, split = name_match.groups()
    adata_map[dtype].setdefault(disease, {})[split] = fp
for dtype, d in adata_map.items():
    for disease, m in d.items():
        print(dtype, disease)

scRNAseq brca
scRNAseq cesc
scRNAseq hnscc
scRNAseq melanoma
scRNAseq pbmc
scRNAseq pdac
snATACseq brca_gene_activity
snATACseq brca_motif
snATACseq brca_peaks
snATACseq ccrcc_gene_activity
snATACseq ccrcc_motif
snATACseq ccrcc_peaks
snATACseq gbm_gene_activity
snATACseq gbm_motif
snATACseq gbm_peaks
snRNAseq brca
snRNAseq ccrcc
snRNAseq gbm


### run workflows

In [21]:
def run_workflow_for_datasets(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` on each dataset, using that dataset's own train and val files.

    `adata_map` is nested as data type -> disease -> {'train': fp, 'val': fp}.
    Results for each dataset go to `<output_dir>/<data type>/<disease>/`.
    """
    for dtype, diseases in adata_map.items():
        for disease, split_fps in diseases.items():
            # make sure the output directory exists
            directory = os.path.join(output_dir, dtype, disease)
            os.makedirs(directory, exist_ok=True)

            train = sc.read_h5ad(split_fps['train'])
            val = sc.read_h5ad(split_fps['val'])

            print(dtype, disease, train.shape, val.shape)
            run_workflow(workflow, workflow_identifier, train, val, directory)
            
def run_workflow_for_cross_disease(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` for every (training disease, validation disease) pair within
    each data type, including a disease paired with itself.

    Results for a pair go to
    `<output_dir>/<data type>/<disease1>_train_<disease2>_val/`.
    """
    for dtype, diseases in adata_map.items():
        # all ordered pairs, in the same order as two nested loops
        pairs = [(src, dst) for src in diseases.items() for dst in diseases.items()]
        for (disease1, m1), (disease2, m2) in pairs:
            run_name = f'{disease1}_train_{disease2}_val'
            # make sure the output directory exists
            directory = os.path.join(output_dir, dtype, run_name)
            os.makedirs(directory, exist_ok=True)

            # both files are read fresh for every pair
            train = sc.read_h5ad(m1['train'])
            val = sc.read_h5ad(m2['val'])

            print(dtype, f'{disease1}_train_{disease2}_val', train.shape, val.shape)
            run_workflow(workflow, workflow_identifier, train, val, directory)
                    
                    
def run_workflow_for_cross_datatype(adata_map, workflow, workflow_identifier, output_dir):
    """
    Run `workflow` for every combination of (training data type, training
    disease) and (validation data type, validation disease).

    Results for a combination go to
    `<output_dir>/<dtype1>_<dtype2>/<dtype1>_<disease1>_train_<dtype2>_<disease2>_val/`.
    """
    # lazily enumerate the combinations in the order of four nested loops
    combos = (
        (dtype1, disease1, m1, dtype2, disease2, m2)
        for dtype1, d1 in adata_map.items()
        for dtype2, d2 in adata_map.items()
        for disease1, m1 in d1.items()
        for disease2, m2 in d2.items()
    )
    for dtype1, disease1, m1, dtype2, disease2, m2 in combos:
        group_name = f'{dtype1}_{dtype2}'
        run_name = f'{dtype1}_{disease1}_train_{dtype2}_{disease2}_val'

        # make sure the output directory exists
        directory = os.path.join(output_dir, group_name, run_name)
        os.makedirs(directory, exist_ok=True)

        train = sc.read_h5ad(m1['train'])
        val = sc.read_h5ad(m2['val'])

        print(group_name, run_name, train.shape, val.shape)
        run_workflow(workflow, workflow_identifier, train, val, directory)

def run_workflow(workflow, workflow_identifier, train, val, output_dir):
    """
    Execute one workflow on a train/val pair and save its predictions.

    `workflow` is called with the train adata, the val adata and CELL_TYPE_KEY,
    and must return a dataframe with cell_id, groundtruth, predicted, and
    probability columns. The dataframe is written to
    `<output_dir>/<workflow_identifier>.tsv` as a tab-separated file without
    the index.
    """
    workflow_args = [train, val, CELL_TYPE_KEY]
    # pollock additionally needs to know where to save its module
    if workflow_identifier == 'pollock':
        workflow_args.append(os.path.join(output_dir, f'{workflow_identifier}_module'))

    results = workflow(*workflow_args)

    results_fp = os.path.join(output_dir, f'{workflow_identifier}.tsv')
    results.to_csv(results_fp, sep='\t', index=False, header=True)

##### pollock

In [24]:
def run_pollock_workflow(train, val, cell_type_key, module_fp):
    """
    Train a pollock module on `train`, save it to `module_fp`, and predict `val`.

    Note that an `is_validation` column is written into the `.obs` of both
    `train` and `val`.

    Returns a dataframe with cell_id, groundtruth, predicted, and probability
    columns for the cells in `val`.
    """
    # tag each cell so the combined dataset can be split back into train/val
    train.obs['is_validation'] = [False] * train.shape[0]
    val.obs['is_validation'] = [True] * val.shape[0]
    combined = train.concatenate(val)

    dataset = PollockDataset(combined.copy(), cell_type_key=cell_type_key,
                             dataset_type='training', validation_key='is_validation')

    model = PollockModel(dataset.cell_types, dataset.train_adata.shape[1],
                         alpha=.005, latent_dim=25)
    model.fit(dataset, epochs=15)

    # only score validation if train and val contain exactly the same cell types
    score_val = set(train.obs[cell_type_key]) == set(val.obs[cell_type_key])
    print(score_val)
    model.save(dataset, module_fp, score_train=True, score_val=score_val)

    # predict the validation cells with the module that was just saved
    preds = predict_from_anndata(val.copy(), module_fp, adata_batch_size=2500)

    groundtruth = val.obs.loc[preds.index][cell_type_key].to_list()
    return pd.DataFrame({
        'cell_id': preds.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': preds['predicted_cell_type'],
        'probability': preds['cell_type_probability']
    })

In [None]:
# Run the pollock workflow on every dataset's own train/val pair; outputs are written under RESULTS_DIR.
run_workflow_for_datasets(adata_map, run_pollock_workflow, 'pollock', RESULTS_DIR)

scRNAseq brca (6103, 27131) (5750, 27131)


2020-12-03 10:22:26,459 input dataset shape: (11853, 27131)
2020-12-03 10:22:26,460 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-12-03 10:22:26,461 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:22:52,848 epoch: 1, train loss: 38.36082077026367, val loss: 45.467193603515625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:23:10,038 epoch: 2, train loss: 37.4571533203125, val loss: 44.019691467285156
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:23:30,233 epoch: 3, train loss: 36.80590057373047, val loss: 43.24712371826172
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:23:52,200 epoch: 4, train loss: 36.420230865478516, val loss: 42.88362

2020-12-03 10:27:08,965 epoch: 13, train loss: 34.88040542602539, val loss: 41.50397872924805
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:27:28,751 epoch: 14, train loss: 34.70000076293945, val loss: 41.473228454589844
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:27:46,212 epoch: 15, train loss: 34.598087310791016, val loss: 41.45781707763672


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:28:07,052 0 genes in training set are missing from prediction set
2020-12-03 10:28:07,793 starting batch 1 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:28:07,891 0 genes in training set are missing from prediction set
2020-12-03 10:28:14,101 starting batch 2 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:28:14,244 0 genes in training set are missing from prediction set
2020-12-03 10:28:20,723 starting batch 3 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:28:20,811 0 genes in training set are missing from prediction set


scRNAseq cesc (4659, 22928) (4278, 22928)


2020-12-03 10:28:33,785 input dataset shape: (8937, 22928)
2020-12-03 10:28:33,794 possible cell types: ['CD4 T cell', 'CD8 T cell', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma']
2020-12-03 10:28:33,796 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:28:51,103 epoch: 1, train loss: 39.077796936035156, val loss: 47.60074234008789
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:29:01,803 epoch: 2, train loss: 37.848052978515625, val loss: 45.92450714111328
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:29:12,460 epoch: 3, train loss: 37.07207489013672, val loss: 44.42198181152344
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:29:27,454 epoch: 4, train loss: 36.58844757080078, val loss: 43.511268615722656
  if no

2020-12-03 10:31:26,043 epoch: 13, train loss: 35.03598403930664, val loss: 41.851226806640625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:31:37,379 epoch: 14, train loss: 34.87642288208008, val loss: 41.81293487548828
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:31:48,024 epoch: 15, train loss: 34.74740982055664, val loss: 41.770774841308594


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:32:02,763 0 genes in training set are missing from prediction set
2020-12-03 10:32:03,656 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:32:03,815 0 genes in training set are missing from prediction set
2020-12-03 10:32:11,640 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:32:11,748 0 genes in training set are missing from prediction set


scRNAseq hnscc (5283, 26929) (5205, 26929)


2020-12-03 10:32:27,015 input dataset shape: (10488, 26929)
2020-12-03 10:32:27,017 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Endothelial', 'Erythrocyte', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-12-03 10:32:27,018 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:32:51,556 epoch: 1, train loss: 32.96239471435547, val loss: 21.446332931518555
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:33:06,430 epoch: 2, train loss: 32.227989196777344, val loss: 21.131284713745117
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:33:23,927 epoch: 3, train loss: 31.691574096679688, val loss: 20.95756721496582
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:33:42,657 epoch: 4, train loss: 31.359819412231445, val loss: 20.855640411376953
  if not is_c

2020-12-03 10:36:13,443 epoch: 13, train loss: 30.041610717773438, val loss: 20.442378997802734
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:36:28,650 epoch: 14, train loss: 29.944503784179688, val loss: 20.41019058227539
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:36:43,247 epoch: 15, train loss: 29.76396942138672, val loss: 20.382484436035156


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:36:54,881 0 genes in training set are missing from prediction set
2020-12-03 10:36:55,305 starting batch 1 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:36:55,393 0 genes in training set are missing from prediction set
2020-12-03 10:37:01,198 starting batch 2 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:37:01,321 0 genes in training set are missing from prediction set
2020-12-03 10:37:07,551 starting batch 3 of 3
  if not is_categorical(df_full[k]):
2020-12-03 10:37:07,649 0 genes in training set are missing from prediction set


scRNAseq melanoma (4210, 23452) (3525, 23452)


2020-12-03 10:37:15,145 input dataset shape: (7735, 23452)
2020-12-03 10:37:15,146 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Fibroblast', 'Malignant', 'Monocyte', 'NK', 'Plasma', 'Treg']
2020-12-03 10:37:15,147 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:37:27,045 epoch: 1, train loss: 39.387149810791016, val loss: 43.925411224365234
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:37:36,554 epoch: 2, train loss: 38.884273529052734, val loss: 43.524192810058594
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:37:46,171 epoch: 3, train loss: 38.01335525512695, val loss: 42.876625061035156
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:37:55,846 epoch: 4, train loss: 37.45105743408203, val loss: 42.474178314208984
  if not is_categorical(d

2020-12-03 10:39:23,264 epoch: 13, train loss: 35.94448471069336, val loss: 41.64372634887695
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:39:32,826 epoch: 14, train loss: 35.80657196044922, val loss: 41.60167694091797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:39:42,483 epoch: 15, train loss: 35.657691955566406, val loss: 41.576602935791016


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:39:50,594 0 genes in training set are missing from prediction set
2020-12-03 10:39:51,131 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:39:51,226 0 genes in training set are missing from prediction set
2020-12-03 10:39:56,060 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:39:56,413 0 genes in training set are missing from prediction set


scRNAseq pbmc (1607, 32738) (888, 32738)


2020-12-03 10:40:01,286 input dataset shape: (2495, 32738)
2020-12-03 10:40:01,288 possible cell types: ['B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Megakaryocyte', 'Monocyte', 'NK']
2020-12-03 10:40:01,288 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):




2020-12-03 10:40:07,673 5 out of the last 12 calls to <function compute_loss at 0x7fb9f2238560> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-12-03 10:40:07,762 6 out of the last 13 calls to <function compute_loss at 0x7fb9f2238560> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.




2020-12-03 10:40:07,964 6 out of the last 14 calls to <function compute_loss at 0x7fb9f2238560> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
  if not is_categorical(df_full[k]):
2020-12-03 10:40:08,523 epoch: 1, train loss: 43.81148147583008, val loss: 43.58734130859375
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:40:14,266 epoch: 2, train loss: 42.503963470458984, val loss: 41.86241912841797
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:40:19,708 epoch: 3, train loss: 41.37276077270508, val loss: 4

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:41:19,760 epoch: 14, train loss: 40.69096374511719, val loss: 41.00183868408203
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:41:25,403 epoch: 15, train loss: 40.782752990722656, val loss: 41.007896423339844


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:41:29,510 0 genes in training set are missing from prediction set
2020-12-03 10:41:30,216 starting batch 1 of 1
  if not is_categorical(df_full[k]):
2020-12-03 10:41:30,266 0 genes in training set are missing from prediction set


scRNAseq pdac (7937, 28756) (7826, 28756)


2020-12-03 10:41:48,214 input dataset shape: (15763, 28756)
2020-12-03 10:41:48,216 possible cell types: ['Acinar', 'B cell', 'CD4 T cell', 'CD8 T cell', 'Dendritic', 'Endothelial', 'Epithelial', 'Erythrocyte', 'Fibroblast', 'Islet', 'Malignant', 'Mast', 'Monocyte', 'NK', 'Plasma', 'Treg', 'Tuft']
2020-12-03 10:41:48,217 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:42:13,060 epoch: 1, train loss: 32.58137130737305, val loss: 31.20792579650879
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:42:33,801 epoch: 2, train loss: 31.189313888549805, val loss: 29.943016052246094
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:42:54,750 epoch: 3, train loss: 30.90347671508789, val loss: 29.386436462402344
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:43:16,070 epoch: 4, train l

2020-12-03 10:46:23,332 epoch: 13, train loss: 29.210481643676758, val loss: 28.016592025756836
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:46:44,125 epoch: 14, train loss: 28.94912338256836, val loss: 27.96592903137207
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:47:05,245 epoch: 15, train loss: 28.86359977722168, val loss: 27.933374404907227


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:47:23,412 0 genes in training set are missing from prediction set
2020-12-03 10:47:23,756 starting batch 1 of 4
  if not is_categorical(df_full[k]):
2020-12-03 10:47:24,165 0 genes in training set are missing from prediction set
2020-12-03 10:47:30,578 starting batch 2 of 4
  if not is_categorical(df_full[k]):
2020-12-03 10:47:30,748 0 genes in training set are missing from prediction set
2020-12-03 10:47:36,933 starting batch 3 of 4
  if not is_categorical(df_full[k]):
2020-12-03 10:47:37,061 0 genes in training set are missing from prediction set
2020-12-03 10:47:43,161 starting batch 4 of 4
  if not is_categorical(df_full[k]):
2020-12-03 10:47:43,281 0 genes in training set are missing from prediction set


snATACseq brca_gene_activity (3576, 19891) (3519, 19891)


2020-12-03 10:47:49,793 input dataset shape: (7095, 19891)
2020-12-03 10:47:49,795 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2020-12-03 10:47:49,796 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:47:59,350 epoch: 1, train loss: 48.49177169799805, val loss: 72.05081176757812
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:48:07,283 epoch: 2, train loss: 48.39235305786133, val loss: 71.877197265625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:48:14,909 epoch: 3, train loss: 48.01340103149414, val loss: 71.26495361328125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:48:22,351 epoch: 4, train loss: 47.757713317871094, val loss: 70.97528076171875
  if not is_categorical(df_full[k]):
  if not is_c

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:49:38,135 epoch: 14, train loss: 45.738426208496094, val loss: 69.69580078125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:49:45,838 epoch: 15, train loss: 45.64197540283203, val loss: 69.76399230957031


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 10:49:52,644 0 genes in training set are missing from prediction set
2020-12-03 10:49:52,969 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:49:53,182 0 genes in training set are missing from prediction set
2020-12-03 10:50:03,065 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:50:03,168 0 genes in training set are missing from prediction set
2020-12-03 10:50:07,005 input dataset shape: (7095, 633)
2020-12-03 10:50:07,007 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2020-12-03 10:50:07,007 using validation key


snATACseq brca_motif (3576, 633) (3519, 633)


  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:09,616 epoch: 1, train loss: 9.596799850463867, val loss: 10.726572036743164
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:11,327 epoch: 2, train loss: 8.248610496520996, val loss: 8.532623291015625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:13,041 epoch: 3, train loss: 7.341155052185059, val loss: 7.5806450843811035
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:14,755 epoch: 4, train loss: 6.960606098175049, val loss: 7.172427177429199
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:16,939 epoch: 5, train loss: 6.67603063583374, val loss: 6.896964073181152
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:19,654 epoch: 6, train loss: 6.41405

  if not is_categorical(df_full[k]):
2020-12-03 10:50:38,856 epoch: 14, train loss: 5.2711968421936035, val loss: 5.564957618713379
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:50:41,304 epoch: 15, train loss: 5.198841094970703, val loss: 5.495020866394043


True


  d['descr'] = dtype_to_descr(array.dtype)
  _warn_prf(average, modifier, msg_start, len(result))
  if not is_categorical(df_full[k]):
2020-12-03 10:50:46,366 0 genes in training set are missing from prediction set
2020-12-03 10:50:46,678 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:50:46,801 0 genes in training set are missing from prediction set
2020-12-03 10:50:47,075 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 10:50:47,176 0 genes in training set are missing from prediction set


snATACseq brca_peaks (3576, 257715) (3519, 257715)


2020-12-03 10:52:11,285 input dataset shape: (7095, 257715)
2020-12-03 10:52:11,289 possible cell types: ['B cell', 'Dendritic', 'Endothelial', 'Fibroblast', 'Malignant', 'Mast', 'Monocyte', 'T cells']
2020-12-03 10:52:11,289 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:53:24,795 epoch: 1, train loss: 181.44300842285156, val loss: 240.04171752929688
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:54:30,144 epoch: 2, train loss: 178.78453063964844, val loss: 238.716796875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:55:41,427 epoch: 3, train loss: 177.08079528808594, val loss: 238.06378173828125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 10:56:49,127 epoch: 4, train loss: 175.436767578125, val loss: 237.00680541992188
  if not is_categorical(df_full[k]):
  if not i

2020-12-03 11:06:35,569 epoch: 13, train loss: 155.3831787109375, val loss: 239.13311767578125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:07:42,611 epoch: 14, train loss: 152.7743377685547, val loss: 239.20497131347656
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:08:49,326 epoch: 15, train loss: 149.20655822753906, val loss: 239.0794677734375


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 11:09:43,149 0 genes in training set are missing from prediction set
2020-12-03 11:09:48,151 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:09:48,370 0 genes in training set are missing from prediction set
2020-12-03 11:10:39,627 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:10:39,881 0 genes in training set are missing from prediction set


snATACseq ccrcc_gene_activity (3000, 19843) (3000, 19843)


2020-12-03 11:11:05,642 input dataset shape: (6000, 19843)
2020-12-03 11:11:05,644 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2020-12-03 11:11:05,645 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:11:13,693 epoch: 1, train loss: 44.828800201416016, val loss: 43.48681640625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:11:20,574 epoch: 2, train loss: 44.686195373535156, val loss: 43.41998291015625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:11:27,371 epoch: 3, train loss: 44.60805130004883, val loss: 43.370216369628906
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:11:34,192 epoch: 4, train loss: 44.37710952758789, val loss: 43.28217315673828
  if not is_categorical(df_full[k]):
  if not is_categorical(df_fu

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:12:46,174 epoch: 14, train loss: 42.56522750854492, val loss: 42.58273696899414
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:12:52,893 epoch: 15, train loss: 42.360984802246094, val loss: 42.6008415222168


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 11:12:58,748 0 genes in training set are missing from prediction set
2020-12-03 11:12:59,108 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:12:59,282 0 genes in training set are missing from prediction set
2020-12-03 11:13:07,556 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:13:07,633 0 genes in training set are missing from prediction set


snATACseq ccrcc_motif (3000, 633) (3000, 633)


2020-12-03 11:13:09,587 input dataset shape: (6000, 633)
2020-12-03 11:13:09,588 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2020-12-03 11:13:09,589 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:12,376 epoch: 1, train loss: 10.339033126831055, val loss: 10.628636360168457
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:13,860 epoch: 2, train loss: 8.775615692138672, val loss: 9.204313278198242
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:15,335 epoch: 3, train loss: 8.142725944519043, val loss: 8.748258590698242
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:16,776 epoch: 4, train loss: 7.836711406707764, val loss: 8.411665916442871
  if not is_categorical(df_full[k]):
  if not is_categorical(df_fu

  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:31,621 epoch: 14, train loss: 5.9318647384643555, val loss: 6.294837951660156
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:13:33,103 epoch: 15, train loss: 5.840847015380859, val loss: 6.2085065841674805


True


  d['descr'] = dtype_to_descr(array.dtype)
  if not is_categorical(df_full[k]):
2020-12-03 11:13:35,567 0 genes in training set are missing from prediction set
2020-12-03 11:13:35,718 starting batch 1 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:13:35,786 0 genes in training set are missing from prediction set
2020-12-03 11:13:35,958 starting batch 2 of 2
  if not is_categorical(df_full[k]):
2020-12-03 11:13:36,022 0 genes in training set are missing from prediction set


snATACseq ccrcc_peaks (3000, 928628) (3000, 928628)


2020-12-03 11:24:08,310 input dataset shape: (6000, 928628)
2020-12-03 11:24:08,320 possible cell types: ['Endothelial', 'Epithelial', 'Fibroblast', 'Malignant', 'Monocyte', 'T cells']
2020-12-03 11:24:08,321 using validation key
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:29:54,846 epoch: 1, train loss: 635.0525512695312, val loss: 654.512451171875
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:34:19,611 epoch: 2, train loss: 627.9926147460938, val loss: 652.6734619140625
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:38:48,458 epoch: 3, train loss: 623.8427734375, val loss: 649.698486328125
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k]):
2020-12-03 11:43:15,833 epoch: 4, train loss: 621.6390380859375, val loss: 649.1635131835938
  if not is_categorical(df_full[k]):
  if not is_categorical(df_full[k

2020-12-03 12:20:42,981 epoch: 13, train loss: 544.86083984375, val loss: 653.0462036132812


In [None]:
run_workflow_for_cross_disease(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_pollock_workflow, 'pollock', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# a = sc.read_h5ad(adata_map['snATACseq']['gbm']['train'])
# a

In [None]:
train, val = sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
module_dir = os.path.join(SANDBOX_DIR, 'temp_module')

In [None]:
train.obs['is_validation'] = [False] * train.shape[0]
val.obs['is_validation'] = [True] * val.shape[0]
combined = train.concatenate(val)
combined

In [None]:
train.shape, val.shape

In [None]:
np.count_nonzero(combined.obs['is_validation']), np.count_nonzero(~combined.obs['is_validation'])

In [None]:
# pds = PollockDataset(train, cell_type_key=CELL_TYPE_KEY,
#                      dataset_type='training')

In [None]:
val.shape

In [None]:
pds = PollockDataset(combined, cell_type_key=CELL_TYPE_KEY,
                     dataset_type='training', validation_key='is_validation')

In [None]:
pm = PollockModel(pds.cell_types, pds.train_adata.shape[1], alpha=.0001, latent_dim=25)

In [None]:
pm.fit(pds, epochs=2)

In [None]:
pm.save(pds, module_dir)

In [None]:
val.shape

In [None]:
# load the module from the same directory pm.save() wrote it to, instead of a
# second hard-coded copy of the path
preds = predict_from_anndata(val.copy(), module_dir, adata_batch_size=10000)
preds

In [None]:
df = pd.DataFrame.from_dict({
    'cell_id': preds.index.to_list(),
    'groundtruth': val.obs.loc[preds.index][CELL_TYPE_KEY].to_list(),
    'predicted': preds['predicted_cell_type'],
    'probability': preds['cell_type_probability']
})
df

##### scanpy ingest

In [None]:
def ingest_preprocess(adata):
    """
    Preprocess an AnnData object before running scanpy ingest.

    Steps, in order: flag mitochondrial genes, compute QC metrics, normalize
    each cell to 1e4 total counts, log1p, select 2500 highly variable genes,
    regress out total counts / mitochondrial percentage, and scale.

    NOTE: the steps before the subsetting modify the passed `adata` in place
    (`.var['mt']`, QC columns, `.X`, `.raw`). The returned object is the subset
    restricted to highly variable genes, so callers must use the return value.
    """
    # genes whose name starts with 'MT-' are treated as mitochondrial
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2500)
    # keep the normalized, log-transformed, all-gene matrix around before subsetting
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    # presumably these obs columns are the ones written by calculate_qc_metrics above
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata)
    
    return adata

def run_scanpy_workflow(train, val, cell_type_key):
    """
    Transfer cell type labels from `train` to `val` with scanpy's ingest.

    Parameters
    ----------
    train, val : AnnData
        Annotated training cells and the cells to label. Both must carry the
        groundtruth label in `.obs[cell_type_key]`.
    cell_type_key : str
        Name of the `.obs` column holding the cell type.

    Returns
    -------
    pd.DataFrame with columns cell_id, groundtruth, predicted, probability
    (probability is always NaN; no score is collected from ingest).

    NOTE(review): this function does not call `ingest_preprocess`; confirm the
    caller hands in data that is already normalized / scaled.
    """
    # ingest needs both objects to share the same variables.
    # Take explicit copies: the scanpy calls below write results onto the
    # objects, which should not happen on a view of the caller's AnnData.
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names].copy()
    val = val[:, var_names].copy()

    # stash the truth first, val.obs[cell_type_key] is read back as the
    # prediction after ingest
    groundtruth = val.obs[cell_type_key].to_list()

    # build the reference embedding that val is projected onto
    sc.pp.pca(train)
    sc.pp.neighbors(train)
    sc.tl.umap(train)

    sc.tl.ingest(val, train, obs=cell_type_key)

    df = pd.DataFrame.from_dict({
        'cell_id': val.obs.index.to_list(),
        'groundtruth': groundtruth,
        'predicted': val.obs[cell_type_key].to_list(),
        'probability': [np.nan] * val.shape[0]
    })

    return df

In [None]:
run_workflow_for_datasets(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_scanpy_workflow, 'scanpy_ingest', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# adata_map entries are read with sc.read_h5ad in the other cells (i.e. they are
# file paths), so load them here too instead of calling .copy() on them
train, val = (sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']),
              sc.read_h5ad(adata_map['scRNAseq']['pbmc']['val']))

In [None]:
train, val = ingest_preprocess(train), ingest_preprocess(val)

var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

sc.pp.pca(train)
sc.pp.neighbors(train)
sc.tl.umap(train)

In [None]:
sc.pl.umap(train, color='cell_type')

In [None]:
sc.tl.ingest(val, train, obs=CELL_TYPE_KEY)
val.uns[f'{CELL_TYPE_KEY}_colors'] = train.uns[f'{CELL_TYPE_KEY}_colors']

In [None]:
sc.pl.umap(val, color=[CELL_TYPE_KEY], wspace=0.5)


In [None]:
val

In [None]:
val.obs

##### ACTINN

In [None]:
def run_actinn_workflow(train, val, cell_type_key):
    """
    Train ACTINN on `train` and predict cell types for `val` through its
    command line scripts.

    Count matrices and training labels are written to SANDBOX_DIR, converted
    with ACTINN_FORMAT and classified with ACTINN_PREDICT.

    Parameters
    ----------
    train, val : AnnData
        Annotated training cells and the cells to label.
    cell_type_key : str
        Name of the `.obs` column holding the cell type.

    Returns
    -------
    pd.DataFrame with columns cell_id, groundtruth, predicted, probability
    (probability is always NaN, see the note on the predict call).

    Raises
    ------
    subprocess.CalledProcessError if any of the ACTINN scripts fails.
    """
    def counts_frame(adata):
        # the matrix is written genes x cells, hence the transpose
        X = adata.X.toarray() if 'sparse' in str(type(adata.X)) else adata.X
        return pd.DataFrame(data=X.transpose(), index=adata.var.index.to_list(),
                            columns=adata.obs.index.to_list())

    train_counts_df = counts_frame(train)
    val_counts_df = counts_frame(val)

    train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
    val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
    train_counts_df.to_csv(train_counts_fp, sep='\t')
    val_counts_df.to_csv(val_counts_fp, sep='\t')

    train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
    train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
    val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

    # use the key that was passed in rather than the notebook-level constant
    train.obs[[cell_type_key]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

    # the format script takes an output prefix, so strip the .h5 extension
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', train_counts_fp,
                            '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', val_counts_fp,
                            '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))
    # don't use the probability argument (-op) or it breaks
    subprocess.check_output(('python', ACTINN_PREDICT, '-trs', train_h5_fp,
                            '-trl', train_annotations_fp, '-ts', val_h5_fp))

    # NOTE(review): read relative to the current working directory, presumably
    # where the predict script drops its output - confirm
    prediction_df = pd.read_csv('predicted_label.txt', sep='\t')

    df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'predicted': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * prediction_df.shape[0]
    })

    # attach the groundtruth from val.obs by cell id
    df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
    df = df[['cell_id', cell_type_key, 'predicted', 'probability']]
    df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']

    return df
    
    
    
    


In [None]:
ACTINN_FORMAT = '/home/estorrs/ACTINN/actinn_format.py'
ACTINN_PREDICT = '/home/estorrs/ACTINN/actinn_predict.py'

run_workflow_for_datasets(adata_map, run_actinn_workflow, 'actinn', RESULTS_DIR)

###### testing stuff

In [None]:
# adata_map entries are read with sc.read_h5ad in the other cells (i.e. they are
# file paths), so load them here too instead of calling .copy() on them
train, val = (sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']),
              sc.read_h5ad(adata_map['scRNAseq']['pbmc']['val']))

In [None]:
# train.obs['dataset'] = ['train'] * train.shape[0]
# val.obs['dataset'] = ['val'] * val.shape[0]
# combined = train.concatenate(val)
# combined

In [None]:
train_counts_df = pd.DataFrame(data=train.X.transpose().toarray(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
val_counts_df = pd.DataFrame(data=val.X.transpose().toarray(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
train_counts_df

In [None]:
train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
train_counts_df.to_csv(train_counts_fp, sep='\t')
val_counts_df.to_csv(val_counts_fp, sep='\t')

python actinn_format.py -i input_file -o output_prefix -f format

python actinn_format.py -i ./test_data/train_set.txt.gz -o train_set -f txt


In [None]:
train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', train_counts_fp,
                        '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_format.py', '-i', val_counts_fp,
                        '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))

In [None]:
train.obs[[CELL_TYPE_KEY]]

python actinn_predict.py -trs training_set -trl training_label -ts test_set -lr learning_rate -ne num_epoch -ms minibatch_size -pc print_cost -op output_probability


-trs Path to the training set, must be HDF5 format with key "dge".

-trl Path to the training label (the cell types for the training set), must be tab separated text file with no column and row names.

-ts Path to test sets, must be HDF5 format with key "dge".

-lr Learning rate (default: 0.0001). We can increase the learning rate if the cost drops too slow, or decrease the learning rate if the cost drops super fast in the beginning and starts to fluctuate in later epochs.

-ne Number of epochs (default: 50). The number of epochs can be determined by looking at the cost after each epoch. If the cost starts to decrease very slowly after a certain epoch, then the "ne" parameter should be set to that epoch number.

-ms Minibatch size (default: 128). This parameter can be set larger when training a large dataset.

-pc Print cost (default: True). Whether to print cost after each 5 epochs.

-op Output probabilities for each cell being the cell types in the training data (default: False).


In [None]:
subprocess.check_output(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp))

In [None]:
' '.join(('python', '/home/estorrs/ACTINN/actinn_predict.py', '-trs', train_h5_fp,
                        '-trl', train_annotations_fp, '-ts', val_h5_fp,
                        '-op', 'True'))

In [None]:
prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
prediction_df

In [None]:
df = pd.DataFrame.from_dict({
        'cell_id': prediction_df['cellname'].to_list(),
        'prediction': prediction_df['celltype'].to_list(),
        'probability': [np.nan] * val.shape[0]
    })
df

In [None]:
val.obs

In [None]:
df = pd.merge(df, val.obs, left_on='cell_id', right_index=True)
df = df[['cell_id', 'cell_type', 'prediction', 'probability']]
df.columns = ['cell_id', 'groundtruth', 'prediction', 'probability']
df


##### Seurat

In [None]:
def _write_seurat_counts(adata, counts_fp, cap):
    """Write adata.X as a tab separated genes x cells int32 table, with values above `cap` set to `cap`."""
    X = adata.X.toarray() if 'sparse' in str(type(adata.X)) else adata.X
    # cap on the numpy array before building the frame; assigning through
    # DataFrame.values only works while pandas hands back a writable view
    counts = np.minimum(X.transpose().astype(np.int32), cap)
    counts_df = pd.DataFrame(data=counts, index=adata.var.index, columns=adata.obs.index)
    counts_df.index.name = ''
    counts_df.to_csv(counts_fp, sep='\t', header=True, index=True)


def run_seurat_transfer(train, val, cell_type_key):
    """
    Label `val` by running the Seurat R script (SEURAT_SCRIPT) with `train`
    as the reference.

    Count matrices and annotations are written to SANDBOX_DIR, the R script is
    run through Rscript, and its prediction table is read back in.

    Parameters
    ----------
    train, val : AnnData
        Annotated training cells and the cells to label.
    cell_type_key : str
        Name of the `.obs` column holding the cell type.

    Returns
    -------
    pd.DataFrame with columns cell_id, groundtruth, predicted, probability.
    An empty DataFrame is returned (and a message printed) when the R script
    fails or its output lacks the expected columns.
    """
    # save the input data for the seurat script
    train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                        os.path.join(SANDBOX_DIR, 'val_counts.txt'))
    train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                                os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

    ## prepare train and val count matrices
    # for some reason SCTransform fails if the integer values are too high, so capping them here
    cap = pow(2, 14)
    _write_seurat_counts(train, train_counts_fp, cap)
    _write_seurat_counts(val, val_counts_fp, cap)

    # use the key that was passed in rather than the notebook-level constant
    train.obs[[cell_type_key]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
    val.obs[[cell_type_key]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)

    # actually run the script and read the results back in
    prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
    try:
        subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                            val_counts_fp, val_annotations_fp, prediction_fp))
    except subprocess.CalledProcessError as e:
        # deliberate best effort: a failed dataset yields an empty result
        print('called process error', e)
        return pd.DataFrame()

    # format the predictions dataframe
    df = pd.read_csv(prediction_fp, sep='\t')
    # cell ids come back with '.' where the originals had '-'
    df.index = [x.replace('.', '-') for x in df.index]
    # also remove that weird X thing seurat sometimes puts there if first char is _
    df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
    df = pd.merge(df, val.obs, left_index=True, right_index=True)
    df['cell_id'] = df.index.to_list()
    try:
        df = df[['cell_id', cell_type_key, 'predicted.id', 'prediction.score.max']]
        df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
        return df
    except KeyError as e:
        print('key error', e)
        return pd.DataFrame()

In [None]:
SEURAT_SCRIPT = '/home/estorrs/pollock/benchmarking/tools/run_seurat_workflow.R'
run_workflow_for_datasets(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_seurat_transfer, 'seurat_transfer', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
# train, val = adata_map['scRNAseq']['pbmc']['train'].copy(), adata_map['scRNAseq']['pbmc']['val'].copy()
train, val = sc.read_h5ad(adata_map['scRNAseq']['brca']['train']), sc.read_h5ad(adata_map['scRNAseq']['brca']['val'])

In [None]:
pow(2, 14)

In [None]:
# save the input data for the seurat script
train_counts_fp, val_counts_fp = (os.path.join(SANDBOX_DIR, 'train_counts.txt'),
                                    os.path.join(SANDBOX_DIR, 'val_counts.txt'))
train_annotations_fp, val_annotations_fp = (os.path.join(SANDBOX_DIR, 'train_annotations.txt'),
                                            os.path.join(SANDBOX_DIR, 'val_annotations.txt'))

## prepare train and val count matrices
# for some reason SCTransform fails if the integer values are too high, so capping them here.
# The cap is applied to the numpy array before the frame is built: assigning through
# DataFrame.values only works while pandas hands back a writable view.
cap = pow(2, 14)

X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
train_counts = pd.DataFrame(data=np.minimum(X.transpose().astype(np.int32), cap),
                            index=train.var.index, columns=train.obs.index)
train_counts.index.name = ''
train_counts.to_csv(train_counts_fp, sep='\t', header=True, index=True)

X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
val_counts = pd.DataFrame(data=np.minimum(X.transpose().astype(np.int32), cap),
                          index=val.var.index, columns=val.obs.index)
val_counts.index.name = ''
val_counts.to_csv(val_counts_fp, sep='\t', header=True, index=True)

train.obs[[CELL_TYPE_KEY]].to_csv(train_annotations_fp, sep='\t', header=False, index=False)
val.obs[[CELL_TYPE_KEY]].to_csv(val_annotations_fp, sep='\t', header=False, index=False)

In [None]:
train_counts

In [None]:
train_counts

In [None]:
type(train_counts.values), type(train_counts.values[0, 0])

In [None]:
vals = sorted(set(train_counts.values.flatten()))
vals

In [None]:
vals[:10], vals[-10:]

In [None]:
# clip through the DataFrame API; writing into .values is not guaranteed to reach the frame
train_counts = train_counts.clip(upper=1000)

In [None]:
np.where(train_counts>1)

In [None]:
# actually run the script and read the results back in
prediction_fp = os.path.join(SANDBOX_DIR, 'seurat_predictions.txt')
subprocess.check_output(('Rscript', SEURAT_SCRIPT, train_counts_fp, train_annotations_fp,
                    val_counts_fp, val_annotations_fp, prediction_fp))

In [None]:
# format the predictions dataframe
df = pd.read_csv(prediction_fp, sep='\t')
df.index = [x.replace('.', '-') for x in df.index]
# also remove that weird X thing seurat sometimes puts there
df.index = [x[1:] if x[:2]=='X_' else x for x in df.index]
df = pd.merge(df, val.obs, left_index=True, right_index=True)
df['cell_id'] = df.index.to_list()
df = df[['cell_id', 'cell_type', 'predicted.id', 'prediction.score.max']]        
df.columns = ['cell_id', 'groundtruth', 'predicted', 'probability']
df

In [None]:
val.obs

##### SingleCellNet

In [None]:
# !pip install git+https://github.com/pcahan1/PySingleCellNet/

In [None]:
import pySingleCellNet as pySCN

In [None]:
import scipy

In [None]:
def run_SingleCellNet(train, val, cell_type_key):
    """
    Train a SingleCellNet classifier on `train` and classify `val`.

    Parameters
    ----------
    train, val : AnnData
        Annotated training cells and the cells to label. NOTE: `.X` of both
        objects may be replaced in place (train -> sparse csr, val -> dense).
    cell_type_key : str
        Name of the `.obs` column holding the cell type.

    Returns
    -------
    pd.DataFrame with columns cell_id, groundtruth, predicted, probability
    (probability is always NaN; no score is collected here).
    """
    # scn_train is handed a sparse matrix, scn_classify a dense one
    if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
    if 'sparse' in str(type(val.X)): val.X = val.X.toarray()

    # fit on the annotated cells, then classify the validation cells
    cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes=100, nRand=100, nTrees=1000, nTopGenePairs=100,
            dLevel=cell_type_key, stratify=True, limitToHVG=True, )
    predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)

    # line the predicted class up with the groundtruth by cell id
    df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)

    # use the key that was passed in; copy so the column assignments below
    # don't write into a slice of the merged frame
    df = df[[cell_type_key, 'SCN_class']].copy()
    df.columns = ['groundtruth', 'predicted']
    df['cell_id'] = df.index.to_list()
    df['probability'] = [np.nan] * df.shape[0]
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]

    return df

In [None]:
run_workflow_for_datasets(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_SingleCellNet, 'SingleCellNet', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
train, val = (sc.read_h5ad(adata_map['snATACseq']['brca_motif']['train']),
              sc.read_h5ad(adata_map['snATACseq']['brca_motif']['val']))

In [None]:
if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
if 'sparse' in str(type(val.X)): val.X = val.X.toarray()

In [None]:
train.var

In [None]:
len(set(train.var.index))

In [None]:
cgenesA, xpairs, tspRF = pySCN.scn_train(train,
            nTopGenes = 100, nRand = 100, nTrees = 1000 ,nTopGenePairs = 100,
            dLevel = "cell_type", stratify=True, limitToHVG=True, )

In [None]:
train.shape, val.shape

In [None]:
val.obs

In [None]:
tspRF

In [None]:
val.X

In [None]:
val.X = val.X.toarray()

In [None]:
predictions = pySCN.scn_classify(val, cgenesA, xpairs, tspRF, nrand = 0)


In [None]:
predictions.obs

In [None]:
df = pd.merge(predictions.obs[['SCN_class']], val.obs, left_index=True, right_index=True)

df = df[['cell_type', 'SCN_class']]
df.index.name = 'cell_id'
df.columns = ['groundtruth', 'predictions']
df['probability'] = [np.nan] * df.shape[0]
df

##### MARS

In [None]:
from args_parser import get_parser
from model.mars import MARS
from model.experiment_dataset import ExperimentDataset
from sklearn.utils.linear_assignment_ import linear_assignment

In [None]:
##pulled from evaluation.py and modified to remove class thing
def hungarian_match(y_true, y_pred):
    """
    Matches predicted labels to original using hungarian algorithm.

    Both label vectors are first re-indexed to 0..n-1 (see adjust_range). Each
    predicted label is then renamed to the true label it is paired with in the
    assignment that maximizes the total overlap.

    Parameters
    ----------
    y_true, y_pred : array-like of int, same length

    Returns
    -------
    (y_true, y_pred) : tuple of np.ndarray (int64)
        The re-indexed true labels and the predicted labels expressed in that
        same re-indexed label space.

    Raises
    ------
    ValueError if the two inputs differ in length.
    """
    # sklearn.utils.linear_assignment_ no longer ships with scikit-learn;
    # scipy's solver is its replacement
    from scipy.optimize import linear_sum_assignment

    y_true = adjust_range(y_true)
    y_pred = adjust_range(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same length')
    if y_pred.size == 0:
        return y_true, y_pred

    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # Confusion matrix: rows are predicted labels, columns are true labels.
    np.add.at(w, (y_pred, y_true), 1)
    # negate so that minimizing the cost maximizes the overlap
    rows, cols = linear_sum_assignment(-w)
    d = {int(i): int(j) for i, j in zip(rows, cols)}
    y_pred = np.array([d[int(v)] for v in y_pred], dtype=np.int64)

    return y_true, y_pred


def adjust_range(y):
    """
    Assures that the range of indices is from 0 to n-1.

    Labels are compacted in ascending order of their value, so the result does
    not depend on set iteration order and labels that already are 0..n-1 are
    returned unchanged.
    """
    y = np.array(y, dtype=np.int64)
    # the inverse holds, for every element, its position among the sorted unique values
    _, inverse = np.unique(y, return_inverse=True)
    return inverse.astype(np.int64).reshape(y.shape)
    
    
def run_mars_workflow(train, val, cell_type_key):
    """
    Label `val` with MARS, using `train` as the annotated experiment.

    MARS returns cluster labels for the unannotated cells. They are matched to
    the groundtruth labels with hungarian_match and decoded to class names.

    Parameters
    ----------
    train, val : AnnData
        Annotated training cells and the cells to label. NOTE: dense `.X` of
        the passed objects is replaced in place by a csr matrix.
    cell_type_key : str
        Name of the `.obs` column holding the cell type.

    Returns
    -------
    pd.DataFrame with columns cell_id, groundtruth, predicted, probability
    (probability is always NaN).

    Raises
    ------
    KeyError if `val` contains a cell type that is absent from `train`
    (the class encoding is built from the training labels only).
    """
    # parse_known_args so notebook/kernel arguments don't trip the MARS parser
    params, _ = get_parser().parse_known_args()
    params.device = 'cpu'

    if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
    if 'sparse' not in str(type(val.X)): val.X = scipy.sparse.csr_matrix(val.X)

    # explicit copies: the obs assignment below should not land on a view of
    # the caller's AnnData
    var_names = train.var_names.intersection(val.var_names)
    train = train[:, var_names].copy()
    val = val[:, var_names].copy()

    train.obs['dataset'] = 'train'
    val.obs['dataset'] = 'val'

    # preprocess train and val together, then split them apart again
    adata = anndata.concat((train, val))

    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.scale(adata, zero_center=True, max_value=10.)

    sc.pp.neighbors(adata, n_neighbors=30, use_rep='X')
    sc.pp.pca(adata, n_comps=50)

    train = adata[adata.obs['dataset'] == 'train',:]
    val = adata[adata.obs['dataset'] == 'val',:]

    train = train[train.obs.sort_values(cell_type_key).index]
    val = val[val.obs.sort_values(cell_type_key).index]

    # integer encoding of the classes, built from the training labels
    class_to_int = {c:i for i, c in enumerate(sorted(set(train.obs[cell_type_key])))}
    int_to_class = {i:c for c, i in class_to_int.items()}

    y_train = np.array([class_to_int[c] for c in train.obs[cell_type_key]], dtype=np.int64)
    annotated = ExperimentDataset(train.X.toarray(), train.obs_names, train.var_names, 'train', y_train, )
    y_val = np.array([class_to_int[c] for c in val.obs[cell_type_key]], dtype=np.int64)
    unannotated = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val', y_val)

    pretrain_data = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val')
    n_clusters = len(np.unique(unannotated.y))
    mars = MARS(n_clusters, params, [annotated], unannotated, pretrain_data, hid_dim_1=1000, hid_dim_2=100)
    a, landmarks, scores = mars.train(evaluation_mode=True, save_all_embeddings=True)

    preds = a[a.obs['experiment']=='val'].copy()
    # cell ids come back with a '-val' suffix; strip it so they match val.obs again
    preds.obs.index = [x.replace('-val', '') for x in preds.obs.index]

    truth = np.array(preds.obs['truth_labels'], dtype=np.int64)
    matched_truth, adjusted = hungarian_match(truth, preds.obs['MARS_labels'])
    # hungarian_match re-indexes the labels it is given, so its output is only
    # a valid key into int_to_class when val contains every training class.
    # Recover the original integer label behind each re-indexed one.
    adjusted_to_label = {int(adj): int(orig) for adj, orig in zip(matched_truth, truth)}

    preds.obs['adjusted_predicted'] = adjusted
    preds.obs['groundtruth'] = [val.obs.loc[x, cell_type_key] for x in preds.obs.index]
    # labels without a groundtruth counterpart fall back to the raw value, as before
    preds.obs['predicted'] = [int_to_class[adjusted_to_label.get(int(x), int(x))]
                              for x in preds.obs['adjusted_predicted']]

    # copy so the column assignments below don't write into a slice of preds.obs
    df = preds.obs[['groundtruth', 'predicted']].copy()
    df['cell_id'] = df.index.to_list()
    df['probability'] = np.nan
    df = df[['cell_id', 'groundtruth', 'predicted', 'probability']]

    return df

In [None]:
run_workflow_for_datasets(adata_map, run_mars_workflow, 'mars', RESULTS_DIR)

In [None]:
run_workflow_for_cross_disease(adata_map, run_mars_workflow, 'mars', RESULTS_CROSS_DISEASE_DIR)

In [None]:
run_workflow_for_cross_datatype(adata_map, run_mars_workflow, 'mars', RESULTS_CROSS_DTYPE_DIR)

###### testing stuff

In [None]:
from model.mars import MARS
from model.experiment_dataset import ExperimentDataset

In [None]:
train, val = (sc.read_h5ad(adata_map['scRNAseq']['pbmc']['train']),
              sc.read_h5ad(adata_map['scRNAseq']['pbmc']['val']))

In [None]:
if 'sparse' not in str(type(train.X)): train.X = scipy.sparse.csr_matrix(train.X)
if 'sparse' not in str(type(val.X)): val.X = scipy.sparse.csr_matrix(val.X)


In [None]:
var_names = train.var_names.intersection(val.var_names)
train = train[:, var_names]
val = val[:, var_names]

train.obs['dataset'] = 'train'
val.obs['dataset'] = 'val'

# concatenate with the module-level anndata.concat (as run_mars_workflow does) and bind
# the result to `adata`, the name the following lines and cells actually use
adata = anndata.concat((train, val))

sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.scale(adata, zero_center=True)

sc.pp.neighbors(adata, n_neighbors=30, use_rep='X')
sc.pp.pca(adata, n_comps=50)

In [None]:
train = adata[adata.obs['dataset'] == 'train',:]
val = adata[adata.obs['dataset'] == 'val',:]

In [None]:
y_train = np.array(train.obs['cell_type'])
annotated = ExperimentDataset(train.X.toarray(), train.obs_names, train.var_names, 'train', y_train)

In [None]:
y_val = np.array(val.obs['cell_type'])
unannotated = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val', y_val)

In [None]:
pretrain_data = ExperimentDataset(val.X.toarray(), val.obs_names, val.var_names, 'val')

In [None]:

# one cluster per distinct label of the unannotated dataset
n_clusters = len(np.unique(unannotated.y))

In [None]:
# NOTE(review): `params` is only assigned inside run_mars_workflow in the visible
# notebook, so it is built here the same way - drop this if it is defined elsewhere
params, _ = get_parser().parse_known_args()
params.device = 'cpu'
mars = MARS(n_clusters, params, [annotated], unannotated, pretrain_data, hid_dim_1=1000, hid_dim_2=100)

In [None]:
# return both annotated and unannotated datasets with save_all_embeddings
adata, landmarks, scores = mars.train(evaluation_mode=True, save_all_embeddings=True) # evaluation mode

In [None]:
adata.obs