In [1]:
import json
import logging
import os
import re
from pathlib import Path
import subprocess

import anndata
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
folds_fmap = {'snRNAseq': {'ccrcc': {'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold3_val.h5ad'},
   'fold1': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold1_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold1_train.h5ad'},
   'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold2_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold0_train.h5ad'},
   'fold4': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold4_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/ccrcc_fold4_train.h5ad'}},
  'brca': {'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold2_val.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold1_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold3_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold0_train.h5ad'},
   'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/brca_fold4_val.h5ad'}},
  'gbm': {'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold4_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold3_val.h5ad'},
   'fold0': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold0_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold0_val.h5ad'},
   'fold2': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold2_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold2_train.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snRNAseq/gbm_fold1_val.h5ad'}}},
 'snATACseq': {'ccrcc': {'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold3_val.h5ad'},
   'fold1': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold1_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold1_train.h5ad'},
   'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold2_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold0_train.h5ad'},
   'fold4': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold4_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/ccrcc_fold4_train.h5ad'}},
  'brca': {'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold2_val.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold1_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold3_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold0_train.h5ad'},
   'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/brca_fold4_val.h5ad'}},
  'gbm': {'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold4_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold3_val.h5ad'},
   'fold0': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold0_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold0_val.h5ad'},
   'fold2': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold2_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold2_train.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/snATACseq/gbm_fold1_val.h5ad'}}},
 'scRNAseq': {'hnscc': {'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold0_train.h5ad'},
   'fold4': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold4_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold4_train.h5ad'},
   'fold1': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold1_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold1_train.h5ad'},
   'fold3': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold3_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold3_train.h5ad'},
   'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/hnscc_fold2_val.h5ad'}},
  'cesc': {'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold4_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold3_val.h5ad'},
   'fold2': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold2_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold2_train.h5ad'},
   'fold0': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold0_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold0_val.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/cesc_fold1_val.h5ad'}},
  'brca': {'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold2_val.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold1_val.h5ad'},
   'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold3_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold0_train.h5ad'},
   'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/brca_fold4_val.h5ad'}},
  'myeloma': {'fold0': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold0_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold0_val.h5ad'},
   'fold1': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold1_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold1_train.h5ad'},
   'fold3': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold3_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold3_train.h5ad'},
   'fold4': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold4_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold4_train.h5ad'},
   'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/myeloma_fold2_val.h5ad'}},
  'melanoma': {'fold3': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold3_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold3_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold0_train.h5ad'},
   'fold2': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold2_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold2_train.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold1_val.h5ad'},
   'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/melanoma_fold4_val.h5ad'}},
  'pdac': {'fold2': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold2_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold2_val.h5ad'},
   'fold3': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold3_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold3_train.h5ad'},
   'fold1': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold1_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold1_val.h5ad'},
   'fold4': {'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold4_train.h5ad',
    'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold4_val.h5ad'},
   'fold0': {'val': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold0_val.h5ad',
    'train': '/data/pollock/benchmarking/pollock_datasets_with_folds/scRNAseq/pdac_fold0_train.h5ad'}}}}

In [3]:
ACTINN_FORMAT = '/home/estorrs/ACTINN/actinn_format.py'
ACTINN_PREDICT = '/home/estorrs/ACTINN/actinn_predict.py'
SANDBOX_DIR = '/data/sandbox/actinn'

In [4]:
def run_actinn(train, val):
    cell_type_key = 'cell_type'
    X = train.X.toarray() if 'sparse' in str(type(train.X)) else train.X
    train_counts_df = pd.DataFrame(data=X.transpose(), index=train.var.index.to_list(),
                        columns=train.obs.index.to_list())
    X = val.X.toarray() if 'sparse' in str(type(val.X)) else val.X
    val_counts_df = pd.DataFrame(data=X.transpose(), index=val.var.index.to_list(),
                        columns=val.obs.index.to_list())
    
    train_counts_fp = os.path.join(SANDBOX_DIR, 'train_counts.txt')
    val_counts_fp = os.path.join(SANDBOX_DIR, 'val_counts.txt')
    train_counts_df.to_csv(train_counts_fp, sep='\t')
    val_counts_df.to_csv(val_counts_fp, sep='\t')
    
    train_h5_fp = os.path.join(SANDBOX_DIR, 'train.h5')
    train_annotations_fp = os.path.join(SANDBOX_DIR, 'train_annotations.txt')
    val_h5_fp = os.path.join(SANDBOX_DIR, 'val.h5')

    train.obs[[cell_type_key]].to_csv(train_annotations_fp, sep='\t', index=True, header=False)

    subprocess.check_output(('python', ACTINN_FORMAT, '-i', train_counts_fp,
                            '-o', train_h5_fp.replace('.h5', ''), '-f', 'txt'))
    subprocess.check_output(('python', ACTINN_FORMAT, '-i', val_counts_fp,
                            '-o', val_h5_fp.replace('.h5', ''), '-f', 'txt'))
    # dont use probablity argument or it breaks
    subprocess.check_output(('python', ACTINN_PREDICT, '-trs', train_h5_fp,
                            '-trl', train_annotations_fp, '-ts', val_h5_fp))
    
    prediction_df = pd.read_csv('predicted_label.txt', sep='\t')
    print(prediction_df.columns)
    
    to_label = {k:v for k, v in zip(prediction_df['cellname'], prediction_df['celltype'])}
    
    val.obs['predicted_cell_type'] = [to_label[c] for c in val.obs.index.to_list()]

    return val
    

In [5]:
# with folds
result_dir = '/data/pollock/benchmarking/results/actinn/interdataset_with_folds'

In [12]:
to_prediction = {}
for dtype, d1 in folds_fmap.items():
    to_prediction[dtype] = {}
    for disease, d2 in d1.items():
        to_prediction[dtype][disease] = {}
        for fold, d3 in d2.items():
            print(dtype, disease, fold)
            train = sc.read_h5ad(d3['train'])
            val = sc.read_h5ad(d3['val'])
            
            try:
                to_prediction[dtype][disease][fold] = run_actinn(train, val)
            except:
                print(f'{dtype}_{disease}_{fold} failed')
        

snRNAseq ccrcc fold3
Index(['cellname', 'celltype'], dtype='object')
snRNAseq ccrcc fold1
Index(['cellname', 'celltype'], dtype='object')
snRNAseq ccrcc fold2
Index(['cellname', 'celltype'], dtype='object')
snRNAseq ccrcc fold0
Index(['cellname', 'celltype'], dtype='object')
snRNAseq ccrcc fold4
Index(['cellname', 'celltype'], dtype='object')
snRNAseq brca fold2
Index(['cellname', 'celltype'], dtype='object')
snRNAseq brca fold1
Index(['cellname', 'celltype'], dtype='object')
snRNAseq brca fold3
Index(['cellname', 'celltype'], dtype='object')
snRNAseq brca fold0
Index(['cellname', 'celltype'], dtype='object')
snRNAseq brca fold4
Index(['cellname', 'celltype'], dtype='object')
snRNAseq gbm fold4
Index(['cellname', 'celltype'], dtype='object')
snRNAseq gbm fold3
Index(['cellname', 'celltype'], dtype='object')
snRNAseq gbm fold0
Index(['cellname', 'celltype'], dtype='object')
snRNAseq gbm fold2
Index(['cellname', 'celltype'], dtype='object')
snRNAseq gbm fold1
Index(['cellname', 'celltype

In [13]:
for dtype, d1 in to_prediction.items():
    for disease, d2 in d1.items():
        for fold, a in d2.items():
            a.write_h5ad(os.path.join(result_dir, f'{dtype}_{disease}_{fold}.h5ad'))

... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predicted_cell_type' as categorical
... storing 'predict

In [15]:
a = sc.read_h5ad(folds_fmap['scRNAseq']['melanoma']['fold1']['train'])
a

AnnData object with n_obs × n_vars = 4404 × 23452
    obs: 'cell_type', 'barcode', 'sample'

In [16]:
a.obs

Unnamed: 0,cell_type,barcode,sample
554_07,CD8 T cell,07,554
1022_07,CD8 T cell,07,1022
29262_07,CD8 T cell,07,29262
30904_07,CD8 T cell,07,30904
10380_07,CD8 T cell,07,10380
...,...,...,...
17483_07,NK,07,17483
37805_07,NK,07,37805
24872_08,NK,08,24872
32221_07,NK,07,32221


In [17]:
len(set(a.obs.index))

4404

In [19]:
set(a.X[0].toarray().flatten())

{0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 22.0,
 23.0,
 25.0,
 27.0,
 28.0,
 30.0,
 31.0,
 33.0,
 34.0,
 36.0,
 39.0,
 44.0,
 47.0,
 54.0,
 72.0,
 79.0,
 253.0}

In [7]:
a = to_prediction['snRNAseq']['ccrcc']['fold3']

In [8]:
a

AnnData object with n_obs × n_vars = 4255 × 33538
    obs: 'cell_type', 'barcode', 'sample', 'predicted_cell_type'

In [9]:
a.obs

Unnamed: 0,cell_type,barcode,sample,predicted_cell_type
77047_16,Malignant,16,77047,Malignant
34864_1,Malignant,1,34864,Malignant
105942_5,Malignant,5,105942,Malignant
21249_17,Malignant,17,21249,Malignant
75345_15,Malignant,15,75345,Malignant
...,...,...,...,...
90608_7,Treg,7,90608,Treg
97193_7,Treg,7,97193,Treg
25659_7,Treg,7,25659,Treg
94821_7,Treg,7,94821,Treg
