In [18]:
import os
import re
import pickle
from pathlib import Path
import yaml
import json

import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import pandas as pd
import torch
import tifffile
from einops import rearrange, repeat

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import mushroom.utils as utils
import mushroom.visualization.utils as vis_utils

In [5]:
project_dir = Path('/data/estorrs/mushroom/data/projects/submission_v1')

In [16]:
source_root = '/diskmnt/Projects/Users/estorrs/mushroom/data'
target_root = '/data/estorrs/mushroom/data'

def alter_filesystem(config, source_root, target_root):
    for entry in config['sections']:
        for mapping in entry['data']:
            mapping['filepath'] = mapping['filepath'].replace(source_root, target_root)
    
    if 'trainer_kwargs' in config and config['trainer_kwargs']['data_mask'] is not None:
        config['trainer_kwargs']['data_mask'] = config['trainer_kwargs']['data_mask'].replace(source_root, target_root)
        
    return config

In [6]:
case_order = [
    'HT206B1',
    'HT268B1',
    'HT339B2-H1',
    'HT397B1',
    'HT565B1-H2',
    'HT704B1',
    'HT891Z1',
    'HT913Z1',
    'S18-5591-C8',
    'S18-9906',
]

In [None]:
def downsample_adata(adata, n, key, sample=None):
    cell_types = sorted(set(adata.obs[key]))
    cell_ids = []
    for ct in cell_types:
        f = adata.obs[adata.obs[key]==ct]
        
        if sample is not None:
            if not isinstance(sample, list):
                f = f[f['orig.ident']==sample]
            else:
                m = None
                for sid in sample:
                    if m is None:
                        m = f['orig.ident']==sid
                    else:
                        m |= f['orig.ident']==sid
                f = f[m]
        
        if f.shape[0] > n:
            cell_ids += np.random.choice(f.index.to_list(), size=n, replace=False).tolist()
        else:
            cell_ids += f.index.to_list()
    return adata[cell_ids]

def get_sc_dfs(adata, n, key, samples=None):
    down = downsample_adata(adata, n, key, sample=samples)
    
    exp_df = pd.DataFrame(data=down.X.toarray().astype(int),
                          columns=down.var.index.to_list(),
                          index=down.obs.index.to_list()).T
    exp_df.index.name = 'GENES'

    sc_annot = down.obs[[key]]
    sc_annot.index.name = 'Cell IDs'
    sc_annot.columns = ['CellType']
    
    return exp_df, sc_annot

## load in single cell data

integratedprostate.annotated.rds from simon

In [61]:
out_dir = Path('/diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/prostate')
out_dir.mkdir(parents=True, exist_ok=True)

In [68]:
adata = sc.read_h5ad('/diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/single_cell/prostate_v2.h5ad')
adata

AnnData object with n_obs × n_vars = 56111 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'is_cell', 'excluded_reason', 'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads', 'gex_conf_exonic_reads', 'gex_conf_intronic_reads', 'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads', 'gex_conf_exonic_dup_reads', 'gex_exonic_umis', 'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads', 'gex_conf_intronic_dup_reads', 'gex_intronic_umis', 'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count', 'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq', 'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads', 'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments', 'atac_peak_region_cutsites', 'rho', 'percent.mt', 'log10GenesPerUMI', 'nUMI', 'nGene', 'nCount_ATAC', 'nFeature_ATAC', 'doublet_score_rna', 'predicted_doublet_rna', 'doublet_score_atac', 'predicted_doublet_atac', 'predicted_doublet', 'nCount_ATA

In [69]:
np.unique(adata.raw.X[0].toarray())

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  20.,  23.,  26.,  27.,
       259.])

In [70]:
np.unique(adata.X[0])

array([<1x36601 sparse matrix of type '<class 'numpy.float64'>'
       	with 1750 stored elements in Compressed Sparse Row format>],
      dtype=object)

In [71]:
adata = adata[:, [x for x in adata.var.index.to_list() if 'MT-' not in x]]
adata.shape

  if not is_categorical_dtype(df_full[k]):


(56111, 36588)

In [72]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,is_cell,excluded_reason,gex_raw_reads,gex_mapped_reads,gex_conf_intergenic_reads,gex_conf_exonic_reads,gex_conf_intronic_reads,...,seurat_clusters,percent.rb,doublet_score,SCT_snn_res.0.5,custom_snn_res.0.7,custom_snn_res.1,RNA_snn_res.2,RNA_snn_res.1,dotcat,celltypes
HT771_AAACAGCCAGGACACA-1,HT771Z1-S1H1A3Y1Nd1,3342.0,1708,1,0,19784,18974,1120,3626,13595,...,0,,,,18,25,0,0,Luminal Cells,Luminal 1 Cells
HT771_AAACAGCCAGTGAACG-1,HT771Z1-S1H1A3Y1Nd1,2456.0,1333,1,0,13450,12844,821,2264,9187,...,3,,,,18,25,3,0,Luminal Cells,Luminal 1 Cells
HT771_AAACAGCCATGGTTAT-1,HT771Z1-S1H1A3Y1Nd1,3107.0,1567,1,0,17521,16823,826,3434,11981,...,3,,,,18,25,3,0,Luminal Cells,Luminal 1 Cells
HT771_AAACAGCCATTCCTGT-1,HT771Z1-S1H1A3Y1Nd1,1344.0,870,1,0,3777,3549,268,1220,1722,...,0,,,,18,25,0,7,Luminal Cells,Luminal 1 Cells
HT771_AAACATGCAGCATGTC-1,HT771Z1-S1H1A3Y1Nd1,4131.0,1845,1,0,23859,22845,1052,4341,16818,...,3,,,,18,25,3,0,Luminal Cells,Luminal 1 Cells
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT913_TTTGGTGCACGAATCC-1,HT913Z1-S2H1A3Y1Nd1,3167.0,1527,1,0,16276,15360,1084,2854,10170,...,9,,,,7,8,9,1,Tumor,Tumor 2
HT913_TTTGTCCCACGTGCTG-1,HT913Z1-S2H1A3Y1Nd1,4141.0,1910,1,0,20664,19545,1316,3510,13395,...,8,,,,7,8,8,2,Luminal Cells,Luminal 1 Cells
HT913_TTTGTCTAGTTTGAGC-1,HT913Z1-S2H1A3Y1Nd1,8403.0,2964,1,0,52327,49990,3295,8505,35607,...,35,,,,7,8,35,24,Tumor,Tumor 2
HT913_TTTGTGGCAGCACGAA-1,HT913Z1-S2H1A3Y1Nd1,4257.0,1993,1,0,20153,19083,1289,3210,13927,...,2,,,,7,8,2,6,Tumor,Tumor 1


In [73]:
set(adata.obs['orig.ident'])

{'HT771Z1-S1H1A3Y1Nd1',
 'HT781Z1-S1H1A3Y1Nd1',
 'HT814Z1-S1A3Y1Nd1',
 'HT817Z1-S1H1A3Y1Nd1',
 'HT832Z1-S1H1A2Y1Nd1',
 'HT849Z1-S1H1Fc2Nd1',
 'HT891Z1-S2H3A2Y1Nd1_1Z1_1Bmn1_1',
 'HT898Z1-S1H1A3Nd1',
 'HT913Z1-S2H1A3Y1Nd1'}

In [75]:
for sid in set(adata.obs['orig.ident']):
    print(sid)
    f = adata[adata.obs['orig.ident']==sid]
    print(Counter(f.obs['celltypes']).most_common())

HT814Z1-S1A3Y1Nd1
[('Luminal 1 Cells', 2696), ('Luminal 2 Cells', 365), ('Tumor 2', 353), ('Fibroblasts', 169), ('T-Cells', 115), ('Macrophages', 93), ('Tumor 1', 41), ('Club Cells', 30), ('Club/Hillock Cells', 26), ('Smooth Muscle Cells', 23), ('Basal Cells', 21), ('Endothelial Cells', 20), ('Mast Cells', 16), ('B-Cells', 13), ('Pericytes', 13), ('Basophils', 10)]
HT913Z1-S2H1A3Y1Nd1
[('Luminal 1 Cells', 813), ('Tumor 2', 357), ('Fibroblasts', 206), ('Macrophages', 164), ('Luminal 2 Cells', 140), ('Basal Cells', 91), ('T-Cells', 91), ('Mast Cells', 45), ('Tumor 1', 37), ('Club Cells', 30), ('Smooth Muscle Cells', 25), ('Endothelial Cells', 11), ('Basophils', 10), ('B-Cells', 9), ('Pericytes', 7), ('Club/Hillock Cells', 6)]
HT781Z1-S1H1A3Y1Nd1
[('Luminal 1 Cells', 4603), ('Basal Cells', 575), ('T-Cells', 420), ('B-Cells', 410), ('Macrophages', 332), ('Luminal 2 Cells', 258), ('Club Cells', 207), ('Fibroblasts', 155), ('Mast Cells', 111), ('Endothelial Cells', 71), ('Club/Hillock Cells'

In [76]:
set(adata.obs['celltypes'])

{'B-Cells',
 'Basal Cells',
 'Basophils',
 'Club Cells',
 'Club/Hillock Cells',
 'Endothelial Cells',
 'Fibroblasts',
 'Luminal 1 Cells',
 'Luminal 2 Cells',
 'Macrophages',
 'Mast Cells',
 'Pericytes',
 'Smooth Muscle Cells',
 'T-Cells',
 'Tumor 1',
 'Tumor 2'}

In [79]:
case_map = {
    'HT891Z1': ['HT891Z1-S2H3A2Y1Nd1_1Z1_1Bmn1_1'],
    'HT913Z1': ['HT913Z1-S2H1A3Y1Nd1'],
}

mapping = {
    'B cell': ['B-Cells'],
    'Basal': ['Basal Cells'],
    'Epithelial': ['Luminal 1 Cells', 'Luminal 2 Cells', 'Tumor 1', 'Tumor 2'],
    'T cell': ['T-Cells'],
    'Macrophage': ['Macrophages'],
    'Fibroblast': ['Fibroblasts'],
    'Smooth Muscle Cells': ['Smooth Muscle Cells'],
    'Endothelial': ['Endothelial Cells'],
    'Club Cells': ['Club Cells', 'Club/Hillock Cells'],
    'Mast': ['Mast Cells'],
    'Pericyte': ['Pericytes'],
    'Exclude': ['Basophils']
}
cell_type_map = {
    v:k for k, vs in mapping.items() for v in vs
}

In [80]:
adata.obs['renamed_cell_types'] = [cell_type_map[c] for c in adata.obs['celltypes']]
adata = adata[adata.obs['renamed_cell_types']!='Exclude']
adata

  adata.obs['renamed_cell_types'] = [cell_type_map[c] for c in adata.obs['celltypes']]
  if not is_categorical_dtype(df_full[k]):


View of AnnData object with n_obs × n_vars = 55700 × 36588
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'is_cell', 'excluded_reason', 'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads', 'gex_conf_exonic_reads', 'gex_conf_intronic_reads', 'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads', 'gex_conf_exonic_dup_reads', 'gex_exonic_umis', 'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads', 'gex_conf_intronic_dup_reads', 'gex_intronic_umis', 'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count', 'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq', 'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads', 'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments', 'atac_peak_region_cutsites', 'rho', 'percent.mt', 'log10GenesPerUMI', 'nUMI', 'nGene', 'nCount_ATAC', 'nFeature_ATAC', 'doublet_score_rna', 'predicted_doublet_rna', 'doublet_score_atac', 'predicted_doublet_atac', 'predicted_doublet', 'nC

In [81]:
n = 2000

for sid, samples in case_map.items():
    print(sid, samples)
    exp_df, sc_annot = get_sc_dfs(adata, n, 'renamed_cell_types', samples=samples)
    print(exp_df.shape)
    exp_df.to_csv(os.path.join(out_dir, f'{sid}_input_sc_exp.txt'), sep='\t')
    sc_annot.to_csv(os.path.join(out_dir, f'{sid}_input_sc_annot.txt'), sep='\t')

print('all')
exp_df, sc_annot = get_sc_dfs(adata, n, 'renamed_cell_types', samples=[v for vs in case_map.values() for v in vs])
exp_df.to_csv(os.path.join(out_dir, f'all_input_sc_exp.txt'), sep='\t')
sc_annot.to_csv(os.path.join(out_dir, f'all_input_sc_annot.txt'), sep='\t')




HT891Z1 ['HT891Z1-S2H3A2Y1Nd1_1Z1_1Bmn1_1']


  if not is_categorical_dtype(df_full[k]):


(36588, 2845)
HT913Z1 ['HT913Z1-S2H1A3Y1Nd1']


  if not is_categorical_dtype(df_full[k]):


(36588, 2032)
all


  if not is_categorical_dtype(df_full[k]):


breast

In [40]:
out_dir = Path('/diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/breast')
out_dir.mkdir(parents=True, exist_ok=True)

In [11]:
adata = sc.read_h5ad('/diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/single_cell/breast.h5ad')
adata

AnnData object with n_obs × n_vars = 131348 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'cell_type_final', 'is_cell', 'excluded_reason', 'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads', 'gex_conf_exonic_reads', 'gex_conf_intronic_reads', 'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads', 'gex_conf_exonic_dup_reads', 'gex_exonic_umis', 'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads', 'gex_conf_intronic_dup_reads', 'gex_intronic_umis', 'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count', 'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq', 'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads', 'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments', 'atac_peak_region_cutsites', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_ATAC_MACS2', 'nFeature_ATAC_MACS2', 'pct_read_in_peaks_500

In [34]:
np.unique(adata.raw.X[0].toarray())

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  29.,  30.,  31.,  32.,  34.,  37.,  40.,
        47.,  50.,  58.,  59.,  82.,  92., 455.])

In [29]:
np.unique(adata.X[0].toarray())

array([0.        , 0.51213571, 0.84916889, 1.10079482, 1.30166369,
       1.46885495, 1.61205638, 1.73729643, 1.84858278, 1.94871539,
       2.03972776, 2.12314347, 2.20013356, 2.27161757, 2.33833075,
       2.40087027, 2.45972775, 2.51531274, 2.56797006, 2.61799274,
       2.66563191, 2.71110441, 2.75459879, 2.79628   , 2.83629318,
       2.87476672, 3.01537292, 3.04763888, 3.07889621, 3.10920607,
       3.16720156, 3.24833787, 3.32338225, 3.47926967, 3.53929356,
       3.68370086, 3.70036928, 4.02259485, 4.13571593, 5.72138487])

In [None]:
adata.X = adata.raw.X

In [58]:
adata = adata[:, [x for x in adata.var.index.to_list() if 'MT-' not in x]]
adata.shape

  if not is_categorical_dtype(df_full[k]):


(45740, 36588)

In [12]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mito,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,cell_type_final,is_cell,...,TSS.enrichment,TSS.percentile,high.tss,nucleosome_group,ATAC_qual,SCT.weight,ATAC_MACS2.weight,wsnn_res.0.8,SCT_snn_res.0.8,sample_id
HT206B1-S1H4_AAACCCACAGATTCGT-1,HT206B1-S1H4,14951,5706,0.000535081265467193,3405,2489,1,15,T,,...,,,,,,,,,15,HT206B1-S1H4
HT206B1-S1H4_AAACCCACAGGCATTT-1,HT206B1-S1H4,2964,1550,0.0823211875843455,2999,1550,1,9,T,,...,,,,,,,,,9,HT206B1-S1H4
HT206B1-S1H4_AAACCCACATGACGTT-1,HT206B1-S1H4,2585,1596,0.0421663442940039,2743,1596,2,9,T,,...,,,,,,,,,9,HT206B1-S1H4
HT206B1-S1H4_AAACCCAGTCCTACAA-1,HT206B1-S1H4,1662,1120,0.00842358604091456,2697,1121,13,9,NK,,...,,,,,,,,,9,HT206B1-S1H4
HT206B1-S1H4_AAACCCAGTCTAGTGT-1,HT206B1-S1H4,1290,969,0.0124031007751938,2609,978,2,9,T,,...,,,,,,,,,9,HT206B1-S1H4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HT486B1-S1H1_combo_TTTGTGGCATTAAACC-1,HT486B1-S1H1_combo,1582,987,,2661,992,,22,,1,...,4.42498677792799,0.53,High,NS < 4,ATAC_good,0.997894767494635,0.00210523250536479,7,22,HT486B1-S1H1_combo
HT486B1-S1H1_combo_TTTGTGTTCATTGACA-1,HT486B1-S1H1_combo,5713,2166,,3855,2097,,11,,1,...,4.58041958041958,0.58,High,NS < 4,ATAC_poor,0.407859736253307,0.592140263746693,0,11,HT486B1-S1H1_combo
HT486B1-S1H1_combo_TTTGTGTTCATTTGTC-1,HT486B1-S1H1_combo,23435,5935,,3292,1939,,11,,1,...,4.90509490509489,0.68,High,NS < 4,ATAC_good,0.504711994422915,0.495288005577085,11,11,HT486B1-S1H1_combo
HT486B1-S1H1_combo_TTTGTGTTCGATTATG-1,HT486B1-S1H1_combo,1825,950,,2639,950,,11,,1,...,4.84131253362022,0.66,High,NS < 4,ATAC_poor,0.512599953219932,0.487400046780068,2,11,HT486B1-S1H1_combo


In [13]:
set(adata.obs['orig.ident'])

{'HT206B1-S1H4',
 'HT235B1-S1H1_combo',
 'HT243B1-H3A2',
 'HT243B1-S1H4_combo',
 'HT262B1-S1H3',
 'HT263B1-S1H1_combo',
 'HT271B1-S1H3_combo',
 'HT297B1-S1H1_combo',
 'HT305B1-S1H1_combo',
 'HT308B1-S1V1_combo',
 'HT323B1-S1H1_combo',
 'HT339B1-S1H3_combo',
 'HT339B2-S1H2_combo',
 'HT365B1-S1H1_combo',
 'HT397B1-S1H4_combo',
 'HT425B1-S1H1_combo',
 'HT486B1-S1H1_combo'}

In [17]:
from collections import Counter
Counter(adata.obs['cell_type_final']).most_common()

[('NA', 84283),
 ('Tumor', 24070),
 ('T', 6554),
 ('Macrophage', 5918),
 ('Fibroblast', 2315),
 ('Basal', 1224),
 ('T_reg', 1045),
 ('LumHR', 804),
 ('Remove_Unknown', 725),
 ('B', 713),
 ('Plasma', 628),
 ('Endothelial', 606),
 ('pDC', 485),
 ('Doublet', 475),
 ('NK', 385),
 ('Pericyte', 350),
 ('LumSec', 206),
 ('mregDC', 190),
 ('Mast', 151),
 ('Lymphatic', 125),
 ('cDC1', 55),
 ('Basal/LumSec', 41)]

In [18]:
for sid in set(adata.obs['orig.ident']):
    print(sid)
    f = adata[adata.obs['orig.ident']==sid]
    print(Counter(f.obs['cell_type_final']).most_common())

HT365B1-S1H1_combo
[('NA', 5833)]
HT297B1-S1H1_combo
[('NA', 4574)]
HT235B1-S1H1_combo
[('NA', 7516)]
HT486B1-S1H1_combo


  if not is_categorical_dtype(df_full[k]):


[('NA', 4867)]
HT339B1-S1H3_combo
[('Tumor', 1979), ('Macrophage', 1242), ('T', 756), ('Fibroblast', 689), ('LumHR', 520), ('Basal', 490), ('T_reg', 255), ('NK', 119), ('Mast', 111), ('Plasma', 103), ('pDC', 39)]
HT262B1-S1H3
[('NA', 3772)]
HT243B1-S1H4_combo
[('NA', 7970)]
HT206B1-S1H4
[('T', 3714), ('Macrophage', 1950), ('Tumor', 1296), ('T_reg', 742), ('Remove_Unknown', 725), ('Fibroblast', 567), ('B', 314), ('Plasma', 288), ('pDC', 274), ('NK', 266), ('LumSec', 206), ('LumHR', 169), ('Basal', 143), ('mregDC', 122), ('Endothelial', 119), ('Pericyte', 65), ('cDC1', 55), ('Mast', 40)]
HT425B1-S1H1_combo
[('NA', 3627)]
HT323B1-S1H1_combo
[('NA', 8607)]
HT308B1-S1V1_combo
[('Tumor', 8289), ('Macrophage', 1115), ('T', 496), ('Doublet', 228), ('Plasma', 156), ('Fibroblast', 140), ('LumHR', 115), ('B', 51), ('T_reg', 48), ('Basal/LumSec', 41), ('Endothelial', 32)]
HT271B1-S1H3_combo
[('NA', 13161)]
HT339B2-S1H2_combo
[('Tumor', 7751), ('Macrophage', 378), ('Fibroblast', 320), ('Endothelial

In [37]:
case_map = {
    'HT206B1': ['HT206B1-S1H4'],
    'HT339B2': ['HT339B2-S1H2_combo', 'HT339B1-S1H3_combo'],
    'HT397B1': ['HT397B1-S1H4_combo'],
}

mapping = {
    'B cell': ['B'],
    'Epithelial': ['Basal', 'Basal/LumSec', 'LumHR', 'LumSec', 'Tumor'],
    'T cell': ['T'],
    'NK cell': ['NK'],
    'Macrophage': ['Macrophage'],
    'Fibroblast': ['Fibroblast'],
    'Treg': ['T_reg'],
    'Plasma': ['Plasma'],
    'Endothelial': ['Endothelial'],
    'pDC': ['pDC'],
    'mregDC': ['mregDC'],
    'Mast': ['Mast'],
    'cDC': ['cDC1'],
    'Pericyte': ['Pericyte'],
    'Exclude': ['NA', 'Remove_Unknown', 'Doublet', 'Lymphatic']
}
cell_type_map = {
    v:k for k, vs in mapping.items() for v in vs
}

In [48]:
adata.obs['renamed_cell_types'] = [cell_type_map[c] for c in adata.obs['cell_type_final']]
adata = adata[adata.obs['renamed_cell_types']!='Exclude']
adata

  if not is_categorical_dtype(df_full[k]):


View of AnnData object with n_obs × n_vars = 45740 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.5', 'seurat_clusters', 'cell_type_final', 'is_cell', 'excluded_reason', 'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads', 'gex_conf_exonic_reads', 'gex_conf_intronic_reads', 'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads', 'gex_conf_exonic_dup_reads', 'gex_exonic_umis', 'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads', 'gex_conf_intronic_dup_reads', 'gex_intronic_umis', 'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count', 'atac_raw_reads', 'atac_unmapped_reads', 'atac_lowmapq', 'atac_dup_reads', 'atac_chimeric_reads', 'atac_mitochondrial_reads', 'atac_fragments', 'atac_TSS_fragments', 'atac_peak_region_fragments', 'atac_peak_region_cutsites', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_ATAC_MACS2', 'nFeature_ATAC_MACS2', 'pct_read_in_pe

In [60]:
n = 2000

for sid, samples in case_map.items():
    print(sid, samples)
    exp_df, sc_annot = get_sc_dfs(adata, n, 'renamed_cell_types', samples=samples)
    print(exp_df.shape)
    exp_df.to_csv(os.path.join(out_dir, f'{sid}_input_sc_exp.txt'), sep='\t')
    sc_annot.to_csv(os.path.join(out_dir, f'{sid}_input_sc_annot.txt'), sep='\t')

print('all')
exp_df, sc_annot = get_sc_dfs(adata, n, 'renamed_cell_types', samples=[v for vs in case_map.values() for v in vs])
exp_df.to_csv(os.path.join(out_dir, f'all_input_sc_exp.txt'), sep='\t')
sc_annot.to_csv(os.path.join(out_dir, f'all_input_sc_annot.txt'), sep='\t')




all


  if not is_categorical_dtype(df_full[k]):


## create runs

In [7]:
case_order = [
    'HT206B1',
    'HT268B1',
    'HT339B2-H1',
    'HT397B1',
    'HT565B1-H2',
    'HT704B1',
    'HT891Z1',
    'HT913Z1',
    'S18-5591-C8',
    'S18-9906',
]

In [14]:
fps = sorted(utils.listfiles(project_dir / 'cytospace'))
case_to_data = {}
for case in case_order:
    paths = [fp for fp in fps if case in fp]
    if len(paths):
        fp = paths[0]
        data = {
            'sc_exp': '/'.join(fp.split('/')[:-1]) + f'/{case}_input_sc_exp.txt',
            'sc_annot': '/'.join(fp.split('/')[:-1]) + f'/{case}_input_sc_annot.txt',
        }
    else:
        if case in ['HT891Z1', 'HT913Z1', 'S18-5591-C8', 'S18-9906']:
            prefix = '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/prostate'
        else:
            prefix = '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast'
        data = {
            'sc_exp': f'{prefix}/all_input_sc_exp.txt',
            'sc_annot': f'{prefix}/all_input_sc_annot.txt',
        }
    case_to_data[case] = data
case_to_data

{'HT206B1': {'sc_exp': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_exp.txt',
  'sc_annot': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_annot.txt'},
 'HT268B1': {'sc_exp': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/all_input_sc_exp.txt',
  'sc_annot': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/all_input_sc_annot.txt'},
 'HT339B2-H1': {'sc_exp': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT339B2-H1_input_sc_exp.txt',
  'sc_annot': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT339B2-H1_input_sc_annot.txt'},
 'HT397B1': {'sc_exp': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT397B1_input_sc_exp.txt',
  'sc_annot': '/data/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT397B1_input_sc_annot.txt'},
 'HT565B1-H2': {'sc_exp': '/data/estorrs/mushroom/data/projects/submiss

###### for tiled data

In [48]:
def get_spatial_dfs(img, channels, min_counts=10):
    grids = np.meshgrid(np.arange(img.shape[1]), np.arange(img.shape[2]), indexing='ij')
    ids = [f'h{i}_w{j}' for i, j in rearrange(grids, 'z ... -> (...) z')]
    spatial_counts = pd.DataFrame(data=rearrange(img, 'c h w -> (h w) c'),
                                  columns=channels, index=ids)
    spatial_counts = spatial_counts.T
    vals = (spatial_counts.values > 0).sum(0)
    spatial_counts = spatial_counts[[c for c in spatial_counts.columns[vals > min_counts]]]
    
    spatial_counts.index.name = 'V1'
    
    data = []
    for x in spatial_counts.columns:
        row = re.sub(r'^.*h([0-9]+).*$', r'\1', x)
        col = re.sub(r'^.*w([0-9]+).*$', r'\1', x)
        data.append([row, col])
    coord_df = pd.DataFrame(data=data, columns=['row', 'col'], index=spatial_counts.columns)
    coord_df.index.name = 'SpotID'

    return spatial_counts, coord_df

In [80]:
out_dir = project_dir / 'cytospace_inputs'
out_dir.mkdir(parents=True, exist_ok=True)
for case in case_order:
    print(case)
    imaris_dir = project_dir / case / 'imaris' / 'rois'
    config = yaml.safe_load(open(project_dir / case / 'registered' / 'metadata.yaml'))
    config = alter_filesystem(config, source_root, target_root)
    
    meta = json.load(open(imaris_dir / 'tiled' / 'metadata.json'))
    
    sid_to_channels = meta['sid_to_channels']
    dtype_ident_to_dtype = meta['dtype_ident_to_dtype']
    dtype_ident_to_channels = meta['dtype_ident_to_channels']
    sid_to_dtype_ident = meta['sid_to_dtype_ident']
    tiling_size = meta['tiling_size']
    size = meta['size']
    fullres_size = meta['fullres_size']
    dtype_ident_to_tiled_fps = meta['dtype_ident_to_tiled_fps']
    
    dtype_ident_to_tiled = {dti:tifffile.imread(fp) for dti, fp in dtype_ident_to_tiled_fps.items()}
    for dti, tiled in dtype_ident_to_tiled.items():
        print(dti, tiled.shape)
    
    sid_to_tiled_idx = {}
    for dtype_ident in dtype_ident_to_channels.keys():
        if 'vis' not in dtype_ident:
            dtype = dtype_ident_to_dtype[dtype_ident]
            sids = [entry['sid'] for entry in config['sections'] if sid_to_dtype_ident[entry['sid']]==dtype_ident]
            for i, sid in enumerate(sids):
                sid_to_tiled_idx[sid] = i

    for entry in config['sections']:
        sid = entry['sid']
        position = entry['position']
        dti = sid_to_dtype_ident[sid]
        dtype = dti.split('_')[0]
        channels = dtype_ident_to_channels[dti]
        print(sid, dti, len(channels))

        if dti in dtype_ident_to_tiled and 'he_' not in dti and '_he' not in dti and 'multiplex' not in dti:
            img = dtype_ident_to_tiled[dti][sid_to_tiled_idx[sid]]

            img = utils.rescale(img, scale=.2, dim_order='c h w', target_dtype=img.dtype) # get to 50 micron res to match nbhds and make deconv faster
            
        elif 'vis' in dti:
            fp = entry['data'][0]['filepath']
            adata = sc.read_h5ad(fp)
            adata.var_names_make_unique()

            if 'vishd' in dti:
                scale = 50
                new_size = [int(x * .2) for x in size]
            else:
                scale = 100
                adata.obs['y_location'] = adata.obsm['spatial'][:, 1]
                adata.obs['x_location'] = adata.obsm['spatial'][:, 1]
                new_size = [int(x * .1) for x in size]
                
            adata.obs['yi'] = (adata.obs['y_location'] // scale).astype(int)
            adata.obs['xi'] = (adata.obs['x_location'] // scale).astype(int)

            img = np.zeros((adata.shape[1], new_size[0], new_size[1]))
            tups = sorted({(r, c) for r, c in zip(adata.obs['yi'], adata.obs['xi'])})
            X = adata.X
            for r, c in tups:
                r = min(r, new_size[0] - 1)
                c = min(c, new_size[1] - 1)
                f = X[((adata.obs['yi']==r) & (adata.obs['xi']==c))]
                if f.shape[0]:
                    vals = np.asarray(f.sum(0)).flatten()
                    img[:, r, c] = vals
            channels = adata.var.index.to_list()
        else:
            img = None

        if img is not None:
            spatial_counts, coord_df = get_spatial_dfs(img, channels, min_counts=10)
            print(spatial_counts.shape)
            
            spatial_counts.to_csv(os.path.join(out_dir, f'{sid}_input_counts.txt'), sep='\t')
            coord_df.to_csv(os.path.join(out_dir, f'{sid}_input_coords.txt'), sep='\t')

            
            
    

HT206B1
xenium_0 (3, 477, 653, 605)
multiplex_0 (3, 37, 653, 605)
he_0 (3, 3, 653, 605)
multiplex_1 (3, 38, 653, 605)
xenium_1 (3, 377, 653, 605)
HT206B1-U1 xenium_0 477
(477, 9800)
HT206B1-U4 he_0 3
HT206B1-U2 multiplex_0 37
HT206B1-U5 multiplex_1 38
HT206B1-U8 xenium_1 377
(377, 9908)
HT206B1-U9 xenium_0 477
(477, 10096)
HT206B1-U10 multiplex_0 37
HT206B1-U12 he_0 3
HT206B1-U13 multiplex_1 38
HT206B1-U16 xenium_1 377
(377, 10140)
HT206B1-U17 xenium_0 477
(477, 10388)
HT206B1-U18 multiplex_0 37
HT206B1-U20 he_0 3
HT206B1-U21 multiplex_1 38
HT206B1-U24 xenium_1 377
(377, 10314)
HT268B1
HT268B1-U2 visium_0 36601
(36601, 55)
HT268B1-U12 visium_0 36601
(36601, 55)
HT268B1-U22 visium_0 36601
(36601, 55)
HT268B1-U32 visium_0 36601
(36601, 56)
HT339B2-H1
multiplex_0 (1, 33, 678, 668)
HT339B2-H1-U1 visium_0 17943
(17943, 51)
HT339B2-H1-U2 multiplex_0 33
HT397B1
multiplex_0 (2, 25, 678, 669)
batch2_he_0 (6, 3, 678, 669)
multiplex_1 (1, 27, 678, 669)
multiplex_2 (1, 25, 678, 669)
HT397B1-U1 vis

  utils.warn_names_duplicates("var")


(18085, 12092)
HT704B1-U11 multiplex_0 38
HT704B1-U12 he_0 3
HT704B1-U14 cosmx_0 1000
(1000, 11501)
HT704B1-U15 he_0 3
HT704B1-U17 xenium_0 476
(476, 14399)
HT704B1-U19 multiplex_0 38
HT704B1-U20 he_0 3
HT704B1-U22 cosmx_0 1000
(1000, 17218)
HT704B1-U23 he_0 3
HT704B1-U26 xenium_0 476
(476, 14055)
HT704B1-U27 multiplex_0 38
HT704B1-U28 he_0 3
HT704B1-U31 he_0 3
HT704B1-U33 xenium_0 476
(476, 14274)
HT704B1-U35 multiplex_0 38
HT704B1-U36 he_0 3
HT704B1-U39 he_0 3
HT704B1-U41 xenium_0 476
(476, 14055)
HT704B1-U43 multiplex_0 38
HT704B1-U44 he_0 3
HT704B1-U47 cosmx_0 1000
(1000, 8892)
HT704B1-U48 he_0 3
HT704B1-U50 xenium_0 476
(476, 13386)
HT704B1-U51 vishd_0 18085


  utils.warn_names_duplicates("var")


(18085, 12978)
HT704B1-U53 multiplex_0 38
HT704B1-U54 he_0 3
HT704B1-U56 cosmx_0 1000
(1000, 14293)
HT704B1-U57 he_0 3
HT704B1-U60 multiplex_0 38
HT704B1-U61 he_0 3
HT704B1-U65 he_0 3
HT704B1-U68 multiplex_0 38
HT704B1-U69 he_0 3
HT704B1-U73 he_0 3
HT704B1-U76 multiplex_0 38
HT704B1-U81 he_0 3
HT704B1-U82 multiplex_0 38
HT704B1-U84 he_0 3
HT704B1-U85 multiplex_0 38
HT704B1-U90 he_0 3
HT704B1-U91 multiplex_0 38
HT704B1-U93 he_0 3
HT891Z1
xenium_0 (9, 476, 653, 725)
he_0 (25, 3, 653, 725)
multiplex_0 (16, 25, 653, 725)
HT891Z1-U1 xenium_0 476
(476, 10839)
HT891Z1-U4 he_0 3
HT891Z1-U2 vishd_0 18085


  utils.warn_names_duplicates("var")


(18085, 10964)
HT891Z1-U5 he_0 3
HT891Z1-U6 he_0 3
HT891Z1-U7 he_0 3
HT891Z1-U14 multiplex_0 25
HT891Z1-U21 xenium_0 476
(476, 10701)
HT891Z1-U28 he_0 3
HT891Z1-U29 he_0 3
HT891Z1-U30 he_0 3
HT891Z1-U31 xenium_0 476
(476, 10921)
HT891Z1-U32 multiplex_0 25
HT891Z1-U33 vishd_0 18085


  utils.warn_names_duplicates("var")


(18085, 9796)
HT891Z1-U35 he_0 3
HT891Z1-U40 multiplex_0 25
HT891Z1-U41 multiplex_0 25
HT891Z1-U43 multiplex_0 25
HT891Z1-U44 xenium_0 476
(476, 11074)
HT891Z1-U45 multiplex_0 25
HT891Z1-U46 multiplex_0 25
HT891Z1-U47 he_0 3
HT891Z1-U48 he_0 3
HT891Z1-U49 he_0 3
HT891Z1-U50 he_0 3
HT891Z1-U58 multiplex_0 25
HT891Z1-U59 xenium_0 476
(476, 10860)
HT891Z1-U66 he_0 3
HT891Z1-U68 he_0 3
HT891Z1-U69 xenium_0 476
(476, 10700)
HT891Z1-U70 he_0 3
HT891Z1-U79 multiplex_0 25
HT891Z1-U81 xenium_0 476
(476, 10419)
HT891Z1-U84 he_0 3
HT891Z1-U89 he_0 3
HT891Z1-U90 he_0 3
HT891Z1-U93 multiplex_0 25
HT891Z1-U94 xenium_0 476
(476, 9019)
HT891Z1-U101 he_0 3
HT891Z1-U102 he_0 3
HT891Z1-U103 multiplex_0 25
HT891Z1-U104 xenium_0 476
(476, 7435)
HT891Z1-U106 he_0 3
HT891Z1-U107 he_0 3
HT891Z1-U108 multiplex_0 25
HT891Z1-U109 multiplex_0 25
HT891Z1-U110 multiplex_0 25
HT891Z1-U111 multiplex_0 25
HT891Z1-U113 multiplex_0 25
HT891Z1-U114 he_0 3
HT891Z1-U115 he_0 3
HT891Z1-U116 he_0 3
HT913Z1
xenium_0 (3, 5001,

In [81]:
sp_exp_fps = sorted(utils.listfiles(project_dir / 'cytospace_inputs', regex=r'input_counts.txt$'))
sp_coord_fps = sorted(utils.listfiles(project_dir / 'cytospace_inputs', regex=r'input_coords.txt$'))

In [82]:
sp_exp_fps

['/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U16_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U17_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U1_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U24_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U8_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U9_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT268B1-U12_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT268B1-U22_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT268B1-U2_input_counts.txt',
 '/data/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT268B1-U32_input_counts.

run these commands on katmai using cytospace_v2 env

In [89]:
for sp_exp_fp, sp_coord_fp in zip(sp_exp_fps, sp_coord_fps):
    case = re.sub(r'^.*cytospace_inputs/(.*)-U[0-9]*_input_counts.txt', r'\1', sp_exp_fp)
    sc_exp_fp = case_to_data[case]['sc_exp']
    sc_annot_fp = case_to_data[case]['sc_annot']
    
    root = sp_exp_fp.replace('input_counts.txt', '')
    output_dir = os.path.join(root, 'outputs')
    # cytospace -sp input_sc_exp.txt -ctp input_sc_annot.txt -cp input_visium_coords.txt -stp input_visium_counts.txt -o outputs -sm lap_CSPR -mcn 20 -g square -sss -nosss 5000 -nop 5
    cmd = f'cytospace -sp {sc_exp_fp} -ctp {sc_annot_fp} -cp {sp_coord_fp} -stp {sp_exp_fp} -o {output_dir} -sm lap_CSPR -mcn 20 -g square -sss -nosss 5000 -nop 5'
    cmd = cmd.replace('/data/estorrs', '/diskmnt/Projects/Users/estorrs')
    print(cmd)

cytospace -sp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_exp.txt -ctp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_annot.txt -cp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U16_input_coords.txt -stp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U16_input_counts.txt -o /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U16_/outputs -sm lap_CSPR -mcn 20 -g square -sss -nosss 5000 -nop 5
cytospace -sp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_exp.txt -ctp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace/breast/HT206B1_input_sc_annot.txt -cp /diskmnt/Projects/Users/estorrs/mushroom/data/projects/submission_v1/cytospace_inputs/HT206B1-U17_input_co

## collate cytospace results