In [1]:
import os
import re
import json
import pickle 
from pathlib import Path
import yaml

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import scipy
import skimage
import pandas as pd
import torch
import torchvision
import torchvision.transforms.functional as TF
import tifffile
from sklearn.metrics import adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram
from einops import rearrange, repeat

In [2]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams['svg.fonttype'] = 'none'

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [277]:
import mushroom.utils as utils
import mushroom.data.multiplex as multiplex

## DEG supp files

###### automated 2D

In [128]:
fps = sorted(utils.listfiles(
    '/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results',
    regex=r'correlations_raw.txt'))

stacked = []
for fp in fps:
    name = fp.split('/')[-1]
    pattern = r'^([^_]+)_([^_]+)_([^_]+).*$'
    disease = re.sub(pattern, r'\1', name)
    dtype = re.sub(pattern, r'\2', name)
    rtype = re.sub(pattern, r'\3', name)

    source = pd.read_csv(fp, sep='\t', index_col=0)
    source['disease'] = disease
    source['dtype'] = dtype
    source['comparison_type'] = rtype
    stacked.append(source)
auto_2d_stat = pd.concat(stacked)
auto_2d_stat['gene'] = ['_'.join(x.split('_')[1:]) for x in auto_2d_stat['feature']]

auto_2d_stat

Unnamed: 0,case,sid,channel,feature,correlation,pvalue,disease,dtype,comparison_type,gene
0,HT206B1,HT206B1-U15,boundary_ACTA2,boundary_AATK,0.042400,0.530644,brca,cosmx,boundary,AATK
1,HT206B1,HT206B1-U15,boundary_ACTA2,boundary_ABL1,0.064535,0.339606,brca,cosmx,boundary,ABL1
2,HT206B1,HT206B1-U15,boundary_ACTA2,boundary_ABL2,0.117369,0.081695,brca,cosmx,boundary,ABL2
3,HT206B1,HT206B1-U15,boundary_ACTA2,boundary_ACACB,0.085246,0.206808,brca,cosmx,boundary,ACACB
4,HT206B1,HT206B1-U15,boundary_ACTA2,boundary_ACE,0.001241,0.985363,brca,cosmx,boundary,ACE
...,...,...,...,...,...,...,...,...,...,...
47069,S22-53426,S22-53426-A1U3,boundary_KRT5,tme_VIM,-0.022449,0.813425,prad,xenium,tme,VIM
47070,S22-53426,S22-53426-A1U3,boundary_KRT5,tme_VPREB1,-0.019738,0.835614,prad,xenium,tme,VPREB1
47071,S22-53426,S22-53426-A1U3,boundary_KRT5,tme_BLANK_0006,-0.005040,0.957751,prad,xenium,tme,BLANK_0006
47072,S22-53426,S22-53426-A1U3,boundary_KRT5,tme_BLANK_0037,-0.018337,0.847138,prad,xenium,tme,BLANK_0037


In [304]:
set(auto_2d_stat['dtype'])

{'cosmx', 'multiplex', 'vishd', 'xenium'}

###### manual 2D

In [92]:
manual_2d_exp = pd.read_csv('/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/manual_2d_exp.txt',
                      sep='\t', index_col=0)
manual_2d_exp

Unnamed: 0_level_0,count,exp,z,sid,dtype,rid,gene,annotation,disease
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5_ACKR1,1,0.000344,0.014235,HT891Z1-U1,xenium,HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5,ACKR1,normal,prad
HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5_ACTA2,19,0.006543,0.270473,HT891Z1-U1,xenium,HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5,ACTA2,normal,prad
HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5_ACTG2,54,0.018595,0.768714,HT891Z1-U1,xenium,HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5,ACTG2,normal,prad
HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5_ADAM28,21,0.007231,0.298944,HT891Z1-U1,xenium,HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5,ADAM28,normal,prad
HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5_ADAM8,4,0.001377,0.056942,HT891Z1-U1,xenium,HT891Z1-U1_bd656535-23c2-413a-b5f9-3c120ed43ae5,ADAM8,normal,prad
...,...,...,...,...,...,...,...,...,...
S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb8662746_VCAN,201,0.007366,0.244161,S23-27043-A1U2,xenium,S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb86...,VCAN,gp3,prad
S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb8662746_VEGFA,455,0.016673,0.552704,S23-27043-A1U2,xenium,S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb86...,VEGFA,gp3,prad
S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb8662746_VSIG4,39,0.001429,0.047375,S23-27043-A1U2,xenium,S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb86...,VSIG4,gp3,prad
S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb8662746_VWA5A,48,0.001759,0.058307,S23-27043-A1U2,xenium,S23-27043-A1U2_e810e3fa-7cbd-4dbe-b54f-1eaeb86...,VWA5A,gp3,prad


In [124]:
manual_2d_stat = pd.read_csv('/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/manual_2d_stat.txt',
                              sep='\t')
manual_2d_stat

Unnamed: 0,disease,dtype,comparison,gene,stat,pvalue,pvalue_fdr
0,brca,xenium,normal_idc,NAT8,17.056772,8.867908e-10,0.000004
1,brca,xenium,normal_idc,ACTA2,7.501127,2.438563e-09,0.000006
2,brca,xenium,normal_idc,SYNM,10.072110,7.995139e-09,0.000010
3,brca,xenium,normal_idc,HNRNPA1L2,10.237435,6.216373e-09,0.000010
4,brca,xenium,normal_idc,GTF2I,9.874708,1.083959e-08,0.000011
...,...,...,...,...,...,...,...
132811,prad,vishd,hgpin_gp3,NOD1,0.000918,9.992897e-01,0.999482
132812,prad,vishd,hgpin_gp3,ARHGEF35,0.001016,9.992144e-01,0.999482
132813,prad,vishd,hgpin_gp3,SMIM1,-0.000714,9.994478e-01,0.999512
132814,prad,vishd,hgpin_gp3,GRTP1,0.000781,9.993960e-01,0.999512


###### manual 3D

In [103]:
fps = []
for case in ['HT704B1', 'HT206B1', 'HT891Z1', 'HT913Z1']:
    fps += sorted(utils.listfiles(
        f'/data/estorrs/mushroom/data/projects/submission_v1/{case}/imaris/rois/regions',
        regex=r'json$')
    )
    
case_to_roi_to_data = {}
for fp in fps:
    d = json.load(open(fp))
    data = {
        'rids': d['region_ids'],
        'sids': d['sids'],
        'rid_to_patho': d['region_id_to_pathology']
    }

    case = fp.split('/')[-5]
    roi = fp.split('/')[-1].split('_')[0]

    if case not in case_to_roi_to_data:
        case_to_roi_to_data[case] = {}

    case_to_roi_to_data[case][roi] = data
    

In [110]:
ignore = [('HT206B1', 'roi3'), ('HT891Z1', 'roi2-p2'), ('HT913Z1', 'roi1-p2')]

In [115]:
fps = []
for case in ['HT704B1', 'HT206B1', 'HT891Z1', 'HT913Z1']:
    fps += sorted(utils.listfiles(
        f'/data/estorrs/mushroom/data/projects/submission_v1/{case}/imaris/rois/results',
        regex=r'_expression.txt.gz')
    )
fps = [fp for fp in fps if 'tme' not in fp]

stacked = []
for fp in fps:
    case = fp.split('/')[-5]
    df = pd.read_csv(fp, sep='\t')

    name = fp.split('/')[-1]
    pattern = r'^([^_]+)_(.+)_([^_]+).txt.gz$'
    roi = re.sub(pattern, r'\1', name)
    dti = re.sub(pattern, r'\2', name)

    if (case, roi) not in ignore and 'combined' not in dti:    
        df['disease'] = 'breast' if 'B1' in case else 'prostate'
        df['case'] = case
        df['roi'] = roi
        df['dtype_identifier'] = dti
        df['dtype'] = dti.split('_')[0]
    
        data = case_to_roi_to_data[case][roi]
        df['region_index'] = [data['rids'].index(x) for x in df['region_id']]
        df['sample_id'] = [data['sids'][i] for i in df['region_index']]
        df['annotation'] = [data['rid_to_patho'][x] for x in df['region_id']]
        
        
        
        df = df.melt(id_vars=['disease', 'case', 'roi', 'region_id', 'dtype', 'dtype_identifier', 'region_index', 'sample_id', 'annotation'])
        df.columns = list(df.columns)[:-2] + ['gene', 'expression']
        stacked.append(df)
manual_3d_exp = pd.concat(stacked)
manual_3d_exp

Unnamed: 0,disease,case,roi,region_id,dtype,dtype_identifier,region_index,sample_id,annotation,gene,expression
0,breast,HT704B1,roi1,25f59172-9dc6-4340-a21f-2a2b557176cf,cosmx,cosmx_0,4,HT704B1-U14,DCIS,AATK,1.260581
1,breast,HT704B1,roi1,a4719551-76dd-4117-a82a-c086568f52f2,cosmx,cosmx_0,21,HT704B1-U47,IDC,AATK,1.742139
2,breast,HT704B1,roi1,7ee62508-a502-404f-b93c-47d0fa181aab,cosmx,cosmx_0,27,HT704B1-U56,IDC,AATK,1.425868
3,breast,HT704B1,roi1,25f59172-9dc6-4340-a21f-2a2b557176cf,cosmx,cosmx_0,4,HT704B1-U14,DCIS,ABL1,1.482688
4,breast,HT704B1,roi1,a4719551-76dd-4117-a82a-c086568f52f2,cosmx,cosmx_0,21,HT704B1-U47,IDC,ABL1,1.617586
...,...,...,...,...,...,...,...,...,...,...,...
2851,prostate,HT913Z1,roi1,734069e4-66e4-4a0c-93ee-115a1cb6e4c0,xenium,xenium_1,20,HT913Z1-U34,Normal,VWF,0.010736
2852,prostate,HT913Z1,roi1,611c1c4c-4c78-407f-9592-994e87ee0ca0,xenium,xenium_1,21,HT913Z1-U35,Normal,VWF,0.035741
2853,prostate,HT913Z1,roi1,1215ccaf-0c17-4f34-a945-4fafeb196884,xenium,xenium_1,37,HT913Z1-U61,Normal,VWF,0.021357
2854,prostate,HT913Z1,roi1,6d6519b0-a5cc-4a41-b123-69bcc04fb956,xenium,xenium_1,49,HT913Z1-U81,Normal,VWF,0.243129


In [119]:
fps = []
for case in ['HT704B1', 'HT206B1', 'HT891Z1', 'HT913Z1']:
    fps += sorted(utils.listfiles(
        f'/data/estorrs/mushroom/data/projects/submission_v1/{case}/imaris/rois/results',
        regex=r'_fc.txt.gz')
    )
fps = [fp for fp in fps if 'tme' not in fp]

stacked = []
for fp in fps:
    case = fp.split('/')[-5]
    df = pd.read_csv(fp, sep='\t')
    
    name = fp.split('/')[-1]
    pattern = r'^([^_]+)_(.+)_([^_]+).txt.gz$'
    roi = re.sub(pattern, r'\1', name)
    dti = re.sub(pattern, r'\2', name)

    if (case, roi) not in ignore and 'combined' not in dti:    
        df['disease'] = 'breast' if 'B1' in case else 'prostate'
        df['case'] = case
        df['roi'] = roi
        df['dtype_identifier'] = dti
        df['dtype'] = dti.split('_')[0]
        df['transition_id'] = [x.replace('IDC - partial', 'MI') for x in df['transition_id']]

        df = df.melt(id_vars=['disease', 'case', 'roi', 'dtype', 'dtype_identifier', 'transition_id'])
        df.columns = list(df.columns)[:-2] + ['gene', 'fc']
        stacked.append(df)
manual_3d_fc = pd.concat(stacked)
manual_3d_fc

Unnamed: 0,disease,case,roi,dtype,dtype_identifier,transition_id,gene,fc
0,breast,HT704B1,roi1,xenium,xenium_0,DCIS_MI,ABCC11,-0.246402
1,breast,HT704B1,roi1,xenium,xenium_0,MI_IDC,ABCC11,0.015119
2,breast,HT704B1,roi1,xenium,xenium_0,DCIS_MI,ACE2,
3,breast,HT704B1,roi1,xenium,xenium_0,MI_IDC,ACE2,
4,breast,HT704B1,roi1,xenium,xenium_0,DCIS_MI,ACKR1,
...,...,...,...,...,...,...,...,...
947,prostate,HT891Z1,roi2,xenium,xenium_0,HG PIN_GP3,VSIG4,
948,prostate,HT891Z1,roi2,xenium,xenium_0,Normal_HG PIN,VWA5A,-0.306108
949,prostate,HT891Z1,roi2,xenium,xenium_0,HG PIN_GP3,VWA5A,
950,prostate,HT891Z1,roi2,xenium,xenium_0,Normal_HG PIN,VWF,


## save them all

In [120]:
directory = Path('/data/estorrs/mushroom/data/projects/submission_v1/supplement')
directory.mkdir(parents=True, exist_ok=True)
directory

PosixPath('/data/estorrs/mushroom/data/projects/submission_v1/supplement')

In [140]:
auto_2d_stat.to_csv(directory / 'automated_2d_correlations.txt', sep='\t', index=False)
manual_2d_exp.to_csv(directory / 'manual_2d_exp.txt', sep='\t')
manual_2d_stat.to_csv(directory / 'manual_2d_stat.txt', sep='\t', index=False)
manual_3d_exp.to_csv(directory / 'manual_3d_exp.txt', sep='\t', index=False)
manual_3d_fc.to_csv(directory / 'manual_3d_fc.txt', sep='\t', index=False)




## section metadata table

In [246]:
cases_3d = [
    'HT206B1',
    'HT397B1',
    'HT704B1',
    'HT891Z1',
    'HT913Z1',
    'HT339B2-H1',
    'HT565B1-H2',
    'S18-5591-C8',
    'S18-9906'
]

source_root = '/diskmnt/Projects/Users/estorrs/mushroom/data'
target_root = '/data/estorrs/mushroom/data'

def alter_filesystem(config, source_root, target_root):
    for entry in config['sections']:
        for mapping in entry['data']:
            mapping['filepath'] = mapping['filepath'].replace(source_root, target_root)
    
    if 'trainer_kwargs' in config and config['trainer_kwargs']['data_mask'] is not None:
        config['trainer_kwargs']['data_mask'] = config['trainer_kwargs']['data_mask'].replace(source_root, target_root)
        
    return config

case_to_config = {}
for case in cases_3d:
    project_dir = Path(f'/data/estorrs/mushroom/data/projects/submission_v1/{case}')
    config = yaml.safe_load(open(os.path.join(project_dir, 'registered', 'metadata.yaml')))
    config = alter_filesystem(config, source_root, target_root)
    case_to_config[case] = config
    

In [247]:
to_renamed = {line.split('\t')[0]:line.split('\t')[1] for line in """S18-11798	SP003Z1-Fp1
S18-15142	SP004Z1-Fp1
S18-18215	SP005Z1-Fp1
S18-5591	SP006Z1-Fp1
S18-8795	SP007Z1-Fp1
S18-9251	SP008Z1-Fp1
S18-9259	SP009Z1-Fp1
S19-41530	SP010Z1-Fp1
S20-12521	SP011Z1-Fp1
S21-11816	SP012Z1-Fp1
S21-44455	SP013Z1-Fp1
S22-44238	SP014Z1-Fp1
S22-53426	SP015Z1-Fp1
S18-8122	SP016Z1-Fp1
S18-9906	SP017Z1-Fp1""".split('\n')}

In [258]:
def alter_dtype(x):
    if 'batch' in x:
        return 'he'
    if x == 'multiplex':
        return 'codex'
    if x == 'vishd':
        return 'visiumhd'
    return x


data = []
for case, config in case_to_config.items():
    config = case_to_config[case]
    if 'HT' in case:
        case = re.sub(r'^(HT[0-9]+[BZ][0-9]).*$', r'\1', case)
    else:
        case = re.sub(r'^(S[0-9]+-[0-9]+).*$', r'\1', case)

    for entry in config['sections']:
        mapping = entry['data'][0]

        if case in to_renamed:
            renamed_case = to_renamed[case]
            renamed_case = renamed_case.replace('-Fp1', '')
            sid = entry['sid'].replace(case, renamed_case)
        else:
            renamed_case = case
            sid = entry['sid']
        
        data.append([renamed_case, sid, entry['position'], mapping['dtype'], mapping['filepath']])
meta = pd.DataFrame(data=data, columns=['case', 'section_id', 'z_depth', 'dtype', 'filepath'])

meta['dtype'] = [alter_dtype(x) for x in meta['dtype']]
meta['cohort'] = '3d'
meta['disease'] = ['prostate' if 'Z' in case else 'breast' for case in meta['case']]
meta

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
0,HT206B1,HT206B1-U1,0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
1,HT206B1,HT206B1-U4,5,he,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
2,HT206B1,HT206B1-U2,15,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
3,HT206B1,HT206B1-U5,20,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
4,HT206B1,HT206B1-U8,40,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
...,...,...,...,...,...,...,...
228,SP017Z1,SP017Z1-U17,120,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
229,SP017Z1,SP017Z1-U18,125,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
230,SP017Z1,SP017Z1-U20,135,he,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
231,SP017Z1,SP017Z1-U24,155,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate


In [259]:
sids_2d = [x for x in sorted(set(auto_2d_stat['sid'])) if not any([c in x for c in cases_3d])]
sids_2d = [x for x in sids_2d if 'HT268' not in x]

data = []
for sid in sids_2d:
    if 'HT' in sid:
        case = re.sub(r'^(HT[0-9]+[BZ][0-9]).*$', r'\1', sid)
        rest = re.sub(r'^(HT[0-9]+[BZ][0-9])(.*)$', r'\2', sid)
    else:
        case = re.sub(r'^(S[0-9]+-[0-9]+).*$', r'\1', sid)
        rest = re.sub(r'^(S[0-9]+-[0-9]+)(.*)$', r'\2', sid)
        case = to_renamed.get(case, case)
        sid = case
        case = case.replace('-Fp1', '')
    data.append([case, sid, np.nan, 'xenium', None, '2d', 'prostate' if 'Z' in case else 'breast'])
meta_2d = pd.DataFrame(data=data, columns=['case', 'section_id', 'z_depth', 'dtype', 'filepath', 'cohort', 'disease'])
meta = pd.concat((meta, meta_2d))

In [296]:
for x in sids_2d: print(x)

HT591B1-S1H1Fp1Us1_1
HT814Z1-S1H1Fp1Us1_3
HT817Z1-S1H1Fp1Us1_2
HT832Z1-S1H1Fp1U1
HT832Z1U1
HT849Z1-S1H2Fp1U1
HT852Z1-S1H1Fp1U1
S18-11798-A15Us1_1
S18-15142-B17Us2_1
S18-15142Fp1Us1_1
S18-18215-A25Us1_1
S18-8122-B28U1
S18-8795-A10Us2_1
S18-9251-F15U1
S18-9259-B14U1
S19-41530-D1U3
S20-12521-A15U1
S21-11816-A2U3
S21-44455-A1U3
S22-44238-A29U1
S22-53426-A1U3


In [260]:
meta = meta.sort_values(['disease', 'case', 'z_depth'])
meta

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
0,HT206B1,HT206B1-U1,0.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
1,HT206B1,HT206B1-U4,5.0,he,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
2,HT206B1,HT206B1-U2,15.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
3,HT206B1,HT206B1-U5,20.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
4,HT206B1,HT206B1-U8,40.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
...,...,...,...,...,...,...,...
228,SP017Z1,SP017Z1-U17,120.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
229,SP017Z1,SP017Z1-U18,125.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
230,SP017Z1,SP017Z1-U20,135.0,he,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
231,SP017Z1,SP017Z1-U24,155.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate


In [261]:
meta.to_csv(directory / 'section_metadata.txt', sep='\t', index=False)

In [266]:
dtypes = sorted(set(meta['dtype']))
for dtype in dtypes:
    f = meta[meta['dtype']==dtype]
    print(dtype, len(set(f['case'])), f.shape[0])

codex 9 66
cosmx 3 7
he 7 110
visium 4 6
visiumhd 2 4
xenium 25 61


In [267]:
diseases = sorted(set(meta['disease']))
for disease in diseases:
    f = meta[meta['disease']==disease]
    print(disease, len(set(f['case'])), f.shape[0])

breast 6 77
prostate 22 177


In [268]:
cohorts = sorted(set(meta['cohort']))
for cohort in cohorts:
    f = meta[meta['cohort']==cohort]
    print(cohort, len(set(f['case'])), f.shape[0])

2d 19 21
3d 9 233


In [269]:
cohorts = sorted(set(meta['cohort']))
for cohort in cohorts:
    for disease in diseases:
        f = meta[((meta['cohort']==cohort)&(meta['disease']==disease))]
        print(cohort, disease, len(set(f['case'])), f.shape[0])

2d breast 1 1
2d prostate 18 20
3d breast 5 76
3d prostate 4 157


In [271]:
for case in sorted(set(meta['case'])):
    f = meta[meta['case']==case]
    for dtype in dtypes:
        z = f[f['dtype']==dtype]
        print(case, dtype, z.shape[0])

HT206B1 codex 6
HT206B1 cosmx 1
HT206B1 he 3
HT206B1 visium 0
HT206B1 visiumhd 0
HT206B1 xenium 6
HT339B2 codex 1
HT339B2 cosmx 0
HT339B2 he 0
HT339B2 visium 1
HT339B2 visiumhd 0
HT339B2 xenium 0
HT397B1 codex 4
HT397B1 cosmx 0
HT397B1 he 6
HT397B1 visium 2
HT397B1 visiumhd 0
HT397B1 xenium 0
HT565B1 codex 1
HT565B1 cosmx 0
HT565B1 he 0
HT565B1 visium 1
HT565B1 visiumhd 0
HT565B1 xenium 0
HT591B1 codex 0
HT591B1 cosmx 0
HT591B1 he 0
HT591B1 visium 0
HT591B1 visiumhd 0
HT591B1 xenium 1
HT704B1 codex 12
HT704B1 cosmx 4
HT704B1 he 20
HT704B1 visium 0
HT704B1 visiumhd 2
HT704B1 xenium 6
HT814Z1 codex 0
HT814Z1 cosmx 0
HT814Z1 he 0
HT814Z1 visium 0
HT814Z1 visiumhd 0
HT814Z1 xenium 1
HT817Z1 codex 0
HT817Z1 cosmx 0
HT817Z1 he 0
HT817Z1 visium 0
HT817Z1 visiumhd 0
HT817Z1 xenium 1
HT832Z1 codex 0
HT832Z1 cosmx 0
HT832Z1 he 0
HT832Z1 visium 0
HT832Z1 visiumhd 0
HT832Z1 xenium 2
HT849Z1 codex 0
HT849Z1 cosmx 0
HT849Z1 he 0
HT849Z1 visium 0
HT849Z1 visiumhd 0
HT849Z1 xenium 1
HT852Z1 codex 0
HT

In [274]:
for x in sorted(set(meta['case'])): print(x)


HT206B1
HT339B2
HT397B1
HT565B1
HT591B1
HT704B1
HT814Z1
HT817Z1
HT832Z1
HT849Z1
HT852Z1
HT891Z1
HT913Z1
SP003Z1
SP004Z1
SP005Z1
SP006Z1
SP007Z1
SP008Z1
SP009Z1
SP010Z1
SP011Z1
SP012Z1
SP013Z1
SP014Z1
SP015Z1
SP016Z1
SP017Z1


In [307]:
for x in meta[meta['cohort']=='2d']['case'].to_list(): print(x)

HT591B1
HT814Z1
HT817Z1
HT832Z1
HT832Z1
HT849Z1
HT852Z1
SP003Z1
SP004Z1
SP004Z1
SP005Z1
SP007Z1
SP008Z1
SP009Z1
SP010Z1
SP011Z1
SP012Z1
SP013Z1
SP014Z1
SP015Z1
SP016Z1


In [265]:
meta[meta['cohort']=='2d']

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
0,HT591B1,HT591B1-S1H1Fp1Us1_1,,xenium,,2d,breast
1,HT814Z1,HT814Z1-S1H1Fp1Us1_3,,xenium,,2d,prostate
2,HT817Z1,HT817Z1-S1H1Fp1Us1_2,,xenium,,2d,prostate
3,HT832Z1,HT832Z1-S1H1Fp1U1,,xenium,,2d,prostate
4,HT832Z1,HT832Z1U1,,xenium,,2d,prostate
5,HT849Z1,HT849Z1-S1H2Fp1U1,,xenium,,2d,prostate
6,HT852Z1,HT852Z1-S1H1Fp1U1,,xenium,,2d,prostate
7,SP003Z1,SP003Z1-Fp1,,xenium,,2d,prostate
8,SP004Z1,SP004Z1-Fp1,,xenium,,2d,prostate
9,SP004Z1,SP004Z1-Fp1,,xenium,,2d,prostate


## CODEX panel info

In [275]:
meta = pd.read_csv(directory / 'section_metadata.txt', sep='\t')
meta

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
0,HT206B1,HT206B1-U1,0.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
1,HT206B1,HT206B1-U4,5.0,he,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
2,HT206B1,HT206B1-U2,15.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
3,HT206B1,HT206B1-U5,20.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
4,HT206B1,HT206B1-U8,40.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
...,...,...,...,...,...,...,...
249,SP017Z1,SP017Z1-U17,120.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
250,SP017Z1,SP017Z1-U18,125.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
251,SP017Z1,SP017Z1-U20,135.0,he,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
252,SP017Z1,SP017Z1-U24,155.0,xenium,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate


In [276]:
f = meta[meta['dtype']=='codex']
f

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
2,HT206B1,HT206B1-U2,15.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
3,HT206B1,HT206B1-U5,20.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
6,HT206B1,HT206B1-U10,50.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
8,HT206B1,HT206B1-U13,65.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
12,HT206B1,HT206B1-U18,90.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
...,...,...,...,...,...,...,...
230,SP006Z1,SP006Z1-C8-U24,115.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
243,SP017Z1,SP017Z1-U3,10.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
246,SP017Z1,SP017Z1-U10,45.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
250,SP017Z1,SP017Z1-U18,125.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate


In [292]:
# we need to add validation samples

temp = pd.DataFrame(data=[
    ['HT323B1', 'HT323B1-H1A4', np.nan, 'codex', '/data/estorrs/sandbox/mushroom/codex/HT323B1-H1A4__20220210.ome.tiff', 'validation', 'breast'],
    ['HT110B1', 'HT110B1-S1H4', np.nan, 'codex', '/data/estorrs/sandbox/mushroom/codex/HT110B1-S1H4__20221216.ome.tiff', 'validation', 'breast'],
], columns=['case', 'section_id', 'z_depth', 'dtype', 'filepath', 'cohort', 'disease'])
f = pd.concat((f, temp))
f
                    

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease
0,HT206B1,HT206B1-U2,15.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
1,HT206B1,HT206B1-U5,20.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
2,HT206B1,HT206B1-U10,50.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
3,HT206B1,HT206B1-U13,65.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
4,HT206B1,HT206B1-U18,90.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast
...,...,...,...,...,...,...,...
63,SP017Z1,SP017Z1-U10,45.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
64,SP017Z1,SP017Z1-U18,125.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
65,SP017Z1,SP017Z1-U25,160.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate
0,HT323B1,HT323B1-H1A4,,codex,/data/estorrs/sandbox/mushroom/codex/HT323B1-H...,validation,breast


In [293]:
channel_xs = []
for i, row in f.iterrows():
    channels = multiplex.get_ome_tiff_channels(row['filepath'])
    channel_xs.append(channels)

max_n = np.max([len(x) for x in channel_xs])

tail = np.empty((len(channel_xs), max_n), dtype=object)

for i, channels in enumerate(channel_xs):
    tail[i, :len(channels)] = channels

cols = [f'channel_{i}' for i in range(max_n)]
tail = pd.DataFrame(data=tail, columns=cols)

f.index = list(range(f.shape[0]))

codex_table = pd.concat((f, tail), axis=1)
codex_table

Unnamed: 0,case,section_id,z_depth,dtype,filepath,cohort,disease,channel_0,channel_1,channel_2,...,channel_28,channel_29,channel_30,channel_31,channel_32,channel_33,channel_34,channel_35,channel_36,channel_37
0,HT206B1,HT206B1-U2,15.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast,DAPI,Granzyme B,Keratin 5,...,Vimentin,CD31,E-cadherin,SMA (D),cd11b,PR (D),CD45 (D),ER,COX6C (D),
1,HT206B1,HT206B1-U5,20.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast,DAPI,Granzyme B,Keratin 5,...,Vimentin,CD3e,E-cadherin,SMA (D),cd11b,PR (D),CD45 (D),CD31,GATA3 (D),COX6C (D)
2,HT206B1,HT206B1-U10,50.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast,DAPI,Granzyme B,Keratin 5,...,Vimentin,CD31,E-cadherin,SMA (D),cd11b,PR (D),CD45 (D),ER,COX6C (D),
3,HT206B1,HT206B1-U13,65.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast,DAPI,Granzyme B,Keratin 5,...,Vimentin,CD3e,E-cadherin,SMA (D),cd11b,PR (D),CD45 (D),CD31,GATA3 (D),COX6C (D)
4,HT206B1,HT206B1-U18,90.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,breast,DAPI,Granzyme B,Keratin 5,...,Vimentin,CD31,E-cadherin,SMA (D),cd11b,PR (D),CD45 (D),ER,COX6C (D),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,SP017Z1,SP017Z1-U10,45.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate,DAPI,CK14,LYVE1,...,,,,,,,,,,
64,SP017Z1,SP017Z1-U18,125.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate,DAPI,CK14,LYVE1,...,,,,,,,,,,
65,SP017Z1,SP017Z1-U25,160.0,codex,/data/estorrs/mushroom/data/projects/submissio...,3d,prostate,DAPI,CK14,LYVE1,...,,,,,,,,,,
66,HT323B1,HT323B1-H1A4,,codex,/data/estorrs/sandbox/mushroom/codex/HT323B1-H...,validation,breast,DAPI,CD31,E-cadherin,...,CD36,CK14,CD11c,CD44,,,,,,


In [294]:
codex_table.to_csv(directory / 'codex_metadata.txt', sep='\t', index=False)

## ROI metadata

In [302]:
data = []
for case, d in case_to_roi_to_data.items():
    for roi, dat in d.items():
        rids, sids, rid_to_patho = dat['rids'], dat['sids'], dat['rid_to_patho']
        for i, (rid, sid) in enumerate(zip(rids, sids)):
            annot = rid_to_patho[rid]
            data.append([case, roi, rid, sid, i + 1, annot])
region_3d_meta = pd.DataFrame(data=data, columns=['case', 'roi', 'region_id', 'section_id', 'region_index', 'pathology_annotation'])
region_3d_meta['pathology_annotation'] = [x.replace('IDC - partial', 'MI') for x in region_3d_meta['pathology_annotation']]
region_3d_meta
        

Unnamed: 0,case,roi,region_id,section_id,region_index,pathology_annotation
0,HT704B1,roi1,b28d28d2-098a-41aa-8a99-24f8c8753897,HT704B1-U1,1,DCIS
1,HT704B1,roi1,efac94bc-f48c-410e-a57d-3cfccc9f95f5,HT704B1-U2,2,DCIS
2,HT704B1,roi1,e1967bc8-a7b8-4c08-94f1-20963ffb9e7d,HT704B1-U11,3,DCIS
3,HT704B1,roi1,bdb3ea48-34e3-43e8-b8b6-042ab38be524,HT704B1-U12,4,DCIS
4,HT704B1,roi1,25f59172-9dc6-4340-a21f-2a2b557176cf,HT704B1-U14,5,DCIS
...,...,...,...,...,...,...
540,HT913Z1,roi1,f0b857f5-260b-4245-b625-664515bdbdf7,HT913Z1-U108,65,Normal
541,HT913Z1,roi1,707b6831-fa35-423f-b35c-86fbf4f2b14f,HT913Z1-U110,66,Normal
542,HT913Z1,roi1,a278652c-7add-4781-9926-959de4f531fd,HT913Z1-U111,67,Normal
543,HT913Z1,roi1,19b3be03-9e23-477e-a412-b9e36b31dac0,HT913Z1-U112,68,Normal


In [303]:
region_3d_meta.to_csv(directory / 'region_3d_metadata.txt', sep='\t', index=False)

## 2D DEG correlation

In [None]:
epi_genes = [
    'MGP',
    'FLRT3',
    'PLAT',
    'CD44',
    'KRT8',
    'ANPEP',
    'GDF15',
    'ALDH1A3',
    'FASN',
    'EPHA6',
]

tme_genes = [
    'SPP1',
    'MS4A6A',
    'MFAP5',
    'ASPN'
]



## supporting counts

In [53]:
genes = """MGP
FLRT3
PLAT
CD44
KRT8
ANPEP
GDF15
ALDH1A3
FASN
EPHA6
SPP1
MS4A6A
MFAP5
ASPN
TCIM
TFF1
MKI67
HMGCS2
VEGFA
NTN4
SOX9
FASN
DUOXA1
UPK1A
SERPINA3
CPLX3
TCIM
SORL
FHL2
CP 
CAMK2N1
GFRA1
MYC
CEACAM20
GLUL
LGALS1
SLC40A1
mgp
plat/tpa
ki67
cd44""".split('\n')
genes

['MGP',
 'FLRT3',
 'PLAT',
 'CD44',
 'KRT8',
 'ANPEP',
 'GDF15',
 'ALDH1A3',
 'FASN',
 'EPHA6',
 'SPP1',
 'MS4A6A',
 'MFAP5',
 'ASPN',
 'TCIM',
 'TFF1',
 'MKI67',
 'HMGCS2',
 'VEGFA',
 'NTN4',
 'SOX9',
 'FASN',
 'DUOXA1',
 'UPK1A',
 'SERPINA3',
 'CPLX3',
 'TCIM',
 'SORL',
 'FHL2',
 'CP ',
 'CAMK2N1',
 'GFRA1',
 'MYC',
 'CEACAM20',
 'GLUL',
 'LGALS1',
 'SLC40A1',
 'mgp',
 'plat/tpa',
 'ki67',
 'cd44']

In [68]:
pos_epi = ['MGP',
 'FLRT3',
 'PLAT',
 'TFF1',
 'HMGCS2',
 'VEGFA',
 'NTN4',
 'SOX9',
 'DUOXA1',
 'UPK1A',
 'CPLX3',
 'SORL',
 'FHL2',
 'CP ',
 'CAMK2N1',
 'GFRA1',
 'CEACAM20',
 'SLC40A1',
 'mgp',
 'plat/tpa'
]

neg_epi = [
 'CD44',
 'KRT8',
 'ANPEP',
 'GDF15',
 'ALDH1A3',
 'FASN',
 'EPHA6',
 'TCIM',
 'MKI67',
 'VEGFA',
 'FASN',
 'SERPINA3',
 'CPLX3',
 'TCIM',
 'CAMK2N1',
 'GFRA1',
 'MYC',
 'GLUL',
 'LGALS1',
 'SLC40A1',
 'ki67',
 'cd44']

pos_tme = [
    'MS4A6A',
]

neg_tme = [
    'SPP1',
    'MFAP5',
    'ASPN'
]

In [37]:
df = pd.read_csv(
    '/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/counts_support.txt', sep='\t')
df['gene'] = [x.replace('region_', '').replace('tme_', '') for x in df['feature']]
df

Unnamed: 0,disease,dtype,type,feature,sid_count,case_count,correlation,correlation_abs,direction,gene
0,brca,multiplex,region,region_a-amylase,1,1,-0.100809,0.100809,negative,a-amylase
1,brca,multiplex,region,region_bap1,1,1,-0.077325,0.077325,negative,bap1
2,brca,multiplex,region,region_bca1,6,2,-0.078976,0.078976,negative,bca1
3,brca,multiplex,region,region_cd11b,6,1,0.074927,0.074927,positive,cd11b
4,brca,multiplex,region,region_cd163,4,1,-0.022789,0.022789,negative,cd163
...,...,...,...,...,...,...,...,...,...,...
153673,prad,vishd,region,region_celltypes_Macrophage,1,1,0.256306,0.256306,positive,celltypes_Macrophage
153674,prad,vishd,region,region_celltypes_Mast,0,0,0.025979,0.025979,negative,celltypes_Mast
153675,prad,vishd,region,region_celltypes_Pericyte,1,1,0.136376,0.136376,positive,celltypes_Pericyte
153676,prad,vishd,region,region_celltypes_Smooth Muscle Cells,0,0,-0.000101,0.000101,negative,celltypes_Smooth Muscle Cells


In [62]:
df[df['gene']=='NTN4']

Unnamed: 0,disease,dtype,type,feature,sid_count,case_count,correlation,correlation_abs,direction,gene
prad_cosmx_region_negative_NTN4,prad,cosmx,region,region_NTN4,0,0,,,negative,NTN4
prad_cosmx6k_region_negative_NTN4,prad,cosmx6k,region,region_NTN4,0,0,0.030317,0.030317,negative,NTN4
prad_cosmx_tme_negative_NTN4,prad,cosmx,tme,tme_NTN4,0,0,,,negative,NTN4
prad_cosmx6k_tme_negative_NTN4,prad,cosmx6k,tme,tme_NTN4,0,0,-0.009673,0.009673,negative,NTN4
brca_xenium_region_negative_NTN4,brca,xenium,region,region_NTN4,7,2,0.143625,0.143625,negative,NTN4
brca_xenium5k_region_negative_NTN4,brca,xenium5k,region,region_NTN4,0,0,,,negative,NTN4
brca_xenium_tme_positive_NTN4,brca,xenium,tme,tme_NTN4,8,3,0.243638,0.243638,positive,NTN4
brca_xenium5k_tme_negative_NTN4,brca,xenium5k,tme,tme_NTN4,0,0,,,negative,NTN4
prad_xenium_region_positive_NTN4,prad,xenium,region,region_NTN4,37,19,0.453749,0.453749,positive,NTN4
prad_xenium5k_region_positive_NTN4,prad,xenium5k,region,region_NTN4,5,3,0.62629,0.62629,positive,NTN4


In [59]:
def get_support_df(rtype, direction, genes):
    df.index = [f'{u}_{w}_{x}_{y}_{z}' for u, w, x, y, z in df[['disease', 'dtype', 'type', 'direction', 'gene']].values]
    dtypes = sorted(set(df['dtype']))
    diseases = sorted(set(df['disease']))
    
    data = []
    for gene in genes:
        xs = [gene]
        cols = ['gene']
        for disease in diseases:
            for dtype in dtypes:
                ident = f'{disease}_{dtype}_{rtype}_{direction}_{gene}'
                if ident in df.index:
                    row = df.loc[ident]
                    xs += [row['sid_count'], row['case_count'], row['correlation']]
                else:
                    xs += [np.nan, np.nan, np.nan]
                prefix = f'{disease}|{dtype}'
                cols += [f'{prefix}|{x}' for x in ['sid_count', 'case_count', 'correlation']]
        data.append(xs)
    
    support = pd.DataFrame(data=data, columns=cols)
    return support

In [63]:
rtype, direction = 'region', 'positive'
support = get_support_df(rtype, direction, pos_epi)

In [64]:
support.to_csv(f'/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/key_genes_support_{direction}_{rtype}.txt',
               sep='\t', index=False)

In [66]:
rtype, direction = 'region', 'negative'
support = get_support_df(rtype, direction, neg_epi)

In [67]:
support.to_csv(f'/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/key_genes_support_{direction}_{rtype}.txt',
               sep='\t', index=False)

In [69]:
rtype, direction = 'tme', 'positive'
support = get_support_df(rtype, direction, pos_tme)

In [70]:
support.to_csv(f'/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/key_genes_support_{direction}_{rtype}.txt',
               sep='\t', index=False)

In [71]:
rtype, direction = 'tme', 'negative'
support = get_support_df(rtype, direction, neg_tme)

In [72]:
support.to_csv(f'/data/estorrs/mushroom/data/projects/submission_v1/analysis/region_characterization/results/key_genes_support_{direction}_{rtype}.txt',
               sep='\t', index=False)