In [1]:
import os
import re
import pickle
from pathlib import Path
import yaml

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import scipy
import pandas as pd
import torch
import tifffile
from sklearn.metrics import adjusted_rand_score
from einops import rearrange, repeat

In [2]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams['svg.fonttype'] = 'none'

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [5]:
import mushroom.utils as utils
import mushroom.visualization.utils as vis_utils
import mushroom.data.datasets as datasets
import mushroom.data.visium as visium
import mushroom.data.xenium as xenium
import mushroom.data.multiplex as multiplex
import mushroom.data.he as he
import mushroom.registration.bigwarp as bigwarp
from mushroom.mushroom import Mushroom

In [6]:
project_dir = '/data/estorrs/mushroom/data/projects/submission_v1'
output_dir = os.path.join(project_dir, 'analysis', 'dataset_summary')
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [7]:
output_dir

'/data/estorrs/mushroom/data/projects/submission_v1/analysis/dataset_summary'

In [8]:
def alter_filesystem(config, source_root, target_root):
    for entry in config['sections']:
        for mapping in entry['data']:
            mapping['filepath'] = mapping['filepath'].replace(source_root, target_root)
    
    if 'trainer_kwargs' in config and config['trainer_kwargs']['data_mask'] is not None:
        config['trainer_kwargs']['data_mask'] = config['trainer_kwargs']['data_mask'].replace(source_root, target_root)
        
    return config

In [9]:
source_root = '/diskmnt/Projects/Users/estorrs/mushroom/data'
target_root = '/data/estorrs/mushroom/data'

In [15]:
fps = sorted(utils.listfiles(project_dir, regex=r'mushroom_50res/nbhd_volumes.*pkl$'))
fps

['/data/estorrs/mushroom/data/projects/submission_v1/C3L-00970/mushroom_50res/nbhd_volumes_l0.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00970/mushroom_50res/nbhd_volumes_l1.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00970/mushroom_50res/nbhd_volumes_l2.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00982/mushroom_50res/nbhd_volumes_l0.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00982/mushroom_50res/nbhd_volumes_l1.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00982/mushroom_50res/nbhd_volumes_l2.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-01287/mushroom_50res/nbhd_volumes_l0.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-01287/mushroom_50res/nbhd_volumes_l1.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-01287/mushroom_50res/nbhd_volumes_l2.pkl',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-02551/mushroom_50res/nbhd_volumes_l0.pkl',


In [18]:
case_to_level_to_volume = {}
for fp in fps:
    case = fp.split('/')[-3]
    level = int(re.sub(r'^.*nbhd_volumes_l([0-9]+).pkl$', r'\1', fp))
    
    if case not in case_to_level_to_volume:
        case_to_level_to_volume[case] = {}
    case_to_level_to_volume[case][level] = fp

In [19]:
import shutil

In [21]:
# copy to new directory for easy reading locally with napari
for fp in fps:
    case = fp.split('/')[-3]
    level = int(re.sub(r'^.*nbhd_volumes_l([0-9]+).pkl$', r'\1', fp))
    
    shutil.copy(fp, os.path.join(output_dir, 'volumes', f'{case}_l{level}_volumes.pkl'))

In [23]:
fps = sorted(utils.listfiles(project_dir, regex=r'mushroom_50res/config.yaml$'))
fps

['/data/estorrs/mushroom/data/projects/submission_v1/C3L-00970/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-00982/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-01287/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/C3L-02551/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT206B1/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT225C1/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT268B1/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT339B2-H1/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT397B1/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT413C1-K2/mushroom_50res/config.yaml',
 '/data/estorrs/mushroom/data/projects/submission_v1/HT413C1-Th1k4A1/mushroom_50res/co

In [98]:
exclude = ['S18-25943-A7']

In [99]:
case_to_sections = {fp.split('/')[-3]:yaml.safe_load(open(fp))['sections'] for fp in fps}
case_to_sections = {k:v for k, v in case_to_sections.items() if k not in exclude}
case_to_sections.keys()

dict_keys(['C3L-00970', 'C3L-00982', 'C3L-01287', 'C3L-02551', 'HT206B1', 'HT225C1', 'HT268B1', 'HT339B2-H1', 'HT397B1', 'HT413C1-K2', 'HT413C1-Th1k4A1', 'HT448C1-Th1K1Fp1', 'HT553P1-H2', 'HT565B1-H2', 'S18-5591-C8', 'S18-9906', 'WD-76845'])

In [100]:
case_meta = pd.read_csv(os.path.join(output_dir, 'case_metadata.txt'), sep='\t')
case_meta['n_sections'] = [len(case_to_sections[x]) for x in case_meta['case']]
    
case_meta

Unnamed: 0,case,tissue_type,cohort,n_sections
0,C3L-00970,CCRCC,inhouse,2
1,C3L-00982,CCRCC,inhouse,5
2,C3L-01287,CCRCC,inhouse,2
3,C3L-02551,CCRCC,inhouse,5
4,HT206B1,BRCA,inhouse,15
5,HT225C1,CRC,inhouse,4
6,HT268B1,BRCA,inhouse,4
7,HT339B2-H1,BRCA,inhouse,2
8,HT397B1,BRCA,inhouse,12
9,HT413C1-K2,CRC,inhouse,2


In [101]:
case_order = case_meta.sort_values(['tissue_type', 'n_sections'])['case'].to_list()
case_order

['HT339B2-H1',
 'HT565B1-H2',
 'HT268B1',
 'HT397B1',
 'HT206B1',
 'C3L-00970',
 'C3L-01287',
 'C3L-00982',
 'C3L-02551',
 'HT413C1-K2',
 'HT225C1',
 'HT448C1-Th1K1Fp1',
 'HT413C1-Th1k4A1',
 'WD-76845',
 'HT553P1-H2',
 'S18-9906',
 'S18-5591-C8']

In [102]:
data = []
for case, sections in case_to_sections.items():
    for entry in sections:
        dtypes = [mapping['dtype'] for mapping in entry['data']]
        if 'visium' in dtypes:
            data.append([case, entry['sid'], entry['position'], 'visium'])
        else:
            for mapping in entry['data']:
                data.append([case, entry['sid'], entry['position'], mapping['dtype']])
df = pd.DataFrame(data=data, columns=['case', 'section_id', 'position', 'dtype'])
df['dtype'] = [x if 'batch' not in x else 'he' for x in df['dtype']]
df

Unnamed: 0,case,section_id,position,dtype
0,C3L-00970,C3L-00970-U1,0,xenium
1,C3L-00970,C3L-00970-U2,5,multiplex
2,C3L-00982,C3L-00982-U1,0,multiplex
3,C3L-00982,C3L-00982-U2,5,xenium
4,C3L-00982,C3L-00982-U4,15,multiplex
...,...,...,...,...
158,WD-76845,WD-76845-U97,485,multiplex
159,WD-76845,WD-76845-U101,505,he
160,WD-76845,WD-76845-U102,510,multiplex
161,WD-76845,WD-76845-U105,525,he


In [103]:
order = ['he', 'visium', 'multiplex', 'xenium', 'cosmx']
dtype_to_color = {
    k:sns.color_palette()[i] for i, k in enumerate(order)
}
dtype_to_color = {
    k:'#%02x%02x%02x' % tuple([int(x * 255.) for x in v]) for k, v in dtype_to_color.items()
}
domain = order
domain_range = [dtype_to_color[dt] for dt in order]

In [104]:
import altair as alt
alt.Chart(df).mark_tick().encode(
    x=alt.X('case', sort=case_order),
    y='position',
    color=alt.Color('dtype').scale(domain=domain, range=domain_range)
).configure_tick(
    thickness=2.6,
#     bandSize=10,
)

In [105]:
alt.Chart(case_meta).mark_rect().encode(
    x=alt.X('case', sort=case_order),
    color=alt.Color('tissue_type').scale(scheme="set2")
)

In [106]:
alt.Chart(case_meta).mark_rect().encode(
    x=alt.X('case', sort=case_order),
    color=alt.Color('cohort').scale(scheme="dark2")
)

In [107]:
alt.Chart(case_meta).mark_bar().encode(
    x=alt.X('case', sort=case_order),
    y=alt.Y('n_sections'),
#     color=alt.Color('cohort')
)

In [125]:
data = []
n = 20
for case, sections in case_to_sections.items():
    dtypes = [mapping['dtype'] for entry in sections for mapping in entry['data']]
    dtypes = sorted(set([dtype for dtype in dtypes if 'batch' not in dtype]))
    
    step_size = n // len(dtypes)
    print(step_size)
    start = 0
    for dtype in dtypes:
        for i in range(step_size):
            start += i
            data.append([f'{case}_{start}', case_order.index(case), dtype, start])
#             data.append([f'{case}', dtype])
source = pd.DataFrame(data=data, columns=['case', 'case_order', 'dtype', 'idx'])
order = source.sort_values(['case_order', 'dtype'])['case'].to_list()
source

10
6
10
6
6
10
10
6
6
6
5
6
6
6
6
5
10


Unnamed: 0,case,case_order,dtype,idx
0,C3L-00970_0,5,multiplex,0
1,C3L-00970_1,5,multiplex,1
2,C3L-00970_3,5,multiplex,3
3,C3L-00970_6,5,multiplex,6
4,C3L-00970_10,5,multiplex,10
...,...,...,...,...
315,WD-76845_60,13,multiplex,60
316,WD-76845_66,13,multiplex,66
317,WD-76845_73,13,multiplex,73
318,WD-76845_81,13,multiplex,81


In [126]:
order

['HT339B2-H1_0',
 'HT339B2-H1_1',
 'HT339B2-H1_3',
 'HT339B2-H1_6',
 'HT339B2-H1_10',
 'HT339B2-H1_15',
 'HT339B2-H1_15',
 'HT339B2-H1_16',
 'HT339B2-H1_18',
 'HT339B2-H1_21',
 'HT339B2-H1_25',
 'HT339B2-H1_30',
 'HT339B2-H1_30',
 'HT339B2-H1_31',
 'HT339B2-H1_33',
 'HT339B2-H1_36',
 'HT339B2-H1_40',
 'HT339B2-H1_45',
 'HT565B1-H2_0',
 'HT565B1-H2_1',
 'HT565B1-H2_3',
 'HT565B1-H2_6',
 'HT565B1-H2_10',
 'HT565B1-H2_15',
 'HT565B1-H2_15',
 'HT565B1-H2_16',
 'HT565B1-H2_18',
 'HT565B1-H2_21',
 'HT565B1-H2_25',
 'HT565B1-H2_30',
 'HT565B1-H2_30',
 'HT565B1-H2_31',
 'HT565B1-H2_33',
 'HT565B1-H2_36',
 'HT565B1-H2_40',
 'HT565B1-H2_45',
 'HT268B1_0',
 'HT268B1_1',
 'HT268B1_3',
 'HT268B1_6',
 'HT268B1_10',
 'HT268B1_15',
 'HT268B1_21',
 'HT268B1_28',
 'HT268B1_36',
 'HT268B1_45',
 'HT268B1_45',
 'HT268B1_46',
 'HT268B1_48',
 'HT268B1_51',
 'HT268B1_55',
 'HT268B1_60',
 'HT268B1_66',
 'HT268B1_73',
 'HT268B1_81',
 'HT268B1_90',
 'HT397B1_0',
 'HT397B1_1',
 'HT397B1_3',
 'HT397B1_6',
 'HT397B

In [127]:
alt.Chart(source).mark_rect().encode(
    x=alt.X('case', sort=order),
    color=alt.Color('dtype').scale(scheme="dark2")
)

In [11]:
outputs = pickle.load(open(os.path.join(project_dir, 'HT206B1', 'mushroom_50res', 'outputs.pkl'), 'rb'))
outputs.keys()

dict_keys(['section_positions', 'section_ids', 'dtype_to_volume', 'dtype_to_volume_probs', 'dtype_to_clusters', 'dtype_to_cluster_probs', 'dtype_to_cluster_probs_all', 'dtype_to_cluster_intensities', 'dtype_to_cluster_to_agg'])

In [13]:
dtype_to_volume = pickle.load(
    open(os.path.join(project_dir, 'HT206B1', 'mushroom_50res', 'nbhd_volumes_l2.pkl'), 'rb'))
dtype_to_volume.keys()

dict_keys(['probs', 'labeled'])

In [14]:
dtype_to_volume['labeled'].keys()

dict_keys(['he', 'multiplex', 'xenium', 'integrated'])