In [1]:
import json
import os
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import tifffile
from scipy.ndimage import binary_fill_holes
from skimage.measure import regionprops_table, regionprops
from skimage.morphology import binary_erosion, remove_small_objects, binary_dilation, label

from mip.utils import listfiles, extract_ome_tiff, R_CHANNEL_MAPPING

In [2]:
from skimage.filters import gaussian

## In-house CODEX

In [3]:
# Root directory for everything this notebook writes; created on first run.
out_dir = '/diskmnt/Projects/Users/estorrs/multiplex_data/analysis/brca_dcis_v2/preprocessing/htan/'
os.makedirs(out_dir, exist_ok=True)

#### make spatial features

In [4]:
# Sorted paths under the HTAN CODEX directory that match the level_4 .h5ad regex.
adata_fps = sorted(listfiles('/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan',
                         regex='level_4/.*h5ad$'))

In [5]:

# Keep only the paths that contain none of the excluded substrings.
excluded_tags = ('.ome.tiff', 'cell_annotation', 'metacluster', 'spatial_features')
adata_fps = [fp for fp in adata_fps if not any(tag in fp for tag in excluded_tags)]
adata_fps

['/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/021323_BRCA_HT365B1_S1H1/level_4/HT365B1_S1H1_02132023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/021723_BCRA_HT271B1_S1H3A5/level_4/HT271B1_S1H3A5_02172023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/031623_BRCA_HT397B1-U2/level_4/HT397B1_U2_03162023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/03172023_BRCA_HT397B1-U12/level_4/HT397B1_U12_03172023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/040623_BRCA_HT397B1-U31/level_4/HT397B1_S1H1A3U31_04062023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/041223_BRCA_HT397B1-S1H3A1-U22/level_4/HT397B1_S1H1A3U22_04122023.h5ad',
 '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/041923_BRCA_HT171B1-S1H9A1-4_top_HT243B1-S1H4A4_HT271B1-S1H6A5_HT308B1-S1H5A4_btm/level_4/HT243B1-S1H4A4_04192023.h5ad',
 '/diskmnt/Projects/Users/estorrs/mul

In [6]:
# Sanity check: number of AnnData paths left after filtering.
len(adata_fps)

45

In [7]:
# NOTE: this replaces the discovered/filtered list above with an explicit,
# hand-picked subset of nine files; everything below operates on this subset only.
adata_fps = [
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/042623_MULTIPLE_HTAN_HT413C1-K2_TOP_HT339B2-H1_HT553P1-H2_HT565B1-H2_BTM/level_4/HT413C1-K2_04262023.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/042623_MULTIPLE_HTAN_HT413C1-K2_TOP_HT339B2-H1_HT553P1-H2_HT565B1-H2_BTM/level_4/HT553P1-H2_04262023.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/HT206B1-S1H2L4_and_HT427P1_S1H1A3/level_4/HT427P1_S1H1A3_20221020.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/041323_CRC_HT488C1-Th1K1Fp1-U2/level_4/HT488C1-Th1K1Fp1-U2_041323.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/041823_CRC_HT488C1-Th1K1Fp1-U14/level_4/HT488C1-Th1K1Fp1-U14_041823.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/20230304_Liver_mCRC_SenNet_HT342C1_HT347C1/level_4/HT342C1_Th1K4A1_section1_03042022.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/20230304_Liver_mCRC_SenNet_HT342C1_HT347C1/level_4/HT342C1_Th1K4A1_section2_03042022.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/20230304_Liver_mCRC_SenNet_HT342C1_HT347C1/level_4/HT347C1_Th1K2A1_section1_03042022.h5ad',
    '/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/crc/20230304_Liver_mCRC_SenNet_HT342C1_HT347C1/level_4/HT347C1_Th1K2A1_section2_03042022.h5ad'
]

In [8]:
# Load the first AnnData file and display it to inspect the available obs/uns/obsm fields.
a = sc.read_h5ad(adata_fps[0])
a

AnnData object with n_obs × n_vars = 128858 × 33
    obs: 'row', 'col', 'bbox-r1', 'bbox-c1', 'bbox-r2', 'bbox-c2', 'area', 'DAPI_intensity', 'CD8_intensity', 'Her2_intensity', 'GATA3 (D)_intensity', 'cKit_intensity', 'Pan-Cytokeratin_intensity', 'GLUT1_intensity', 'Podoplanin_intensity', 'CD68_intensity', 'HLA-DR_intensity', 'CK14_intensity', 'TFF1 (D)_intensity', 'MGP_intensity', 'a-Amylase_intensity', 'SMA_intensity', 'CD3e_intensity', 'Ki67_intensity', 'Vimentin_intensity', 'CD11b_intensity', 'PR_intensity', 'Bap1_intensity', 'CD45_intensity', 'ER_intensity', 'CD31_intensity', 'COX6c_intensity', 'CK19_intensity', 'CK17_intensity', 'PLAT/tPA_intensity', 'CTLA4_intensity', 'Hep-Par-1 (D)_intensity', 'CD4_intensity', 'SLC39A6_intensity', 'CK5_intensity', 'cell_type'
    uns: 'gating_strategy', 'thresholds'
    obsm: 'spatial'

In [14]:
# Build one table per sample holding each cell's coordinates and cell type.
# `sample_to_adata` maps sample id -> path of the .h5ad file (not the loaded object).
sample_to_feats = {}
sample_to_adata = {}
for fp in adata_fps:
    # Sample id is the file name without the '.h5ad' extension.
    sample_id = fp.split('/')[-1].replace('.h5ad', '')
    sample_to_adata[sample_id] = fp
    a = sc.read_h5ad(fp)
    # rename / rename_axis return a new frame, so neither the columns nor the
    # index of `a.obs` are relabelled as a side effect.
    feats = (
        a.obs[['col', 'row', 'cell_type']]
        .rename(columns={'col': 'x', 'row': 'y'})
        .rename_axis('cell_id')
    )
    sample_to_feats[sample_id] = feats

In [15]:
# List the sample ids that have a feature table.
for sample_id in sample_to_feats:
    print(sample_id)

HT413C1-K2_04262023
HT553P1-H2_04262023
HT427P1_S1H1A3_20221020
HT488C1-Th1K1Fp1-U2_041323
HT488C1-Th1K1Fp1-U14_041823
HT342C1_Th1K4A1_section1_03042022
HT342C1_Th1K4A1_section2_03042022
HT347C1_Th1K2A1_section1_03042022
HT347C1_Th1K2A1_section2_03042022


In [16]:
# Sanity check: one feature table per input file is expected.
len(sample_to_feats)

9

In [17]:
# Write each sample's feature table as a tab-separated file in <out_dir>/spatial_features.
out = os.path.join(out_dir, 'spatial_features')
os.makedirs(out, exist_ok=True)
for sample_id, sample_feats in sample_to_feats.items():
    sample_feats.to_csv(os.path.join(out, f'{sample_id}.txt'), sep='\t')

In [18]:
# For every sample, map marker name -> threshold, keeping only thresholds > 0.
# Assumes a.uns['thresholds'] is ordered like a.var.index — TODO confirm.
sample_to_thresholds = {}
for sample_id, adata_fp in sample_to_adata.items():
    adata = sc.read_h5ad(adata_fp)
    # Marker names are the var names with the '_fraction' suffix removed.
    marker_names = [name.replace('_fraction', '') for name in adata.var.index.to_list()]
    sample_to_thresholds[sample_id] = {
        marker: value
        for marker, value in zip(marker_names, adata.uns['thresholds'])
        if value > 0
    }

In [19]:
# Display the collected per-sample marker thresholds.
sample_to_thresholds

{'HT413C1-K2_04262023': {'CD8': 12.0,
  'Pan-Cytokeratin': 15.0,
  'Podoplanin': 14.0,
  'CD68': 254.0,
  'HLA-DR': 21.0,
  'SMA': 10.0,
  'CD3e': 39.0,
  'CD11b': 254.0,
  'CD45': 254.0,
  'CD31': 10.0,
  'CD4': 19.0},
 'HT553P1-H2_04262023': {'CD8': 10.0,
  'Pan-Cytokeratin': 11.0,
  'Podoplanin': 13.0,
  'CD68': 254.0,
  'HLA-DR': 29.0,
  'SMA': 19.0,
  'CD3e': 10.0,
  'CD11b': 254.0,
  'CD45': 10.0,
  'CD31': 10.0,
  'CD4': 24.0},
 'HT427P1_S1H1A3_20221020': {'CD31': 19.0,
  'CD4': 39.0,
  'CD8': 39.0,
  'Pan-Cytokeratin': 20.0,
  'Podoplanin': 20.0,
  'CD3e': 39.0,
  'SMA': 15.0,
  'CD68': 254.0,
  'CD20': 254.0,
  'CD11c': 254.0,
  'HLA-DR': 20.0,
  'CD163': 15.0,
  'FOXP3': 254.0},
 'HT488C1-Th1K1Fp1-U2_041323': {'Podoplanin': 19.0,
  'Pan-Cytokeratin': 20.0,
  'HLA-DR': 20.0,
  'CD8': 10.0,
  'SMA': 14.0,
  'CD45': 254.0,
  'CD3e': 39.0,
  'E-cadherin': 20.0,
  'CD11b': 20.0,
  'CD68': 254.0,
  'CD31': 15.0,
  'FOXP3': 15.0},
 'HT488C1-Th1K1Fp1-U14_041823': {'Podoplanin': 24.0,

In [20]:
# Persist the per-sample thresholds as JSON; the context manager guarantees the
# file is flushed and closed once the dump finishes.
with open(os.path.join(out_dir, 'thresholds.json'), 'w') as f:
    json.dump(sample_to_thresholds, f)

#### make initial region masks

In [21]:
# ome_fps = sorted(listfiles('/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan/brca/',
#                          regex='/level_2/[^/]+.ome.tiff$'))
# Dots are escaped so that only file names literally ending in ".ome.tiff" match
# (an unescaped "." matches any character).
ome_fps = sorted(listfiles('/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan',
                         regex=r'/level_2/[^/]+\.ome\.tiff$'))
# ome_fps = [fp for fp in ome_fps if any([s in fp for s in sample_to_feats.keys()])]
# Key each OME-TIFF path by its file name with '.ome.tiff' removed.
sample_to_ome = {fp.split('/')[-1].replace('.ome.tiff', ''):fp for fp in ome_fps}

In [22]:
# adata_fps = sorted(listfiles('/diskmnt/Projects/Users/estorrs/multiplex_data/codex/htan',
#                          regex='level_4/.*h5ad$'))
# adata_fps = [fp for fp in adata_fps
#              if '.ome.tiff' not in fp
#              if 'cell_annotation' not in fp
#              if 'metacluster' not in fp
#              if 'spatial_features' not in fp]

# sample_to_adata = {fp.split('/')[-1].replace('.h5ad', ''):fp for fp in adata_fps}

In [23]:
# Sample ids derived from the OME-TIFF file names (compare with the AnnData ids below).
sorted(sample_to_ome.keys())

['20230120_PKD_K1301459',
 '20230120_PKD_K200452_2PB',
 '20230120_PKD_K2200446_4PB',
 'CRC112',
 'E16.5_female_section1_04272023',
 'E16.5_female_section2_04272023',
 'E16.5_male_section1_04272023',
 'E16.5_male_section2_04272023',
 'HT110B1-S1H4',
 'HT171B1-S1H1A1',
 'HT171B1-S1H1A4',
 'HT171B1-S1H9A1-4_04192023',
 'HT171B1-S1H9A1-4_left_05122023',
 'HT171B1-S1H9A1-4_right_05122023',
 'HT171B1_20221205',
 'HT171B1_S1H1A1',
 'HT171B1_S1H1A4',
 'HT171B1_S1H8A1_20221122',
 'HT206B1-H1',
 'HT206B1_Ctrl',
 'HT206B1_H1_06252022',
 'HT206B1_S1H2L4',
 'HT243B1-S1H4',
 'HT243B1-S1H4A4_04192023',
 'HT243B1-S1H4A4_left_05122023',
 'HT243B1-S1H4A4_right_05122023',
 'HT271B1-S1H3A5',
 'HT271B1-S1H6A5_04192023',
 'HT271B1-S1H6A5_left_05122023',
 'HT271B1-S1H6A5_right_05122023',
 'HT271B1_S1H3A5',
 'HT297B1-H1-08042022',
 'HT305B1-S1H1',
 'HT308B1',
 'HT308B1-S1H5A4_04192023',
 'HT308B1-S1H5A4_left_05122023',
 'HT308B1-S1H5A4_right_05122023',
 'HT323B1-H1-08042022',
 'HT323B1-H1A1',
 'HT323B1-H1A4',

In [24]:
# Sample ids derived from the AnnData file names.
sorted(sample_to_adata.keys())

['HT342C1_Th1K4A1_section1_03042022',
 'HT342C1_Th1K4A1_section2_03042022',
 'HT347C1_Th1K2A1_section1_03042022',
 'HT347C1_Th1K2A1_section2_03042022',
 'HT413C1-K2_04262023',
 'HT427P1_S1H1A3_20221020',
 'HT488C1-Th1K1Fp1-U14_041823',
 'HT488C1-Th1K1Fp1-U2_041323',
 'HT553P1-H2_04262023']

In [30]:
# AnnData sample id -> OME-TIFF sample id. Only the last entry differs between
# the two naming schemes; the rest map to themselves.
name_map = {
    'HT342C1_Th1K4A1_section1_03042022': 'HT342C1_Th1K4A1_section1_03042022',
    'HT342C1_Th1K4A1_section2_03042022': 'HT342C1_Th1K4A1_section2_03042022',
    'HT347C1_Th1K2A1_section1_03042022': 'HT347C1_Th1K2A1_section1_03042022',
    'HT347C1_Th1K2A1_section2_03042022': 'HT347C1_Th1K2A1_section2_03042022',
    'HT413C1-K2_04262023': 'HT413C1-K2_04262023',
    'HT488C1-Th1K1Fp1-U14_041823': 'HT488C1-Th1K1Fp1-U14_041823',
    'HT488C1-Th1K1Fp1-U2_041323': 'HT488C1-Th1K1Fp1-U2_041323',
    'HT553P1-H2_04262023': 'HT553P1-H2_04262023',
    'HT427P1_S1H1A3_20221020': 'HT427P1_S1H1A3',
}
# Reverse lookup: OME-TIFF sample id -> AnnData sample id.
r_name_map = dict(zip(name_map.values(), name_map.keys()))

# Re-key the OME-TIFF paths by AnnData sample id; ids without a mapping are kept unchanged.
renamed_ome = {}
for ome_name, ome_fp in sample_to_ome.items():
    renamed_ome[r_name_map.get(ome_name, ome_name)] = ome_fp
sample_to_ome = renamed_ome

In [31]:
# name_map = {
#     'HT110B1_S1H4': 'HT110B1-S1H4',
#     'HT171B1-S1H9A1-4_left_05122023': 'HT171B1-S1H9A1-4_left_05122023',
#     'HT171B1-S1H9A1-4_right_05122023': 'HT171B1-S1H9A1-4_right_05122023',
#     'HT206B1_H1': 'HT206B1-H1',
#     'HT206B1_H1_06252022': 'HT206B1_H1_06252022',
#     'HT206B1_S1H2L4_20221028': 'HT206B1_S1H2L4',
#     'HT243B1-S1H4A4_04192023': 'HT243B1-S1H4A4_04192023',
#     'HT243B1-S1H4A4_left_05122023': 'HT243B1-S1H4A4_left_05122023',
#     'HT243B1-S1H4A4_right_05122023': 'HT243B1-S1H4A4_right_05122023',
#     'HT243B1_S1H4': 'HT243B1-S1H4',
#     'HT271B1-S1H6A5_04192023': 'HT271B1-S1H6A5_04192023',
#     'HT271B1-S1H6A5_left_05122023': 'HT271B1-S1H6A5_left_05122023',
#     'HT271B1-S1H6A5_right_05122023': 'HT271B1-S1H6A5_right_05122023',
#     'HT271B1_S1H3A5_02172023': 'HT271B1_S1H3A5',
#     'HT297B1_H1_08042022': 'HT297B1-H1-08042022',
#     'HT305B1_S1H1': 'HT305B1-S1H1',
#     'HT308B1-S1H5A4_04192023': 'HT308B1-S1H5A4_04192023',
#     'HT308B1-S1H5A4_left_05122023': 'HT308B1-S1H5A4_left_05122023',
#     'HT308B1-S1H5A4_right_05122023': 'HT308B1-S1H5A4_right_05122023',
#     'HT323B1_H1A1': 'HT323B1-H1A1',
#     'HT323B1_H1A4': 'HT323B1-H1A4',
#     'HT323B1_H1_08042022': 'HT323B1-H1-08042022',
#     'HT323B1_H3': 'HT323B1-H3',
#     'HT339B1_H1A1': 'HT339B1-H1A1',
#     'HT339B1_H2A1': 'HT339B1-H2A1',
#     'HT339B1_H4A4': 'HT339B1-H4A4',
#     'HT365B1_S1H1_02132023': 'HT365B1_S1H1',
#     'HT397B1_H2A2': 'HT397B1-H2A2',
#     'HT397B1_H3A1': 'HT397B1-H3A1',
#     'HT397B1_S1H1A3U22_04122023': 'HT397B1_S1H1A3U22_04122023',
#     'HT397B1_S1H1A3U31_04062023': 'HT397B1_S1H1A3U31_04062023',
#     'HT397B1_U12_03172023': 'HT397B1_U12_03172023',
#     'HT397B1_U2_03162023': 'HT397B1_U2_03162023',
#     'HT480B1_S1H2_R001': 'HT480B1-S1H2-R001',
#     'HT480B1_S1H2_R002': 'HT480B1-S1H2-R002',
#     'HT565B1-H2_04262023': 'HT565B1-H2_04262023'
# }
# r_name_map = {v:k for k, v in name_map.items()}

# sample_to_ome = {r_name_map.get(k, k):v for k, v in sample_to_ome.items()}

In [32]:
def generate_mask(mask, sigma=1., min_area=10000):
    """Smooth a mask, fill its holes and drop small connected components.

    Parameters
    ----------
    mask : array
        Input mask image. The bounding-box unpacking below expects four
        values, i.e. a 2-D image.
    sigma : float
        Sigma passed to the Gaussian filter.
    min_area : int
        Labelled regions whose area is below this value are removed.

    Returns
    -------
    Boolean array that is True on every labelled region with
    area >= ``min_area``.

    NOTE(review): the Gaussian output is passed straight to
    ``binary_fill_holes`` without an explicit threshold — confirm that
    treating every non-zero blurred value as foreground is intended.
    """
    smoothed = gaussian(mask, sigma=sigma)
    filled = binary_fill_holes(smoothed)

    labeled = label(filled)
    mask = labeled > 0

    pad = 10  # margin added around each region's bounding box
    small_regions = (region for region in regionprops(labeled) if region.area < min_area)
    for region in small_regions:
        r1, c1, r2, c2 = region.bbox
        window = (slice(max(r1 - pad, 0), r2 + pad),
                  slice(max(c1 - pad, 0), c2 + pad))
        # Clear only the pixels of this region inside the (padded) window;
        # `mask[window]` is a view, so the assignment edits `mask` in place.
        mask[window][labeled[window] == region.label] = 0

    return mask

In [33]:
# Output directory for the per-sample region masks.
out = os.path.join(out_dir, 'region_mask')
os.makedirs(out, exist_ok=True)

In [34]:
# Channels whose thresholded signal is OR-ed together to seed the region mask.
keep = ['Pan-Cytokeratin', 'E-cadherin']
sample_to_mask = {}
for sample in name_map.keys():
    print(sample)
    channel_to_img = extract_ome_tiff(sample_to_ome[sample])
    # Translate channel names through R_CHANNEL_MAPPING; unmapped names are kept as-is.
    channel_to_img = {R_CHANNEL_MAPPING.get(k, k):img for k, img in channel_to_img.items()}

    a = sc.read_h5ad(sample_to_adata[sample])

    # Marker names are the var names with the '_fraction' suffix removed.
    # Assumes a.uns['thresholds'] is ordered like a.var.index — TODO confirm.
    markers = [x.replace('_fraction', '') for x in a.var.index.to_list()]
    thresholds = a.uns['thresholds']
    # Start from an all-False mask shaped like the first channel image.
    mask = np.zeros_like(next(iter(channel_to_img.values())), dtype=bool)
    for c in keep:
        if c in markers:
            val = thresholds[markers.index(c)]
            # Non-positive thresholds are skipped.
            if val > 0:
                mask |= channel_to_img[c] >= val

    mask = generate_mask(mask, sigma=1., min_area=10000)
    sample_to_mask[sample] = mask

    # `imwrite` is tifffile's supported writer; `imsave` is its legacy alias.
    tifffile.imwrite(os.path.join(out, f'{sample}.tif'), mask)

HT342C1_Th1K4A1_section1_03042022
HT342C1_Th1K4A1_section2_03042022
HT347C1_Th1K2A1_section1_03042022
HT347C1_Th1K2A1_section2_03042022
HT413C1-K2_04262023
HT488C1-Th1K1Fp1-U14_041823
HT488C1-Th1K1Fp1-U2_041323
HT553P1-H2_04262023
HT427P1_S1H1A3_20221020
