In [1]:
import itertools
import os
import sys
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.io

from collections import defaultdict
from tqdm.notebook import trange, tqdm, tqdm_notebook
from joblib import Parallel, delayed
import re


In [2]:
# Reload imported modules automatically so that edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# The "data" folder sits one level above the current working directory.
data_dir = Path.cwd().parents[0].joinpath('data').absolute()
data_processed = data_dir.joinpath('processed')
# Absolute Windows-style path (drive Y:) of the raw input folder.
data_raw = r'Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\PLA\HCC827-derived OCT mouse'


# Get info

Here we look at stitched images in all z stacks

In [4]:
# Lookup table: cycle name -> {channel number -> marker label}.
# get_info() uses it to label every image and skips any (cycle, channel)
# combination that is not listed here.
# NOTE(review): the labels are written verbatim into the metadata CSV.
# 'Hoeschst' and 'CylinE' look like misspellings of Hoechst / Cyclin E, but
# renaming them would change saved metadata -- confirm before editing.
markers_map = {
    'cycle1': {
        1: 'Hoeschst', 
        4: 'TEAD1 & YAP1'
    },
    'cycle2': {
        1: 'Hoeschst', 
        4: 'CylinE & CDK2'
    },
    'cycle3': {
        1: 'Hoeschst', 
        4: 'P-ERK & c-MYC'
    },
    'cycle4': {
        1: 'Hoeschst', 
        4: 'p-AKT & mTOR'
    },
    'cycle5': {
        1: 'Hoeschst', 
        4: 'Mcl-1 & BAK'
    },
    'cycle6': {
        1: 'Hoeschst',
        2: 'p-EGFR',
        3: 'Tom20',
        4: 'Ki67'
    },
    'cycle7': {
        1: 'Hoeschst',
        2: 'Pan-cytokeratin',
        3: 'Golph4',
        4: 'Bim'
    },
    'cycle8': {
        1: 'Hoeschst',
        2: 'Concanavalin A',
        3: 'Phalloidin',
        4: 'WGA'
    },
    'cycle9': {
        1: 'Hoeschst',
        2: 'NBD-C6'
    },
}

def get_info(data_raw, marker_dict = markers_map):
    """Collect metadata for every stitched TIFF found under ``data_raw``.

    A file is considered when its name contains both "tif" and "stitched"
    and its directory path does not contain 'defocused'. The metadata is
    parsed from the name of the folder holding the file (split on '_'):

    * first token  -> Timepoint
    * second token -> Cycle; an 'Af' marker sets AfBleach to True and the
      first two characters are stripped from the cycle name
    * last token   -> FOV when it contains 'FW' (Resolution '40X');
      otherwise Resolution is '20X' and FOV is ''

    The channel number is the first character of the second '_'-separated
    token of the file name, converted to int.

    Parameters
    ----------
    data_raw : str or path-like
        Root folder that is walked recursively.
    marker_dict : dict
        Mapping ``cycle -> {channel -> marker label}``. Files whose
        (cycle, channel) pair is missing from it are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns Timepoint, Resolution, FOV, Cycle,
        AfBleach, Channels, Markers and Path.
    """
    columns = [
        "Timepoint", "Resolution", "FOV", "Cycle",
        "AfBleach", "Channels", "Markers", "Path",
    ]
    records = []

    # Loop through image folder
    for (dirpath, dirnames, filenames) in os.walk(data_raw):
        if 'defocused' in dirpath:
            continue

        # basename/normpath follows the platform's path separators instead
        # of assuming a backslash-delimited path.
        folder_tokens = os.path.basename(os.path.normpath(dirpath)).split('_')

        for name in sorted(filenames):
            if "tif" not in name or "stitched" not in name:
                continue

            # Get information from the folder name
            time = folder_tokens[0]
            fov = folder_tokens[-1]
            if 'FW' in fov:
                res = '40X'
            else:
                res = '20X'
                fov = ''

            cycle = folder_tokens[1]
            after_bleach = 'Af' in cycle
            if after_bleach:
                cycle = cycle[2:]

            # Get the channel number from the image name
            ch = int(name.split('_')[1][0])
            try:
                marker = marker_dict[cycle][ch]
            except KeyError:
                # Cycle/channel pair is not listed in the marker table
                continue

            records.append({
                "Timepoint": time,
                "Resolution": res,
                "FOV": fov,
                "Cycle": cycle,
                "AfBleach": after_bleach,
                "Channels": ch,
                "Markers": marker,
                "Path": os.path.join(dirpath, name),
            })

    # Explicit columns keep the layout stable even when nothing matched
    return pd.DataFrame(records, columns=columns)

In [5]:
df_meta_path = data_dir.joinpath('OCT mouse', 'Whole', 'metadata', 'info.csv')

# Create the metadata folder, reporting when it was already present.
try:
    df_meta_path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

# Reuse the cached table if it exists; otherwise scan the raw folder and cache it.
df_exist = df_meta_path.is_file()
if df_exist:
    print('Loaded df')
    df = pd.read_csv(df_meta_path)
else:
    print('Created df')
    df = get_info(data_raw)
    df.to_csv(df_meta_path, index=False)

Folder is already there
Loaded df


In [6]:
# Display the metadata table (one row per image file).
df

Unnamed: 0,Timepoint,Resolution,FOV,Cycle,AfBleach,Channels,Markers,Path
0,1M,20X,,cycle1,True,1,Hoeschst,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
1,1M,20X,,cycle1,True,4,TEAD1 & YAP1,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
2,1M,40X,FW1,cycle1,True,1,Hoeschst,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
3,1M,40X,FW1,cycle1,True,4,TEAD1 & YAP1,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
4,1M,40X,FW2,cycle1,True,1,Hoeschst,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
...,...,...,...,...,...,...,...,...
330,1W,40X,FW1,cycle9,False,2,NBD-C6,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
331,1W,40X,FW2,cycle9,False,1,Hoeschst,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
332,1W,40X,FW2,cycle9,False,2,NBD-C6,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."
333,1W,40X,FW3,cycle9,False,1,Hoeschst,"Y:\coskun-lab\Shuangyi\ERK, YAP project_2022\P..."


# Convert data to hdf5 

Convert the stitched images to HDF5 format.

Each HDF5 file is organized as: File -> Cycle (one dataset per cycle).

Each dataset stores `Channels` and `Markers` as attributes.

In [7]:
import h5py

def save_hdf5(
    path: str, name: str, data: np.ndarray, attr_dict=None, mode: str = "a"
) -> None:
    """Write ``data`` as a gzip-compressed dataset called ``name`` in an HDF5 file.

    If a dataset with that name already exists, nothing is written and a
    message is printed instead.

    Parameters
    ----------
    path : str
        Location of the HDF5 file.
    name : str
        Name of the dataset inside the file.
    data : np.ndarray
        Array to store; the dataset takes its shape and dtype.
    attr_dict : dict, optional
        Key/value pairs stored as attributes of the new dataset.
    mode : str
        Mode passed to ``h5py.File`` (default "a").
    """
    # The context manager closes the file even when creating or writing the
    # dataset raises, so a failed write no longer leaves the handle open.
    with h5py.File(path, mode) as hf:
        if name in hf:
            print(f"Dataset {name} exists")
            return

        dset = hf.create_dataset(
            name,
            shape=data.shape,
            maxshape=data.shape,
            chunks=True,
            dtype=data.dtype,
            compression="gzip",
        )
        dset[:] = data
        if attr_dict is not None:
            for attr_key, attr_val in attr_dict.items():
                dset.attrs[attr_key] = attr_val

def read_img(path):
    """Load the image at ``path`` with ``skimage.io.imread(..., as_gray=True)``."""
    image = skimage.io.imread(path, as_gray=True)
    return image

def joblib_loop(task, pics, n_jobs=20):
    """Apply ``task`` to every item of ``pics`` in parallel with joblib.

    Parameters
    ----------
    task : callable
        Function called with a single item.
    pics : iterable
        Items to process.
    n_jobs : int
        Number of joblib workers; defaults to the previously hard-coded 20.

    Returns
    -------
    list
        Whatever ``Parallel(...)`` returns for the generated calls.
    """
    return Parallel(n_jobs=n_jobs)(delayed(task)(pic) for pic in pics)

In [8]:
df_imgs_path = data_dir / 'OCT mouse' / 'Whole' / 'metadata' / 'imgs.csv'

try:
    df_imgs_path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

temp_path = data_dir / 'OCT mouse' / 'Whole' / 'hdf5' / 'raw'
# Guarded like the metadata folder: the unguarded exist_ok=False call made
# this cell raise FileExistsError on every re-run.
try:
    temp_path.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

# The CSV index doubles as a cache marker: the conversion only runs when the
# index file is missing.
df_exist = df_imgs_path.is_file()

if not df_exist:
    print('Created df')
    # Only the 40X acquisitions are converted
    df_z = df[df.Resolution == '40X']
    group = df_z.groupby(['Timepoint', 'Resolution', 'FOV', 'AfBleach'])
    rows = []

    for name, df_group in tqdm(group, total=len(group)):
        # One HDF5 file per (Timepoint, Resolution, FOV, AfBleach) group
        file_name = '_'.join(np.array(name).astype(str)) + '.hdf5'
        file_path = temp_path / file_name
        rows.append(list(name)+[file_path])

        group_cycle = df_group.groupby('Cycle')
        for cycle, df_cycle in group_cycle:
            channels = df_cycle.Channels.to_list()
            markers = df_cycle.Markers.to_list()
            paths = df_cycle.Path.to_numpy()

            imgs = joblib_loop(read_img, paths)
            imgs = np.array(imgs)
            info = {"Channels": channels, "Markers": markers}

            # One dataset per cycle; its first axis follows the order of
            # `channels` / `markers`, which are stored as attributes.
            save_hdf5(file_path, cycle, imgs, info)
    df_imgs = pd.DataFrame(rows, columns=['Timepoint', 'Resolution', 'FOV', 'AfBleach', 'Path'])
    df_imgs.to_csv(df_imgs_path, index=False)
else:
    print('Loaded df')
    df_imgs = pd.read_csv(df_imgs_path)

Folder is already there
Folder is already there
Loaded df


In [9]:
# Keep only the rows whose AfBleach flag is False.
df_imgs = df_imgs.loc[df_imgs['AfBleach'] == False]

# Save TIFF files

In [10]:
import tifffile as tiff
# Root folder for the per-cycle TIFF files written by the export loop below.
save_path = data_dir / 'OCT mouse' / 'Whole' / 'imgs' / 'raw'

def make_imgs_same_dim(imgs):
    """Crop every array in ``imgs`` to the smallest size of the last two axes.

    Each array is sliced as ``img[:, :a, :b]``: the first axis is kept whole
    and the crop starts at index 0 of the remaining two axes.

    Returns a list with one cropped array per input, in the same order.
    """
    # Smallest extent of each trailing axis across all inputs
    trailing_shapes = np.array([img.shape[1:] for img in imgs])
    min_x, min_y = trailing_shapes.min(axis=0)

    cropped = []
    for img in imgs:
        cropped.append(img[:, :min_x, :min_y])
    return cropped

In [13]:
# For every (Timepoint, FOV) pair, read all datasets of its HDF5 file, crop
# them to a common size and write one TIFF per dataset.
group = df_imgs.groupby(['Timepoint', 'FOV'])
for name, df_group in group:
    # Only the first row of the group supplies the HDF5 path
    path = df_group.iloc[0].Path
             
    # Read images
    cycles = []
    imgs_all = []
    # NOTE: `channels` is filled below but not read again in this cell
    channels = []
    with h5py.File(path, "r") as f:
        for k in tqdm(f.keys(), total=len(f.keys()), leave=False):
            # Cycle label = dataset name up to the first underscore
            cycle = k.split('_')[0]
            channel = f[k].attrs['Channels']

            imgs = f[k][:]
            cycles.append(cycle)
            channels.append(channel)
            imgs_all.append(imgs)
    
    # Crop all stacks to the smallest common size of their last two axes
    imgs_same_shape = make_imgs_same_dim(imgs_all)
    
    for i, imgs in enumerate(imgs_same_shape):
        # One output folder per group, named "<Timepoint>_<FOV>"
        temp_path = save_path / '_'.join(np.array(name).astype(str))
        temp_path.mkdir(parents=True, exist_ok=True)

        # The "001_<cycle>.tif" name matches the '{series}_cycle<N>.tif'
        # pattern used by the registration readers
        file_name = f'001_{cycles[i]}.tif'
        file_path = temp_path / file_name

        # Write image
        tiff.imwrite(file_path, imgs)


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

# Registration

In [18]:
from ashlar import fileseries, thumbnail,reg
import matplotlib.pyplot as plt
from ashlar.scripts.ashlar import process_axis_flip

In [19]:
# Loop all images: register cycles 2-9 of every exported folder against
# cycle 1 and write each cycle as per-channel OME-TIFF files.
imgs_dir = data_dir / 'OCT mouse' / 'Whole' / 'imgs' / 'raw'
save_dir = data_dir / 'OCT mouse' / 'Whole' / 'imgs' / 'registered'
save_dir.mkdir(parents=True, exist_ok=True)


imgs_dir_list = os.listdir(imgs_dir)

for dir_path in tqdm(imgs_dir_list):
    
    # Create reader for each cycle (cycle1 .. cycle9)
    readers = []
    for i in range(1, 10):
        # NOTE(review): width=1 / height=1 presumably describe a single-tile
        # grid per cycle, and pixel_size is taken as given -- confirm against
        # the ashlar documentation and the acquisition settings.
        reader = fileseries.FileSeriesReader(
            str(imgs_dir / dir_path),
            pattern='{series}_cycle'+f'{i}.tif',
            overlap=0.29,
            width=1,
            height=1,
            layout='snake',
            direction='horizontal',
            pixel_size=0.18872, 
        )
        readers.append(reader)
    reader_1 = readers[0]
    
    # Run stitching on the first cycle, using channel 0
    aligner0 = reg.EdgeAligner(reader_1, channel=0, filter_sigma=2, verbose=False,)
    aligner0.run()
    
    # Generate merge image for 1 cycle
    # Parameters
    mosaic_args = {}
    mosaic_args['verbose'] = False

    mosaic = reg.Mosaic(
            aligner0,aligner0.mosaic_shape,**mosaic_args
        )
    writer_class = reg.TiffListWriter
    # '{channel}' is left unformatted in the output name and handed to the writer
    writer = writer_class(
            [mosaic], str(save_dir / (dir_path + '_cycle1_ch{channel}.ome.tif'))
    )
    writer.run()
    
    # Loop through rest of cycles; each one is aligned against the
    # first-cycle aligner and written with the first cycle's mosaic shape
    aligners = list()
    aligners.append(aligner0)

    for j in range(1, 9):
        aligners.append(
            reg.LayerAligner(readers[j], aligners[0], channel=0, filter_sigma=2, verbose=False)
        )
        aligners[j].run()
        mosaic = reg.Mosaic(
            aligners[j], aligners[0].mosaic_shape,**mosaic_args
        )
        # readers[j] holds cycle j+1, hence the j+1 in the file name
        writer = writer_class(
                [mosaic], str(save_dir / (dir_path +'_cycle'+str(j+1)+'_ch{channel}.ome.tif'))
        )
        writer.run()
    

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'Y:\\coskun-lab\\Thomas\\15_PLA\\data\\OCT mouse\\Whole\\imgs\\registered'

# Save data

In [8]:
# Marker lookup for the registered images: cycle name -> {'chN' -> label}.
# This rebinds `markers_map`; the keys are now 'chN' strings, matching the
# 'ch<N>' token that get_info() below parses from each registered file name.
# NOTE(review): the labels end up in the metadata CSV and HDF5 attributes, so
# the apparent misspellings ('Hoeschst', 'CylinE') are left untouched.
markers_map = {
    'cycle1': {
        'ch0': 'Hoeschst', 
        'ch1': 'TEAD1 & YAP1'
    },
    'cycle2': {
        'ch0': 'Hoeschst', 
        'ch1': 'CylinE & CDK2'
    },
    'cycle3': {
        'ch0': 'Hoeschst', 
        'ch1': 'P-ERK & c-MYC'
    },
    'cycle4': {
        'ch0': 'Hoeschst', 
        'ch1': 'p-AKT & mTOR'
    },
    'cycle5': {
        'ch0': 'Hoeschst', 
        'ch1': 'Mcl-1 & BAK'
    },
    'cycle6': {
        'ch0': 'Hoeschst',
        'ch1': 'p-EGFR',
        'ch2': 'Tom20',
        'ch3': 'Ki67'
    },
    'cycle7': {
        'ch0': 'Hoeschst',
        'ch1': 'Pan-cytokeratin',
        'ch2': 'Golph4',
        'ch3': 'Bim'
    },
    'cycle8': {
        'ch0': 'Hoeschst',
        'ch1': 'Concanavalin A',
        'ch2': 'Phalloidin',
        'ch3': 'WGA'
    },
    'cycle9': {
        'ch0': 'Hoeschst',
        'ch1': 'NBD-C6'
    },
}

def get_info(data_raw, marker_dict = markers_map):
    """Collect metadata for the registered TIFF files under ``data_raw``.

    Every file whose name contains "tif" is parsed by splitting the name on
    '_': the tokens are read as Timepoint, FOV, Cycle, and a fourth token
    whose first three characters give the channel key (e.g. 'ch0').

    Parameters
    ----------
    data_raw : str or path-like
        Root folder that is walked recursively.
    marker_dict : dict
        Mapping ``cycle -> {channel key -> marker label}``. Files whose
        (cycle, channel) pair is missing from it are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per image with columns Timepoint, FOV, Cycle, Channels,
        Markers and Path.
    """
    columns = ["Timepoint", "FOV", "Cycle", "Channels", "Markers", "Path"]
    records = []

    # Loop through image folder
    for (dirpath, dirnames, filenames) in os.walk(data_raw):
        for name in sorted(filenames):
            if "tif" not in name:
                continue

            # Get information from image name
            n_split = name.split('_')
            time = n_split[0]
            fov = n_split[1]
            cycle = n_split[2]
            ch = n_split[3][:3]

            try:
                marker = marker_dict[cycle][ch]
            except KeyError:
                # Cycle/channel pair is not listed in the marker table
                continue

            records.append({
                "Timepoint": time,
                "FOV": fov,
                "Cycle": cycle,
                "Channels": ch,
                "Markers": marker,
                "Path": os.path.join(dirpath, name),
            })

    # Explicit columns keep the layout stable even when nothing matched
    return pd.DataFrame(records, columns=columns)


def get_min(imgs):
    """Return the per-axis minimum of the shapes of ``imgs`` as a NumPy array."""
    # One row per image, one column per axis
    all_shapes = np.array([img.shape for img in imgs])
    smallest = all_shapes.min(axis=0)
    return smallest

In [10]:
# From here on `data_raw` points at the registered images, and `df` is
# replaced by the metadata table of those registered files.
data_raw = data_dir.joinpath('OCT mouse', 'Whole', 'imgs', 'registered')
df_meta_path = data_dir.joinpath('OCT mouse', 'Whole', 'metadata', 'info_sti.csv')

# Create the metadata folder, reporting when it was already present.
try:
    df_meta_path.parent.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

# Reuse the cached table if it exists; otherwise scan the folder and cache it.
df_exist = df_meta_path.is_file()
if df_exist:
    print('Loaded df')
    df = pd.read_csv(df_meta_path)
else:
    print('Created df')
    df = get_info(data_raw)
    df.to_csv(df_meta_path, index=False)

Folder is already there
Loaded df


In [48]:
# Index file for the registered HDF5 stacks; its presence doubles as the
# "already converted" marker for this cell.
df_imgs_path = data_dir / 'OCT mouse' / 'Whole' / 'metadata' / 'imgs_reg.csv'

temp_path =data_dir /'OCT mouse' / 'Whole' / 'hdf5' / 'registered'
try:
    temp_path.mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

df_exist = df_imgs_path.is_file()

if not df_exist:
    print('Created df')
    group = df.groupby(['Timepoint','FOV'])
    rows = []

    for name, df_group in tqdm(group, total=len(group)):
        # One HDF5 file per (Timepoint, FOV) pair
        file_name = '_'.join(np.array(name).astype(str)) + '.hdf5'
        file_path = temp_path / file_name
        rows.append(list(name)+[file_path])
        
        # Files written by an earlier (possibly interrupted) run are kept as-is
        if file_path.exists():
            continue
        
        channels = df_group.Channels.to_list()
        cycles = df_group.Cycle.to_list()
        markers = df_group.Markers.to_list()
        paths = df_group.Path.to_numpy()
            
        imgs = joblib_loop(read_img, paths)
        # Crop every image to the smallest common shape so they can be stacked
        min_shape = get_min(imgs)
        imgs_cropped = np.array([img[:min_shape[0], :min_shape[1]] for img in imgs])
        info = {"Cycle": cycles, "Channel": channels, "Marker": markers}
            
        # Single 'imgs' dataset; the Cycle/Channel/Marker attributes list, in
        # order, what each image along the first axis is
        save_hdf5(file_path, 'imgs', imgs_cropped, info)
    df_imgs = pd.DataFrame(rows, columns=['Timepoint', 'FOV', 'Path'])        
    df_imgs.to_csv(df_imgs_path, index=False)
else:
    print('Loaded df')
    df_imgs = pd.read_csv(df_imgs_path)

Folder is already there
Created df


  0%|          | 0/5 [00:00<?, ?it/s]

# Create segmentation training set

In [49]:
import tensorflow as tf 
from skimage import exposure, util

def random_crop(image, NEW_IMG_HEIGHT, NEW_IMG_WIDTH, seed=None):
    """Randomly crop ``image`` to ``[3, NEW_IMG_HEIGHT, NEW_IMG_WIDTH]``.

    Parameters
    ----------
    image : array-like or tensor
        Input passed to ``tf.image.random_crop``; the requested crop size
        has 3 entries along the first axis.
    NEW_IMG_HEIGHT, NEW_IMG_WIDTH : int
        Size of the crop along the second and third axes.
    seed : int, optional
        Forwarded to ``tf.image.random_crop`` so crops can be made
        repeatable (subject to TensorFlow's seeding rules). The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The tensor returned by ``tf.image.random_crop``.
    """
    cropped_image = tf.image.random_crop(
        image, size=[3, NEW_IMG_HEIGHT, NEW_IMG_WIDTH], seed=seed)

    return cropped_image

def contrast_str(img, n_min=0.01, n_max=99.95):
    """Percentile-based contrast stretch followed by conversion to 8-bit.

    The ``n_min`` and ``n_max`` percentiles of ``img`` are used as the input
    range for ``exposure.rescale_intensity``; the rescaled image is then
    passed through ``util.img_as_ubyte`` and returned.
    """
    low, high = np.percentile(img, (n_min, n_max))
    stretched = exposure.rescale_intensity(img, in_range=(low, high))
    return util.img_as_ubyte(stretched)

def joblib_loop(task, pics, n_jobs=20):
    """Apply ``task`` to every item of ``pics`` in parallel with joblib.

    This redefines the helper of the same name from the HDF5 conversion
    section of the notebook.

    Parameters
    ----------
    task : callable
        Function called with a single item.
    pics : iterable
        Items to process.
    n_jobs : int
        Number of joblib workers; defaults to the previously hard-coded 20.

    Returns
    -------
    list
        Whatever ``Parallel(...)`` returns for the generated calls.
    """
    return Parallel(n_jobs=n_jobs)(delayed(task)(pic) for pic in pics)

In [54]:
# Marker labels selected as the "cyto" channel when building the
# segmentation images below (earlier two-marker variant kept for reference).
# cyto_markers = ['p-EGFR', 'Pan-cytokeratin']
cyto_markers = ['Pan-cytokeratin']

In [56]:
# Output folders: full-size composites and the random crops taken from them.
whole_seg_path = data_dir / 'OCT mouse' / 'Whole' / 'imgs' / 'segmentation'
crop_seg_path =  data_dir / 'OCT mouse' / 'Whole' / 'imgs' / 'training_seg'

try:
    whole_seg_path .mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

try:
    crop_seg_path .mkdir(parents=True, exist_ok=False)
except FileExistsError:
    print("Folder is already there")

# Number of random crops written per image
N_crop = 10

for row in df_imgs.itertuples():
    # Read image
    path = row.Path
    with h5py.File(path, "r") as f:
        imgs = f['imgs'][:]
        markers = f['imgs'].attrs['Marker']
    
    # Get DAPI and cyto images
    # NOTE(review): imgs[0] is assumed to be the nuclear stain -- confirm that
    # the first stored image is always the 'Hoeschst' channel.
    indices = np.isin(markers, cyto_markers)
    img_dapi = imgs[0]
    imgs_cyto = imgs[indices,:]
    
    # Contrast stretching and combine to rgb image
    img_dapi = contrast_str(img_dapi)
    # img_cyto = contrast_str(imgs_cyto[0], n_max=99)
    # imgs_cyto_scaled = [contrast_str(imgs_cyto[0],n_max=99.5), contrast_str(imgs_cyto[1])]
    # img_cyto = np.max(np.array(imgs_cyto_scaled), axis=0)
    # Only the first image matching cyto_markers is used
    img_cyto = contrast_str(imgs_cyto[0],n_max=99.)
    # Channel order along axis 0: empty, cyto, DAPI
    img_rgb = np.stack([np.zeros(img_dapi.shape),img_cyto, img_dapi], axis=0).astype(np.uint8)
    
    # Save the full composite; row[1:3] are the first two columns
    # (Timepoint, FOV) of the itertuples row
    file_name = f'{"_".join(row[1:3])}.tif'
    file_path = whole_seg_path / file_name
    tiff.imwrite(file_path, img_rgb)
    
    # Write N_crop random 1000 x 1000 crops of the composite
    for i in range(N_crop):
        img_cropped = random_crop(img_rgb, 1000, 1000).numpy().astype(np.uint8)
        
        file_name = f'{"_".join(row[1:3])}_{i}.tif'
        file_path = crop_seg_path / file_name
        tiff.imwrite(file_path, img_cropped)


Folder is already there
Folder is already there
