In [None]:
import re
import os 
import sys 

import numpy as np
import matplotlib.pyplot as plt
import skimage
from skimage import io

from pathlib import Path
from tqdm.notebook import trange, tqdm
from joblib import Parallel, delayed
from skimage import exposure
import h5py
import pandas as pd
import scanpy as sc
import squidpy as sq
sc.settings.verbosity = 3

from matplotlib.pyplot import rc_context
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from functools import reduce
from matplotlib import cm, colors
import scanorama
import seaborn as sns 
import anndata as ad
from PIL import Image

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Make the parent of the working directory and its "src" sub-folder importable.
# `module_path` ends up holding the "src" entry, as it did before.
for module_path in map(str, (Path.cwd().parents[0], Path.cwd().parents[0] / "src")):
    if module_path not in sys.path:
        sys.path.append(module_path)
    

In [None]:
from config import *


# Read info

In [None]:
from skimage.transform import rotate
from functools import partial
import matplotlib.patches as mpatches
from skimage.segmentation import mark_boundaries
from skimage.filters import median
from skimage.morphology import disk
import skimage.io

def get_info(img_folder):
    """Collect per-marker image paths from a folder of multi-cycle stainings.

    Walks ``img_folder`` recursively and records every file whose name contains
    ``ome.tiff``. The ROI is the text after the last underscore of the
    containing directory path; the marker is the third underscore-separated
    field of the file name, truncated at its first dot.

    A file is skipped when its name has fewer than three underscore-separated
    fields or when its marker is ``contaminant``. A ``DNA`` marker is renamed
    ``DNA1`` if the file name contains ``191Ir`` and ``DNA2`` otherwise.

    Args:
        img_folder (str or path-like): root folder to scan.

    Returns:
        pandas.DataFrame: one row per kept image, with columns ``ROI``,
        ``Marker`` and ``Path``.
    """
    records = []

    # Loop through image folder
    for dirpath, _dirnames, filenames in os.walk(img_folder):
        roi = dirpath.split('_')[-1]

        for name in sorted(filenames):
            if 'ome.tiff' not in name:
                continue

            fields = name.split('_')
            if len(fields) < 3:
                # File name does not follow the expected pattern: skip it
                # explicitly instead of hiding the failure behind a bare except.
                continue

            marker = fields[2].split('.')[0]
            if marker == 'contaminant':
                continue
            if marker == 'DNA':
                # Two DNA channels exist; tell them apart by the name tag.
                marker += '1' if '191Ir' in name else '2'

            records.append({
                "ROI": roi,
                "Marker": marker,
                "Path": os.path.join(dirpath, name),
            })

    return pd.DataFrame(records, columns=["ROI", "Marker", "Path"])


In [None]:
# Folder scanned for images. NOTE(review): `data_dir` is presumably provided by
# the star-import from `config` — confirm.
data_ROI = data_dir / 'ROI_new'
# One row per image file, with columns ROI, Marker and Path.
df = get_info(data_ROI)


In [None]:
# Drop the rows of the two DNA channels (DNA1/DNA2); only the remaining markers
# are kept. Note this rebinds `df`, so re-running the cell is a no-op.
df = df[~df.Marker.isin(['DNA1', 'DNA2'])]
df

# Read images, process and save to HDF5

In [None]:
from sklearn.neighbors import NearestNeighbors
from skimage.util import img_as_ubyte 

def contrast_streching(img, lower=1, upper=99):
    """Stretch the contrast of an image between two intensity percentiles.

    Args:
        img (np.ndarray): input image.
        lower (float): lower percentile used as the input range minimum
            (default 1, the value that was previously hard-coded).
        upper (float): upper percentile used as the input range maximum
            (default 99, the value that was previously hard-coded).

    Returns:
        The result of ``exposure.rescale_intensity`` with ``in_range`` set to
        the two percentile values of ``img``.
    """
    p_low, p_high = np.percentile(img, (lower, upper))
    return exposure.rescale_intensity(img, in_range=(p_low, p_high))

def read_img(path:str) -> np.ndarray:
    """Load one image, stretch its contrast and convert it with ``img_as_ubyte``.

    The file is read with ``io.imread(path, as_gray=True)``, passed through
    ``contrast_streching`` and finally through ``img_as_ubyte``.

    Args:
        path: location of the image file.

    Returns:
        The processed image array.
    """
    grayscale = io.imread(path, as_gray=True)
    return img_as_ubyte(contrast_streching(grayscale))

def joblib_loop(task, pics, n_jobs=20):
    """Apply ``task`` to every element of ``pics`` in parallel with joblib.

    Args:
        task (callable): function called with a single element of ``pics``.
        pics (iterable): items to process.
        n_jobs (int): number of joblib workers (default 20, the value that was
            previously hard-coded).

    Returns:
        list: whatever ``joblib.Parallel`` returns for the delayed calls, one
        result per element of ``pics``.
    """
    return Parallel(n_jobs=n_jobs)(delayed(task)(pic) for pic in pics)

def get_NN(data, n):
    """Query the ``n`` nearest neighbours of every row of ``data`` within ``data``.

    Args:
        data: array of samples used both to fit the model and as the query set.
        n (int): number of neighbours requested per sample.

    Returns:
        tuple: ``(distances, indices)`` as returned by
        ``NearestNeighbors.kneighbors``.
    """
    knn_model = NearestNeighbors(n_neighbors=n)
    knn_model.fit(data)
    return knn_model.kneighbors(data)

def filter_img_knn(img, n=25, th=3.5):
    """Zero out isolated positive pixels using a nearest-neighbour distance test.

    Every pixel with a value greater than 0 is treated as a point at its
    (row, column) coordinates. For each point the mean distance to its ``n``
    nearest positive pixels is computed; pixels whose mean distance exceeds
    ``th`` are set to 0 in a copy of the image.

    Args:
        img (np.ndarray): 2-D image; the input array is not modified.
        n (int): number of neighbours used for the mean distance. When the
            image has fewer than ``n`` positive pixels, all of them are used
            instead of raising inside the neighbour search.
        th (float): mean-distance threshold above which a pixel is removed.

    Returns:
        np.ndarray: filtered copy of ``img``. An image without any positive
        pixel is returned as an unchanged copy.
    """
    rows, cols = np.nonzero(img > 0)
    img_fil = img.copy()

    n_positive = rows.size
    if n_positive == 0:
        # Nothing to filter, and the neighbour search cannot be fitted on an
        # empty point set.
        return img_fil

    coords = np.column_stack((rows, cols))
    # The neighbour search rejects requests for more neighbours than samples.
    n_neighbors = min(n, n_positive)
    # The query set is the fitted set itself, so each pixel's own zero
    # distance is normally part of the average.
    distances, _ = get_NN(coords, n_neighbors)
    avg_dist = np.average(distances, axis=1)

    isolated = avg_dist > th
    img_fil[rows[isolated], cols[isolated]] = 0

    return img_fil

def save_hdf5(path:str, name:str, data: np.ndarray, attr_dict= None, mode:str='a') -> None:
    """Write ``data`` as a gzip-compressed dataset called ``name`` in an HDF5 file.

    If a dataset with that name already exists in the file, nothing is written
    and a message is printed instead.

    Args:
        path: location of the HDF5 file.
        name: name of the dataset inside the file.
        data: array to store; chunks span one entry along the first axis.
        attr_dict: optional mapping of attribute names to values, stored on the
            newly created dataset.
        mode: file mode passed to ``h5py.File`` (default ``'a'``).
    """
    # The context manager closes the file even if creating or writing the
    # dataset raises, so no handle is leaked on failure.
    with h5py.File(path, mode) as hf:
        if hf.get(name) is not None:
            print(f'Dataset {name} exists')
            return

        data_shape = data.shape
        chunk_shape = (1, ) + data_shape[1:]
        max_shape = (data_shape[0], ) + data_shape[1:]
        dset = hf.create_dataset(
            name,
            shape=data_shape,
            maxshape=max_shape,
            chunks=chunk_shape,
            dtype=data.dtype,
            compression="gzip",
        )
        dset[:] = data

        if attr_dict is not None:
            for attr_key, attr_val in attr_dict.items():
                dset.attrs[attr_key] = attr_val

In [None]:
# Process each ROI separately and store its image stack in a single HDF5 file.
group = df.groupby('ROI')
h5_data = data_ROI / f'TMA.hdf5'

for name, df_group in group:
    # Read and pre-process all images of this ROI in parallel
    paths = df_group.Path.tolist()
    imgs = joblib_loop(read_img, paths)
    # Remove isolated positive pixels from each image
    imgs = joblib_loop(filter_img_knn, imgs)
    # Stack along a new first axis (np.stack needs all images to share one shape)
    imgs=np.stack(imgs, axis=0)
    # Read marker names, in the same order as the stacked images
    markers = df_group.Marker.tolist()
    print(len(markers))
    # Save the stack under the ROI name, with the marker names as the 'labels' attribute
    save_hdf5(h5_data, name, imgs, {'labels': markers})
