In [83]:
import glob
import os
import sys
from itertools import cycle
from pathlib import Path, PureWindowsPath

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tifffile
import skimage
import pandas as pd
from skimage.filters import threshold_li
from tqdm.notebook import tqdm, trange
from skimage import exposure, io
from joblib import Parallel, delayed
import napari
import anndata as ad
import scanorama
import scanpy as sc
from fbpca import pca
from geosketch import gs
from matplotlib.pyplot import rc_context
import h5py 

sc.settings.verbosity = 3

In [84]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
# NOTE(review): hard-coded absolute path on a mapped Windows drive. None of the
# cells visible here read `data_dir`; the other paths in this notebook are built
# from Path.cwd().parent instead — confirm whether this is still needed.
data_dir = r'Y:\coskun-lab\Zhou\4_HCR\2D_analyses_pipelines\subcellular_clustering\v2\bm'

In [86]:
# Read the AnnData object stored under <parent of cwd>/data/meta.
path = Path.cwd().parent.joinpath('data', 'meta', 'pixels_clusters.h5ad')
adata = ad.read_h5ad(filename=path)

In [87]:
# Load the saved row indices (presumably the sketch used for clustering —
# TODO confirm) and take the matching rows of `adata`.
sketch_index = np.load(
    Path.cwd().parent.joinpath('data', 'clustering', 'index.npy')
)

adata_subset = adata[sketch_index, :]

# Next step: propagate the cluster labels of this subset to the pixels of each FOV.

In [88]:
import pynndescent
import scanorama
from typing import Optional

def get_img_from_data(x: np.ndarray, y: np.ndarray, data: np.ndarray, pad: int = 0) -> np.ndarray:
    """
    Scatter per-pixel values into a dense image.

    Parameters
    ----------
    x
        Row coordinate of every pixel (integer array).
    y
        Column coordinate of every pixel (integer array).
    data
        Values to place; its last axis becomes the channel axis of the image.
    pad
        Number of empty pixels added on every side.

    Returns
    -------
    Zero-initialised array of shape ``(rows, cols, data.shape[-1])`` and dtype
    ``data.dtype`` with ``data`` written at the shifted coordinates.
    """
    # Shift coordinates so the smallest one lands at index `pad`.
    rows = x - x.min() + pad
    cols = y - y.min() + pad

    height = rows.max() + 1 + pad
    width = cols.max() + 1 + pad
    n_channels = data.shape[-1]

    canvas = np.zeros((height, width, n_channels), dtype=data.dtype)
    canvas[rows, cols] = data
    return canvas

def hex2rgb(h):
    """Convert a hex color string (with or without leading '#') to an [r, g, b] list of ints."""
    digits = h.lstrip("#")
    red, green, blue = digits[0:2], digits[2:4], digits[4:6]
    return [int(red, 16), int(green, 16), int(blue, 16)]

def annotate_img(
    img: np.ndarray,
    annotation: Optional[pd.DataFrame] = None,
    from_col: str = "clustering",
    to_col: Optional[str] = None,
    color: bool = False,
) -> np.ndarray:
    """
    Map the values of a cluster image to other values or to RGB colors.

    Parameters
    ----------
    img
        Image whose entries are values found in ``annotation[from_col]``.
    annotation
        Table with one row per class; must contain ``from_col`` and the target
        column. Required unless the call is a no-op (see Returns).
    from_col
        Annotation column holding the values currently present in ``img``.
    to_col
        Annotation column holding the replacement values. ``None`` means
        ``from_col``.
    color
        If True, read hex colors from column ``to_col + "_colors"`` and paint
        an RGB image.

    Returns
    -------
    ``img`` itself when ``color`` is False and source and target columns are
    the same; otherwise a new array. With ``color`` the result is ``uint8``
    with a trailing axis of length 3 and singleton axes squeezed out. Entries
    not matching any annotation row stay zero.
    """
    target_col = from_col if to_col is None else to_col

    # Nothing to translate: hand back the input untouched.
    if not color and from_col == target_col:
        return img

    if color:
        target_col = target_col + "_colors"
        out = np.zeros(img.shape + (3,), dtype=np.uint8)
    else:
        assert annotation is not None
        out = np.zeros_like(img, dtype=annotation[target_col].dtype)

    assert annotation is not None
    for _, record in annotation.iterrows():
        replacement = record[target_col]
        if color:
            replacement = hex2rgb(replacement)
        mask = img == record[from_col]
        out[mask] = replacement

    if color:
        return out.squeeze()
    return out

def get_max_frequency_label(labels, neighs, n):
    """
    Assign each query point the most frequent label among its neighbours.

    Parameters
    ----------
    labels
        Label of every reference point, in the same positional order as the
        rows that the neighbour indices refer to.
    neighs
        Integer array of shape ``(n_queries, n)``; each row holds the
        positional indices of one query's neighbours.
    n
        Number of neighbours per query.

    Returns
    -------
    Array of length ``n_queries`` with the majority label of each row. Ties go
    to the label that comes first in the ``pd.get_dummies`` column order.
    """
    one_hot = pd.get_dummies(labels)
    dummies = one_hot.values
    dummies_labels = np.array(one_hot.columns.tolist())

    # Use the `n` argument rather than a notebook-level global so the function
    # works on its own and honours the neighbour count the caller passes in.
    window = dummies[neighs]
    window = window.reshape(-1, n, dummies.shape[1])
    window = window.sum(axis=1) / n

    assert len(window) == len(neighs)
    return dummies_labels[np.argmax(window, axis=1)]

def missing_elements(X, L):
    """Return, in ascending order, the positions ``0 .. len(X) - 1`` that do not occur in ``L``."""
    present = set(L)
    return [position for position in range(len(X)) if position not in present]

In [89]:
# Number of nearest neighbours that vote on the label of each pixel
n_neighbor = 30

# Number of pixels sent to the neighbour query per chunk
batch_size = 100_000

# Cluster labels of the subset
labels = adata_subset.obs.cluster

# Split the already processed subset into one AnnData per FOV
adatas = [
    adata_subset[adata_subset.obs["FOV"] == batch]
    for batch in adata_subset.obs["FOV"].unique()
]

In [90]:
# One row per cluster category, paired with the color stored for it in `adata.uns`.
annotations = pd.DataFrame(
    {
        'cluster': adata.obs.cluster.cat.categories,
        'cluster_colors': adata.uns['cluster_colors'],
    }
)

In [104]:
# Columns selected from each FOV table to build the per-pixel feature matrix.
markers = [
    'gapdh', 'actb', 'il8', 'il6', 'ccl11',
    'col1a1', 'nanog', 'sox9', 'eef2', 'spp1',
    'runx1', 'pdl1', 'ConA', 'PhA', 'WGA',
]


In [None]:
# Project the clustering of the subset onto every pixel of every FOV file found
# under data/h5. For each file this writes a CSV with the per-pixel cluster and
# a PNG of the color-coded cluster image. Files whose PNG already exists are
# skipped.
project_root = Path.cwd().parent
pixels_csv_dir = project_root / 'data' / 'pixels'
pixels_img_dir = project_root / 'figures' / 'pixels'

# Make sure the output folders exist before any result is written.
pixels_csv_dir.mkdir(parents=True, exist_ok=True)
pixels_img_dir.mkdir(parents=True, exist_ok=True)

for (dirpath, dirnames, filenames) in os.walk(project_root / 'data' / 'h5'):
    for name in tqdm(sorted(filenames)):
        # File name without its last three characters (the ".h5" suffix)
        stem = name[:-3]

        # Skip files that were already rendered. This check runs before the
        # HDF5 file is opened so that skipping cannot leave a handle open.
        img_path = pixels_img_dir / f'{stem}.png'
        if os.path.exists(img_path):
            print(img_path)
            continue

        # Get info
        cell_type = '_'.join(name.split('_')[:-1])
        fov = name.split('_')[-1][:-3]

        # Read the pixel table. `[:]` copies it into memory, so the file can be
        # closed right away; the context manager also closes it on errors.
        path = os.path.join(dirpath, name)
        with h5py.File(path, 'r') as f:
            df = pd.DataFrame(f['df']['table'][:])

        # Convert the marker columns to AnnData and scale them
        adata_fov = sc.AnnData(df.loc[:, markers].values)
        adata_fov.var_names = adata_subset.var_names
        sc.pp.scale(adata_fov, max_value=4)

        # Fresh per-FOV pieces of the already processed subset, with the new
        # FOV appended as the last element for the integration.
        adatas = [
            adata_subset[adata_subset.obs["FOV"] == batch]
            for batch in adata_subset.obs["FOV"].unique()
        ]
        adatas.append(adata_fov)
        scanorama.integrate_scanpy(adatas, sketch=True, dimred=10)

        # Integrated embedding of the reference subset (everything but the new FOV)
        adata_subset_cor = ad.concat(adatas[:-1])

        # Labels must be in the same row order as the embedding used to build
        # the index. The concatenation groups rows by FOV, which need not be
        # the row order of `adata_subset`, so take the labels from the
        # concatenated object itself.
        reference_labels = adata_subset_cor.obs['cluster']

        # Get NNDescent index for fast projection
        X_clustered = adata_subset_cor.obsm['X_scanorama']
        index = pynndescent.NNDescent(X_clustered)

        # Project clusters chunk by chunk
        clustering = []
        samples = adatas[-1].obsm['X_scanorama']
        for i in np.arange(0, samples.shape[0], batch_size):
            print(f"processing chunk {i}")
            cur_samples = samples[i : i + batch_size]
            neighs = index.query(cur_samples.astype(np.float32), k=n_neighbor)[0]
            clustering.append(
                get_max_frequency_label(reference_labels, neighs, n_neighbor)
            )
        full_labels = np.concatenate(clustering)

        # `assign` returns a new frame, so no column is written into a slice of `df`.
        df_subset = df[['Cell Type', 'Id']].assign(Cluster=full_labels)
        df_subset.to_csv(pixels_csv_dir / f'{stem}.csv', index=False)

        # Create images
        x = np.array(df.X.values)
        y = np.array(df.Y.values)
        values = full_labels

        # get_img_from_data expects a trailing channel axis
        if len(values.shape) == 1:
            values = values[:, np.newaxis]
        img_cluster = get_img_from_data(x, y, values)
        img_cluster_annotated = annotate_img(img_cluster, annotations, from_col='cluster', color=True)
        skimage.io.imsave(img_path, img_cluster_annotated)