This code is largely inspired by the cuttlefish code repository, which is available here: https://gitlab.mpcdf.mpg.de/mpibr/laur/cuttlefish/cuttlefish-code-public

In [None]:
import Mapping
import os
import string

import cv2
import geopandas as gpd
import igraph as ig
#import leidenalg
import matplotlib.pyplot as plt
#import networkx as nx
import numpy as np
import pandas as pd
#import rasterio
import seaborn as sns
import tifffile
#import umap
from anndata import AnnData as ad
from matplotlib import patches as mpatches
#from matplotlib_scalebar.scalebar import ScaleBar
import scanpy as sc
from shapely.affinity import translate
from shapely.geometry import Polygon, MultiPolygon, box, shape
from skimage import img_as_bool, img_as_ubyte
from skimage.measure import label, find_contours, regionprops, regionprops_table
from skimage.morphology import skeletonize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import geojson
import json

from tensorflow.keras.applications import vgg19
from keract import get_activations
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [None]:
def extract_sub_image_with_padding(image, bbox, padding=10):
    """
    Crop a bounding box out of an image, enlarged by a margin on every side.

    The enlarged box is clamped to the image extent, so a region touching the
    image border simply gets less margin on that side.

    Parameters:
    image : numpy.ndarray
        Image to crop; the first two axes are indexed as rows and columns.
    bbox : tuple
        (min_row, min_col, max_row, max_col); the max values are used as
        exclusive slice ends.
    padding : int
        Number of pixels added around the bounding box before clamping.

    Returns:
    tuple
        (crop, (row_offset, col_offset)), where the offsets are the position
        of the crop's top-left corner in `image`.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    top, left, bottom, right = bbox

    row_start = max(top - padding, 0)
    col_start = max(left - padding, 0)
    row_stop = min(bottom + padding, n_rows)
    col_stop = min(right + padding, n_cols)

    crop = image[row_start:row_stop, col_start:col_stop]
    return crop, (row_start, col_start)

def extract_sub_image_with_padding_and_threshold(image, bbox, padding=0):
    """
    Crop a bounding box (plus margin) from a zero-padded copy of the image
    and binarize the crop.

    Parameters:
    image : numpy.ndarray
        2-D image; it is zero-padded by `padding` pixels on every side before
        cropping, so the input array itself is not modified.
    bbox : tuple
        (min_row, min_col, max_row, max_col) in the coordinates of `image`.
    padding : int
        Margin, in pixels, added around the bounding box.

    Returns:
    numpy.ndarray
        The crop, with every value greater than 0 set to 255.
    """
    top, left, bottom, right = bbox

    # Zero-pad the whole image so the enlarged window never leaves the array.
    border = ((padding, padding), (padding, padding))
    canvas = np.pad(image, border, mode='constant')

    # In padded coordinates, the window [top - padding, bottom + padding) of
    # the original image becomes [top, bottom + 2 * padding).
    row_window = slice(top, bottom + 2 * padding)
    col_window = slice(left, right + 2 * padding)
    window = canvas[row_window, col_window]

    # Binarize in place on the padded copy.
    window[window > 0] = 255
    return window

def load_label_image(test_im, raw_im):
    """
    Cut one padded crop out of `raw_im` for every labeled region of `test_im`.

    Parameters:
    test_im : numpy.ndarray
        Label image passed to `regionprops`; each region found there
        contributes one entry to the result.
    raw_im : numpy.ndarray
        Image the crops are taken from, using each region's bounding box.

    Returns:
    dict
        Maps each region label to the value returned by
        `extract_sub_image_with_padding`, i.e. a (crop, (row_offset,
        col_offset)) tuple.
    """
    # If cropping from raw_im misbehaves, the alternative is to crop the label
    # image itself with extract_sub_image_with_padding_and_threshold(
    # test_im.astype(np.uint8), region.bbox, padding=10).
    return {
        region.label: extract_sub_image_with_padding(raw_im, region.bbox)
        for region in regionprops(test_im)
    }

def vectorize_imaging_data(data, batch_list, root='/hpc/projects/group.quake/doug/Shapes_Spatial/'):
    """
    Load the per-batch TIFF images and cut out one crop per labeled region.

    For every batch, `<root>/<batch>/labeled_image.tif` supplies the region
    labels and `<root>/<batch>/binary_image.tif` is the image the crops are
    taken from (the "binary" image is used as the raw image here).

    Parameters:
    data : object
        Not used by this function; kept so existing callers keep working.
    batch_list : iterable of str
        Batch identifiers; each one names a sub-directory of `root`.
    root : str
        Directory that contains one sub-directory per batch. Defaults to the
        original cluster location.

    Returns:
    list
        One dictionary per batch, in the order of `batch_list`, as returned
        by `load_label_image` (region label -> (crop, offset)).
    """
    total = []

    for batch in batch_list:
        print(f"Now doing batch: {batch}")

        batch_dir = os.path.join(root, batch)
        test_im = Mapping.load_tiff_image(os.path.join(batch_dir, 'labeled_image.tif'))
        raw_im = Mapping.load_tiff_image(os.path.join(batch_dir, 'binary_image.tif'))

        total.append(load_label_image(test_im, raw_im))

    print("Done Analyzing!")

    return total

def pad_images(image_dict, target_height=800, target_width=800):
    """
    Zero-pad every image in the dictionary to the same height and width,
    keeping the original content centered.

    Parameters:
    image_dict : dict
        Dictionary where keys are image identifiers and values are 2-D numpy
        arrays representing images.
    target_height : int
        Height of every padded image.
    target_width : int
        Width of every padded image.

    Returns:
    dict
        Dictionary with padded images, using the same keys as `image_dict`.

    Raises:
    ValueError
        If an image is larger than the target size in either dimension.
    """
    padded_images = {}

    for key, image in image_dict.items():
        height, width = image.shape
        pad_height = target_height - height
        pad_width = target_width - width

        # np.pad rejects negative widths with an unhelpful message; report
        # which image is too large instead.
        if pad_height < 0 or pad_width < 0:
            raise ValueError(
                f"Image {key!r} with shape {image.shape} is larger than the "
                f"target size ({target_height}, {target_width}); filter or "
                "resize it before padding."
            )

        # Split the padding so the image is centered; when the amount is odd
        # the extra pixel goes to the bottom / right.
        pad_top = pad_height // 2
        pad_bottom = pad_height - pad_top
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left

        # Padding: ((top, bottom), (left, right))
        padded_images[key] = np.pad(
            image,
            ((pad_top, pad_bottom), (pad_left, pad_right)),
            mode='constant',
            constant_values=0,
        )

    return padded_images

def filter_images(image_dict, max_dim=800):
    """
    Drop every image whose height or width exceeds `max_dim`.

    Parameters:
    image_dict : dict
        Dictionary where keys are image identifiers and values are numpy arrays representing images.
    max_dim : int
        Largest height and width an image may have to be kept.

    Returns:
    dict
        New dictionary holding only the images that fit, in their original order.
    """
    kept = {}
    for key, image in image_dict.items():
        # Skip the image as soon as one dimension is too large.
        if image.shape[0] > max_dim or image.shape[1] > max_dim:
            continue
        kept[key] = image
    return kept

def analyze_image_dimensions(image_dict):
    """
    Print summary statistics of the image sizes and plot their distributions.

    Two figures are shown: histograms of widths and heights, followed by box
    plots of the same values.

    Parameters:
    image_dict : dict
        Dictionary where keys are image identifiers and values are numpy arrays representing images.

    Returns:
    None
    """
    widths = [image.shape[1] for image in image_dict.values()]
    heights = [image.shape[0] for image in image_dict.values()]

    # Summary table (count, mean, quartiles, ...) of both dimensions
    print(pd.DataFrame({'Width': widths, 'Height': heights}).describe())

    # Histograms: (values, title, x-axis label, colour) for each panel
    hist_panels = (
        (widths, 'Distribution of Widths', 'Width', 'skyblue'),
        (heights, 'Distribution of Heights', 'Height', 'lightgreen'),
    )
    _, hist_axes = plt.subplots(1, 2, figsize=(15, 5))
    for ax, (values, title, xlabel, color) in zip(hist_axes, hist_panels):
        ax.hist(values, bins=50, color=color, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    # Box plots: (values, title, y-axis label, colour) for each panel
    box_panels = (
        (widths, 'Box Plot of Widths', 'Width', 'skyblue'),
        (heights, 'Box Plot of Heights', 'Height', 'lightgreen'),
    )
    _, box_axes = plt.subplots(1, 2, figsize=(15, 5))
    for ax, (values, title, ylabel, color) in zip(box_axes, box_panels):
        sns.boxplot(y=values, ax=ax, color=color)
        ax.set_title(title)
        ax.set_ylabel(ylabel)

    plt.tight_layout()
    plt.show()
    
    
def resize_images(image_dict, target_height=224, target_width=224):
    """
    Resize every image in the dictionary to the same height and width using
    nearest-neighbour interpolation.

    Parameters:
    image_dict : dict
        Dictionary where keys are image identifiers and values are numpy arrays representing images.
    target_height : int
        Number of rows of every resized image.
    target_width : int
        Number of columns of every resized image.

    Returns:
    dict
        Dictionary with resized images, using the same keys as `image_dict`.
    """
    # cv2.resize expects dsize as (width, height), i.e. (columns, rows).
    dsize = (target_width, target_height)

    return {
        key: cv2.resize(image, dsize=dsize, interpolation=cv2.INTER_NEAREST)
        for key, image in image_dict.items()
    }

In [None]:
# The following functions are directly from the cited repo:

# lifted from the cuttlefish paper

def apply_clahe(image, clip_limit=2.0, tile_size=(8, 8)):
    """
    Run OpenCV's CLAHE on `image` and return the result.

    Parameters:
    image : numpy.ndarray
        Image handed to the CLAHE object's `apply` method.
    clip_limit : float
        Passed to `cv2.createCLAHE` as `clipLimit`.
    tile_size : tuple
        Passed to `cv2.createCLAHE` as `tileGridSize`.

    Returns:
    numpy.ndarray
        The equalized image.
    """
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_size)
    equalized = equalizer.apply(image)
    return equalized

def multi_scale_clahe(image, clip_limit=2.0, tile_sizes=((8, 8), (16, 16), (32, 32)), weights=None):
    """
    Apply CLAHE at several tile-grid sizes and blend the results.

    3-D inputs are treated as BGR colour images: they are converted to LAB,
    only the L channel is enhanced, and the result is converted back to BGR.
    Other inputs are enhanced directly as a single channel.

    Parameters:
    image : numpy.ndarray
        Image to enhance. The blended result is clipped to [0, 255] and cast
        to uint8, so 8-bit input is presumably expected - TODO confirm.
    clip_limit : float
        CLAHE clip limit used at every scale.
    tile_sizes : sequence of tuple
        Tile-grid sizes, one CLAHE pass per entry.
    weights : sequence of float or None
        Blend weight for each entry of `tile_sizes`. None means equal
        weights. Pairs are formed with zip, so surplus entries on either
        side are ignored.

    Returns:
    numpy.ndarray
        Enhanced uint8 image: BGR for 3-D input, single channel otherwise.
    """
    tile_sizes = list(tile_sizes)
    is_color = len(image.shape) == 3

    if is_color:
        # Convert color image to LAB color space and enhance lightness only
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        lightness, a, b = cv2.split(lab)
    else:
        lightness = image  # For grayscale images

    if weights is None:
        weights = [1 / len(tile_sizes)] * len(tile_sizes)

    # Accumulate the weighted sum in float32 to avoid uint8 overflow.
    blended = np.zeros_like(lightness, dtype=np.float32)
    for tile_size, weight in zip(tile_sizes, weights):
        # A zero-weight scale contributes nothing, so skip its CLAHE pass.
        if weight == 0:
            continue
        blended += apply_clahe(lightness, clip_limit, tile_size) * weight

    enhanced_l = np.clip(blended, 0, 255).astype(np.uint8)

    if is_color:
        # Merge back to LAB and convert to BGR
        enhanced_lab = cv2.merge((enhanced_l, a, b))
        return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)
    return enhanced_l

def preprocess_image(img, img_width=224, img_height=224):
    """
    Turn an image into a single-image float32 batch preprocessed for VGG19.

    Parameters:
    img : array-like or image object
        Anything accepted by Keras' `img_to_array`.
    img_width : int
        Width the image is resized to (nearest-neighbour interpolation).
    img_height : int
        Height the image is resized to.

    Returns:
    numpy.ndarray
        Result of `vgg19.preprocess_input` on an array of shape
        (1, img_height, img_width, channels).
    """
    from tensorflow.keras.applications import vgg19
    from keract import get_activations
    from tensorflow.keras.preprocessing.image import load_img, img_to_array

    pixels = img_to_array(img)
    pixels = cv2.resize(pixels, dsize=(img_width, img_height), interpolation=cv2.INTER_NEAREST)

    # A 2-D (single-channel) result is replicated into three identical channels.
    if pixels.ndim == 2:
        pixels = np.repeat(pixels[:, :, np.newaxis], 3, axis=2)

    # Add the leading batch axis and make a float32 copy for preprocessing.
    batch = pixels[np.newaxis, ...].astype('float32')
    return vgg19.preprocess_input(batch)


def histN(img):
    """
    Contrast-normalize an image with CLAHE and return it as three identical
    channels.

    The image is cast to uint8 and passed to `multi_scale_clahe` with weights
    (0, 0, 1) over the tile sizes (16, 16), (32, 32), (64, 64), so only the
    (64, 64) result contributes to the output.

    Parameters:
    img : numpy.ndarray
        Image to normalize; the CLAHE result is reshaped to
        (rows, cols, 1), which requires a single-channel image.

    Returns:
    numpy.ndarray
        uint8 array of shape (rows, cols, 3).
    """
    gray = img.astype(np.uint8)
    equalized = multi_scale_clahe(
        gray,
        clip_limit=3.0,
        tile_sizes=[(16, 16), (32, 32), (64, 64)],
        weights=[0.0, 0.0, 1.0],
    )
    n_rows, n_cols = equalized.shape[0], equalized.shape[1]
    single_channel = equalized.reshape((n_rows, n_cols, 1))
    # Replicate the single channel three times along the last axis.
    return np.repeat(single_channel, 3, axis=2)
def to_tex(img, model, sz=224):
    """
    Compute a texture descriptor for an image from the activations of the
    model's 'block5_conv1' layer.

    Parameters:
    img : array-like or image object
        Image handed to `preprocess_image`.
    model : keras model
        Model passed to keract's `get_activations`; it must contain a layer
        named 'block5_conv1'.
    sz : int
        Width and height the image is resized to before the forward pass.

    Returns:
    numpy.ndarray
        1-D vector: the layer's activations reduced with a max over axes 1
        and 2 (presumably the spatial axes of a channels-last tensor -
        TODO confirm) and then flattened.
    """
    from tensorflow.keras.applications import vgg19
    from keract import get_activations
    from tensorflow.keras.preprocessing.image import load_img, img_to_array

    layer_name = 'block5_conv1'
    net_input = preprocess_image(img, sz, sz)
    layer_outputs = get_activations(model, net_input, layer_names=layer_name, auto_compile=True)
    feature_maps = layer_outputs[layer_name]

    pooled = feature_maps.max(axis=(1, 2))
    return pooled.ravel()



In [None]:
def generate_vectors(adata_filtered, model):
    """
    Compute one texture vector per cell for every batch in the AnnData object.

    For each batch the cell crops are loaded, restricted to the cell labels
    present in `adata_filtered.obs`, filtered to at most 800 x 800 pixels,
    zero-padded to 800 x 800, contrast-normalized with `histN`, shrunk to
    112 x 112, surrounded by a 56-pixel constant border (giving 224 x 224),
    and finally passed through `to_tex`.

    Parameters:
    adata_filtered : AnnData
        Object whose `obs` table has the columns 'batchID' and 'cell_label'.
    model : keras model
        Network handed to `to_tex`.

    Returns:
    pandas.DataFrame
        Columns 'batchID', 'key' (the cell label) and 'vector' (the texture
        vector serialized as a JSON list). The columns are present even when
        no cell produced a vector.
    """
    max_dim = 800       # crops larger than this are dropped, the rest padded to it
    resized_side = 112  # side length of the crop after shrinking
    padding_size = 56   # border on each side: 112 + 2 * 56 = 224
    # NOTE(review): these look like the ImageNet per-channel means in BGR
    # order. preprocess_image applies vgg19.preprocess_input, which per the
    # Keras documentation reverses the channel order before subtracting the
    # means; if so, this border does not map to exactly zero - confirm that
    # this is intended before changing it.
    padding_color_bgr = [103.939, 116.779, 123.68]

    obs = adata_filtered.obs
    data_list = []

    for batch_id in obs.batchID.unique():
        # vectorize_imaging_data returns one dict per requested batch
        cell_dict = vectorize_imaging_data(adata_filtered, [batch_id])[0]
        valid_cell_labels = obs.loc[obs.batchID == batch_id, 'cell_label'].tolist()

        # Each dict value is (crop, offset); only the crop is needed here.
        subset_dict = {label: cell_dict[label][0] for label in valid_cell_labels if label in cell_dict}
        subset_dict = filter_images(subset_dict, max_dim=max_dim)
        subset_dict = pad_images(subset_dict, target_height=max_dim, target_width=max_dim)

        # Generate vectors for each key in subset_dict
        for key, value in subset_dict.items():
            normalized = histN(value)
            img_res = cv2.resize(
                normalized,
                dsize=(resized_side, resized_side),
                interpolation=cv2.INTER_NEAREST,
            )
            padded_image = cv2.copyMakeBorder(
                img_res,
                padding_size, padding_size, padding_size, padding_size,  # Top, bottom, left, right padding
                cv2.BORDER_CONSTANT,
                value=padding_color_bgr
            )

            vec_rep = to_tex(padded_image, model, 224)
            # Store batchID, key, and the JSON-serialized vector
            data_list.append({'batchID': batch_id, 'key': key, 'vector': json.dumps(vec_rep.tolist())})

    # Naming the columns keeps the frame usable downstream when it is empty.
    return pd.DataFrame(data_list, columns=['batchID', 'key', 'vector'])

In [None]:
# Load the AnnData object whose obs table (batchID, cell_label) drives the
# texture extraction below.
adata_filtered = sc.read_h5ad('Microglia_Shapespace_baysor_500.h5ad')

# VGG19 with ImageNet weights and without its top layers (include_top=False);
# it is only used to read out intermediate activations in to_tex.
model = vgg19.VGG19(weights='imagenet',include_top=False)
# NOTE(review): no training happens in this notebook; the compile call is
# presumably only there to satisfy keract - confirm.
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [None]:
# Compute the texture vectors for all batches; the result is a DataFrame with
# the columns 'batchID', 'key' and 'vector' (JSON-serialized).
feature_space = generate_vectors(adata_filtered, model)

In [None]:
# Attach the serialized texture vectors to the per-cell metadata in obs.
df_vectors = feature_space.copy()

# generate_vectors stores the cell label under 'key'; obs calls it 'cell_label'.
df_vectors.rename(columns={'key': 'cell_label'}, inplace=True)

# Merge the DataFrame with adata_filtered.obs on 'batchID' and 'cell_label'.
# A left merge keeps every obs row in its original order, and
# validate='many_to_one' fails loudly if a (batchID, cell_label) pair has more
# than one vector. The obs index is restored directly, so this works whatever
# the index is named.
merged_df = adata_filtered.obs.merge(
    df_vectors,
    on=['batchID', 'cell_label'],
    how='left',
    validate='many_to_one',
)
merged_df.index = adata_filtered.obs.index

# Add the aligned vectors to adata_filtered.obs (NaN where a cell has no vector)
adata_filtered.obs['texture'] = merged_df['vector'].to_numpy()

adata_filtered.write_h5ad('Microglia_cuttlefish_500_block5_conv1.h5ad')

In [None]:
# Now we can go ahead and create a umap for this:

# umap.UMAP is used below, but the top-of-file `import umap` is commented
# out, so the module is imported here to avoid a NameError.
import umap

adata = sc.read_h5ad('Microglia_cuttlefish_500_block5_conv1.h5ad')
# Keep only the cells that received a texture vector
adata = adata[~adata.obs['texture'].isna()].copy()

# Step 1: Decode the JSON-serialized vectors into a (cells x features) matrix.
# Missing vectors were removed above, so every entry can be decoded.
vector_list = [json.loads(vec) for vec in adata.obs['texture']]
vector_matrix = np.array(vector_list)

# Step 2: Perform PCA
pca = PCA(n_components=10)
principal_components = pca.fit_transform(vector_matrix)

# Step 3: Calculate UMAP
n_neighbors = 10
umap_model = umap.UMAP(n_neighbors=n_neighbors)
umap_embedding = umap_model.fit_transform(principal_components)

# NOTE(review): this replaces any 'X_pca' already stored in the file with the
# PCA of the texture vectors - confirm that this is intended.
adata.obsm['X_pca'] = principal_components

# Step 4: Add UMAP embedding to AnnData object
adata.obsm['X_umap_shapes'] = umap_embedding

# Example of how to plot UMAP embedding using scanpy, colored by batchID
sc.pl.embedding(adata, basis='X_umap_shapes', color=['batchID', 'Age'], title='Custom UMAP of Microglia')

adata.write_h5ad('Microglia_cuttlefish_500_block5_conv1_umap.h5ad')