In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.preprocessing import StandardScaler

In [None]:
def compute_umap(data, 
                n_neighbors=100,
                min_dist=0.1,
                n_components=2,
                metric='euclidean',
                random_state=None,
                n_jobs=16):
    """
    Standardize the input features and compute a UMAP embedding
    
    Parameters:
    -----------
    data : array-like
        Input data matrix (one row per sample). It is passed through
        StandardScaler before being embedded.
    n_neighbors : int, default=100
        Number of neighbors to consider for manifold construction
    min_dist : float, default=0.1
        Minimum distance between points in the embedding
    n_components : int, default=2
        Number of dimensions in the embedding
    metric : str, default='euclidean'
        Distance metric to use
    random_state : int or None, default=None
        Random seed forwarded to UMAP. With the default (None) no seed is
        fixed, so repeated runs may give different layouts.
        NOTE(review): umap-learn is documented to fall back to a single
        thread when a seed is set - confirm for the installed version.
    n_jobs : int, default=16
        Number of parallel jobs forwarded to UMAP
    
    Returns:
    --------
    embedding : numpy array
        UMAP embedding of shape (n_samples, n_components)
    reducer : UMAP object
        Fitted UMAP object for potential reuse. It was fitted on the
        standardized data, not on the raw input.
    """
    # Scale the data
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    
    # Configure and fit UMAP
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=random_state,
        n_jobs=n_jobs
    )
    
    # Compute embedding
    embedding = reducer.fit_transform(data_scaled)
    
    return embedding, reducer

def _continuous_hue_mappable(values, palette, hue_norm):
    """
    Build a colorbar source for numeric hue values, or return None.

    None is returned when `palette` cannot be resolved to a matplotlib
    colormap, in which case the caller should rely on the legend alone.
    """
    from matplotlib.cm import ScalarMappable
    from matplotlib.colors import Colormap, Normalize

    if isinstance(palette, Colormap):
        cmap = palette
    elif isinstance(palette, str):
        # NOTE(review): qualitative palettes (e.g. 'deep', 'Set1') are assumed to be
        # drawn by seaborn as discrete levels, so a colorbar would not match - confirm.
        if palette in getattr(sns.palettes, 'QUAL_PALETTES', ()):
            return None
        try:
            cmap = sns.color_palette(palette, as_cmap=True)
        except (TypeError, ValueError):
            return None
        if not isinstance(cmap, Colormap):
            return None
    else:
        return None

    if isinstance(hue_norm, Normalize):
        norm = hue_norm
    elif hue_norm is not None:
        # A (vmin, vmax) pair, as accepted by seaborn's hue_norm
        norm = Normalize(*hue_norm)
    else:
        if values.notna().sum() == 0:
            return None
        norm = Normalize(vmin=values.min(), vmax=values.max())

    return ScalarMappable(norm=norm, cmap=cmap)


def plot_embedding(embedding,
                   labels=None,
                   figsize=(15, 10),
                   point_size=5,
                   alpha=0.7,
                   title=None,
                   palette='deep',
                   style='whitegrid',
                   return_fig=False,
                   show_legend=True,
                   xlabel='UMAP 0',
                   ylabel='UMAP 1',
                   hue_norm=None,
                   hue_order=None,
                   context='notebook'):
    """
    Scatter-plot the first two columns of an embedding, optionally colored by labels

    Parameters:
    -----------
    embedding : array-like of shape (n_samples, >=2)
        Coordinates to plot; column 0 is the x axis, column 1 the y axis
    labels : array-like of length n_samples, optional
        Per-point labels used as the hue. They are matched to points by
        position, so a pandas Series with any index is accepted.
    figsize : tuple, default=(15, 10)
        Figure size passed to plt.subplots
    point_size : float, default=5
        Marker size
    alpha : float, default=0.7
        Marker opacity
    title : str, optional
        Axes title; omitted when empty or None
    palette : str or colormap, default='deep'
        Palette forwarded to seaborn.scatterplot
    style : str, default='whitegrid'
        Passed to sns.set_style (changes the global seaborn style)
    return_fig : bool, default=False
        If True, return the figure instead of calling plt.show()
    show_legend : bool, default=True
        Whether to draw the hue legend (and a colorbar for numeric labels
        when the palette resolves to a continuous colormap)
    xlabel, ylabel : str
        Axis labels
    hue_norm, hue_order :
        Forwarded to seaborn.scatterplot
    context : str, default='notebook'
        Passed to sns.set_context (changes the global seaborn context)

    Returns:
    --------
    fig : matplotlib Figure or None
        The figure when return_fig is True, otherwise None

    Raises:
    -------
    ValueError
        If labels is given and its length differs from the number of points
    """
    # Set Seaborn style and context
    sns.set_style(style)
    sns.set_context(context)

    # Create DataFrame for Seaborn
    embedding = np.asarray(embedding)
    df = pd.DataFrame({
        'UMAP0': embedding[:, 0],
        'UMAP1': embedding[:, 1]
    })

    has_labels = labels is not None
    if has_labels:
        # Drop any incoming index: assigning a Series directly would align on index
        # labels and silently misplace (or NaN-fill) labels for a filtered frame.
        label_series = pd.Series(labels).reset_index(drop=True)
        if len(label_series) != len(df):
            raise ValueError(
                f"labels has length {len(label_series)} but embedding has {len(df)} rows"
            )
        df['Labels'] = label_series.values

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)

    # Create scatter plot on the explicit axes
    if has_labels:
        sns.scatterplot(
            data=df,
            x='UMAP0',
            y='UMAP1',
            hue='Labels',
            palette=palette,
            s=point_size,
            alpha=alpha,
            legend=show_legend,
            hue_norm=hue_norm,
            hue_order=hue_order,
            ax=ax,
        )

        # Adjust legend
        if show_legend:
            label_column = df['Labels']
            is_numeric = (
                pd.api.types.is_numeric_dtype(label_column)
                and not pd.api.types.is_bool_dtype(label_column)
            )
            if is_numeric:
                # A bare plt.colorbar() has no mappable to use here (the points are
                # drawn through ax.scatter), so build one explicitly.
                mappable = _continuous_hue_mappable(label_column, palette, hue_norm)
                if mappable is not None:
                    fig.colorbar(mappable, ax=ax)
            ax.legend(markerscale=10, loc='upper right')
    else:
        sns.scatterplot(
            data=df,
            x='UMAP0',
            y='UMAP1',
            s=point_size,
            alpha=alpha,
            ax=ax,
        )

    # Customize plot
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    # Adjust layout to prevent label cutoff
    fig.tight_layout()

    if return_fig:
        return fig
    plt.show()

### Load the RAD-DINO embeddings

In [None]:
# Read the embedding table; the columns named '0' ... '767' are used as features
embd_Emory = pd.read_csv('./input_example.csv')
print('number of sample:', len(embd_Emory))

feature_columns = list(map(str, range(768)))
X = embd_Emory[feature_columns]

### 2D visualization using UMAP

In [None]:
# Project the feature matrix X to 2D with UMAP.
# No random_state is passed, so the layout may differ from run to run.
embedding, reducer = compute_umap(
    X,
    n_neighbors=100,
    min_dist=0.7,
)

In [None]:
# Color the 2D map by one metadata column; values are cast to str before plotting
col = 'ViewPosition'
y = embd_Emory[col].astype(str)

plot_embedding(
    embedding,
    labels=y,
    title=col,
    style='white',
    palette='Set1',
    point_size=1,
    alpha=1,
)

In [None]:
## Save UMAP embedding for future use as UMAP is time consuming

# .copy() makes embd_umap an independent frame: adding columns to the bare
# embd_Emory[['ImagePath']] selection can trigger pandas' SettingWithCopyWarning.
embd_umap = embd_Emory[['ImagePath']].copy()
embd_umap[['UMAP0', 'UMAP1']] = embedding

embd_umap.to_csv('embd_UMAP.csv', index=False)

### Plotting images on the UMAP plane

In [None]:
## bin the UMAP embeddings

# pd.cut splits each axis into 25 equal-width intervals labelled 1..25
# (the '_Quantile' column names notwithstanding, these are not quantile bins).
bin_ids = list(range(1, 26))
for axis_name in ('UMAP0', 'UMAP1'):
    embd_umap[f'{axis_name}_Quantile'] = pd.cut(embd_umap[axis_name], 25, labels=bin_ids)

In [None]:
# Preview the first rows, including the two bin columns added above
embd_umap.head()

In [None]:
import itertools
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from transforms.Transform4RADDINO import Transform4RADDINO
import monai as mn

def get_data_dict_part(df_part):
    """
    Turn the rows of a DataFrame into a list of per-image dictionaries.

    Important! Might need to modify this function for a different table layout.

    Each row yields {'img': <path>, 'paths': <path>} where <path> is the
    module-level BASE prefix concatenated with the row's "ImagePath" value.
    """
    full_paths = [BASE + rel_path for rel_path in df_part["ImagePath"]]
    return [{'img': path, "paths": path} for path in full_paths]

def get_data_dict(df, num_cores=1):
    """
    Build the list of per-image dictionaries for every row of `df`.

    Parameters:
    -----------
    df : pandas DataFrame
        Table handed to get_data_dict_part (whole, or in chunks)
    num_cores : int, default=1
        Number of worker processes. With 1 the rows are processed in the
        current process and no pool is created.

    Returns:
    --------
    list of dict
        Concatenated output of get_data_dict_part, in row order

    Raises:
    -------
    ValueError
        If num_cores is smaller than 1
    """
    if num_cores < 1:
        raise ValueError("num_cores must be a positive integer")

    # Serial fast path: avoids spawning a process pool for every call (this function
    # is called once per grid cell) and works where notebook-defined functions or
    # globals are not available inside worker processes.
    if num_cores == 1:
        return get_data_dict_part(df)

    # Split row positions rather than the frame itself; np.array_split applied
    # directly to a DataFrame is deprecated in recent pandas releases.
    chunk_positions = np.array_split(np.arange(len(df)), num_cores)
    parts = [df.iloc[positions] for positions in chunk_positions]

    with ProcessPoolExecutor(num_cores) as executor:
        data_dicts = list(executor.map(get_data_dict_part, parts))

    return list(itertools.chain.from_iterable(data_dicts))

In [None]:
IMG_SIZE = 518
BASE = 'IMAGE_PATH_HERE/' # base path for the image

eval_transforms = Transform4RADDINO(IMG_SIZE).predict

fig, axs = plt.subplots(25, 25, figsize=(30, 30))

# Visit every (UMAP1 bin, UMAP0 bin) pair and show one randomly sampled image per non-empty bin
for bin_row, bin_col in itertools.product(range(25), range(25)):
    # The row index is flipped so that higher UMAP1 bins are drawn nearer the top
    panel = axs[24 - bin_row, bin_col]

    in_bin = (embd_umap.UMAP0_Quantile == bin_col + 1) & (embd_umap.UMAP1_Quantile == bin_row + 1)
    bin_members = embd_umap[in_bin]

    if len(bin_members) > 0:
        picked = bin_members.sample(n=1)
        picked_ds = mn.data.Dataset(data=get_data_dict(picked), transform=eval_transforms)

        for item in picked_ds[0:1]:
            # Shows index 0 along the first axis - presumably the channel axis; TODO confirm
            panel.imshow(np.array(item['img'])[0, :, :], cmap='gray')

    panel.axis("off")

plt.tight_layout()
plt.show()

### Filter out the outliers 

In [None]:
## Please modify the boundaries

# Keep the rows whose bin indices fall inside the (inclusive) rectangle below
umap0_bins = embd_umap.UMAP0_Quantile
umap1_bins = embd_umap.UMAP1_Quantile
in_region = (
    (umap0_bins >= 7) & (umap0_bins <= 10)
    & (umap1_bins >= 17) & (umap1_bins <= 24)
)
err = embd_umap[in_region]
err

#### Plot Outliers

In [None]:
from PIL import Image

def plot_images(image_paths, n_img):
    """
    Show up to n_img images side by side in a single row.

    Parameters:
    -----------
    image_paths : sequence of str
        Image paths relative to the module-level BASE prefix. If fewer than
        n_img paths are given, the remaining panels are left blank.
    n_img : int
        Number of panels (columns) in the figure
    """
    figsize = (20, 25 * n_img)
    # squeeze=False keeps `axes` a 2D array even for n_img == 1, where
    # plt.subplots would otherwise return a bare Axes with no `.flat`.
    fig, axes = plt.subplots(1, n_img, figsize=figsize, squeeze=False)

    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        if i < len(image_paths):
            # Close the file handle once the pixels are copied into an array
            with Image.open(BASE + image_paths[i]) as img:
                ax.imshow(np.array(img), cmap='gray')

    plt.tight_layout()
    plt.show()

In [None]:
n_img = 5
# The selected region may hold fewer than n_img rows; DataFrame.sample raises
# ValueError when asked for more rows than exist, so cap the request.
# plot_images leaves the unused panels blank.
n_sampled = min(n_img, len(err))
image_paths = err.sample(n=n_sampled).ImagePath.tolist()
plot_images(image_paths, n_img)