# Loading Pre-extracted Vector Embeddings into Pandas DataFrames
This notebook demonstrates how to load pre-extracted vector embeddings into pandas DataFrames and work with them.<br>
Vector embeddings are numerical representations of images that capture semantic meaning in high-dimensional space.

In [None]:
# Core data manipulation and numerical computing
import os
import numpy as np
import pandas as pd

# Progress tracking for long-running operations
from tqdm import tqdm

# Type hints for better code documentation and IDE support
from typing import List, Literal, Tuple, Optional

# Concurrent processing for improved performance
from concurrent.futures import ThreadPoolExecutor, as_completed

## Core Functions for Loading Embeddings
The following functions handle the loading of pre-extracted vector embeddings from disk into pandas DataFrames.<br>
The implementation uses parallel processing to efficiently load multiple embedding files simultaneously.

In [None]:
def _load_single_embedding(sop: str, dataset_path: str, FM: str) -> Tuple[str, Optional[np.ndarray]]:
    """
    Load a single embedding file from disk.
    
    This helper function handles the loading of individual .npy files containing 
    pre-computed vector embeddings. It includes error handling for missing files
    and corrupted data.
    
    Args:
        sop (str): SOP (Study/Series/Image) identifier - unique ID for the embedding
        dataset_path (str): Base path to the dataset embeddings directory
        FM (str): Feature model name (e.g., 'RAD-DINO', 'MedImageInsights')
    
    Returns:
        Tuple[str, Optional[np.ndarray]]: 
            - First element: the SOP identifier (for tracking)
            - Second element: loaded embedding array or None if loading failed
    """
    # Construct the full path to the embedding file
    # Format: dataset_path/embds_{FM}/{sop}.npy
    embedding_file = os.path.join(dataset_path, f'embds_{FM}', sop + '.npy')
    
    # Check if the embedding file exists before attempting to load
    if not os.path.exists(embedding_file):
        print(f"Warning: Embedding file not found for SOP '{sop}': {embedding_file}")
        return sop, None
        
    try:
        # Load the numpy array from disk
        # Embeddings are typically stored as .npy files for efficient loading
        embd = np.load(embedding_file)
        return sop, embd
    except Exception as e:
        # Handle any errors during file loading (corruption, permissions, etc.)
        print(f"Error loading embedding for SOP '{sop}': {e}")
        return sop, None

def load_embeddings(
    SOP: List[str], 
    dataset: Literal['EmoryCXR','MIMIC','MRKR','EMBED'] = 'EmoryCXR', 
    FM: Literal['MedImageInsights','RAD-DINO','CheXagent','MedGemma','Mammo-CLIP','BiomedCLIP'] = 'RAD-DINO',
    max_workers: Optional[int] = None
) -> pd.DataFrame:
    """
    Load embeddings for multiple SOPs in parallel and return them as a DataFrame.

    Each identifier is read from ``<dataset root>/embds_<FM>/<sop>.npy`` on a small
    thread pool via ``_load_single_embedding``. Identifiers whose file is missing
    or unreadable are skipped. Duplicate identifiers are loaded only once.

    Args:
        SOP (List[str]): List of SOP identifiers to load embeddings for
        dataset (Literal): Dataset name - selects the embeddings root directory
            Options: 'EmoryCXR', 'MIMIC', 'MRKR', 'EMBED'
        FM (Literal): Feature model name - selects the ``embds_<FM>`` sub-folder
            Options: 'MedImageInsights', 'RAD-DINO', 'CheXagent', 'MedGemma',
            'Mammo-CLIP', 'BiomedCLIP'
        max_workers (Optional[int]): Maximum number of parallel worker threads.
            Capped at 4; defaults to 4 when None.

    Returns:
        pd.DataFrame: One row per successfully loaded identifier, in the order
            the identifiers first appear in ``SOP``:
            - Column 'SOP': the identifier (a regular column, not the index)
            - Columns 0..D-1: the embedding values, D being the embedding length

    Raises:
        TypeError: If SOP is not a list
        ValueError: If SOP is empty, dataset/FM is not supported, max_workers is
            smaller than 1, or the loaded embeddings do not share one shape
        RuntimeError: If no embeddings could be loaded successfully
    """
    # Input validation - ensure SOP is a non-empty list
    if not isinstance(SOP, list):
        raise TypeError("SOP must be a list of identifiers")
    if not SOP:
        raise ValueError("SOP list cannot be empty")

    # Dataset path mapping - defines where embeddings are stored for each dataset
    path_dict = {
        'EmoryCXR': '/mnt/NAS3/Embeddings/EmoryCXR/',
        'MIMIC': '/mnt/NAS3/Embeddings/MIMIC/',
        'MRKR': '/mnt/NAS3/Embeddings/MRKR/',
        'EMBED': '/mnt/NAS3/Embeddings/EMBED/'
    }
    if dataset not in path_dict:
        raise ValueError(f"Dataset '{dataset}' not supported. Available: {list(path_dict.keys())}")

    # Supported feature models
    FMs = ['MedImageInsights','RAD-DINO','CheXagent','MedGemma','Mammo-CLIP','BiomedCLIP']
    if FM not in FMs:
        raise ValueError(f"FM '{FM}' not supported. Available: {FMs}")

    # Cap the pool at 4 workers; reject non-positive values before any work starts
    effective_max_workers = min(max_workers, 4) if max_workers is not None else 4
    if effective_max_workers < 1:
        raise ValueError("max_workers must be a positive integer")

    dataset_path = path_dict[dataset]

    # De-duplicate while keeping first-seen order: avoids reading a file twice and
    # gives the output a well-defined row order.
    unique_sops = list(dict.fromkeys(SOP))

    # Successfully loaded embeddings, keyed by identifier
    embds_dict = {}

    with ThreadPoolExecutor(max_workers=effective_max_workers) as executor:
        futures = [
            executor.submit(_load_single_embedding, sop, dataset_path, FM)
            for sop in unique_sops
        ]

        # as_completed yields in completion order, so results are keyed by
        # identifier here and put back into input order afterwards.
        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc=f"Loading {FM} embeddings"):
            sop, embedding = future.result()
            if embedding is not None:
                embds_dict[sop] = embedding

    # Ensure at least some embeddings were loaded successfully
    if not embds_dict:
        raise RuntimeError("No embeddings were successfully loaded")

    # Restore the caller's order, keeping only identifiers that loaded
    loaded_sops = [sop for sop in unique_sops if sop in embds_dict]

    # Stack into an (n_loaded, D) matrix directly instead of building a
    # SOP-per-column frame and transposing it. np.stack raises ValueError when
    # the arrays do not all have the same shape.
    matrix = np.stack([embds_dict[sop] for sop in loaded_sops])
    df = pd.DataFrame(matrix, index=pd.Index(loaded_sops, name='SOP')).reset_index()

    print(f"Successfully loaded {len(embds_dict)} embeddings out of {len(unique_sops)} requested")
    print(f"Embedding shape: {df.shape[1]-1}")

    return df

## Metadata loading
**Available Embeddings:**<br>
<img src="./figs/embeddings.png" alt="Available embeddings per dataset and feature model" width="750"><br>

In [None]:
# =============================================================================
# DATASET AND MODEL CONFIGURATION
# =============================================================================
# To switch datasets or feature models, change DATASET / FM below.
# META_PATH, EMBDS_LIST_PATH and IMAGE_ID_COLUMN are looked up from DATASET, so
# they cannot drift out of sync with it.

# Primary dataset selection
DATASET = 'EmoryCXR'  # Options: 'EmoryCXR', 'MIMIC', 'MRKR', 'EMBED'

# Feature model selection
FM = 'MedGemma'          # Embedding model to use for analysis
                         # Available models:
                         # - 'MedImageInsights'
                         # - 'RAD-DINO'
                         # - 'CheXagent'
                         # - 'MedGemma'
                         # - 'Mammo-CLIP'
                         # - 'BiomedCLIP'

# =============================================================================
# PER-DATASET LOOKUP TABLES
# =============================================================================

# Main metadata file (clinical and imaging information) for each dataset
META_PATHS = {
    'EmoryCXR': '/mnt/NAS3/CXR/EmoryCXRv2/TABLES/EmoryCXR_v2_Metadata_08112025.csv',
    'MIMIC': '/mnt/NAS3/CXR/MIMIC_CXR/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv',
    'MRKR': '/mnt/NAS3/MRKR_dataset/MRKR_processed_images_08112025.csv',
    # NOTE(review): confirm the EMBED metadata file name/format (CSV vs. parquet).
    'EMBED': '/mnt/NAS3/EMBED_dataset/EMBED_metadata.csv',
}

# Column holding the image identifier in each dataset's metadata
IMAGE_ID_COLUMNS = {
    'EmoryCXR': 'SOP',
    'MRKR': 'SOP',
    'MIMIC': 'dicom_id',
    # NOTE(review): earlier notes in this notebook disagreed on the EMBED column
    # ('SOPInstanceUID_anon' vs 'SOP') - confirm against the EMBED files.
    'EMBED': 'SOPInstanceUID_anon',
}

if DATASET not in META_PATHS:
    raise ValueError(f"Dataset '{DATASET}' not supported. Available: {list(META_PATHS)}")

# =============================================================================
# DERIVED SETTINGS (do not edit by hand)
# =============================================================================

# Main metadata file for the selected dataset
META_PATH = META_PATHS[DATASET]

# Embedding availability list - tracks which images have pre-computed embeddings.
# This CSV contains boolean columns for each feature model indicating availability.
EMBDS_LIST_PATH = f'/mnt/NAS3/Embeddings/{DATASET}/embds_list.csv'

# Image identifier column for the selected dataset
IMAGE_ID_COLUMN = IMAGE_ID_COLUMNS[DATASET]

In [None]:
# Load the main metadata file.
# The reader is chosen from the file extension rather than the dataset name, so
# a CSV path is always read as CSV and a parquet path as parquet.
if META_PATH.lower().endswith(('.parquet', '.pq')):
    meta = pd.read_parquet(META_PATH)
else:
    meta = pd.read_csv(META_PATH)

# Load the list of available embeddings
embds_list = pd.read_csv(EMBDS_LIST_PATH)

# Keep only images present in both tables; this adds the per-model
# availability columns from embds_list to the metadata.
meta = meta.merge(embds_list, on=IMAGE_ID_COLUMN, how='inner')

# Display the merged metadata
meta

### Availability checks

In [None]:
# Filter metadata to only include images with embeddings for the selected feature model
# The metadata contains one availability column per feature model
print(f"Selected feature model: {FM}")
print(f"Total images in metadata: {len(meta)}")

# Build an explicit boolean mask: .eq(True) maps True (or 1) to True and
# everything else, including missing values, to False. Indexing with the raw
# column fails when it was not parsed as a pure boolean column.
meta_sub = meta[meta[FM].eq(True)].reset_index(drop=True)

print(f"Images with {FM} embeddings: {len(meta_sub)}")

meta_sub

## Sample selection

The cell below is only an example; select samples based on your own task.

In [None]:
# Select samples you want to analyze
# For large datasets, working with a subset can speed up development and testing

N_SAMPLES = 10000   # upper bound on the number of images to sample
SAMPLE_SEED = 42    # fixed seed so Restart & Run All selects the same images

# Random selection for the example. The size is capped at the number of
# available rows because sample() raises when n exceeds it (without replacement).
samples = meta_sub.sample(n=min(N_SAMPLES, len(meta_sub)), random_state=SAMPLE_SEED)

# Extract the list of SOP identifiers from our sample metadata
sop_list = list(samples[IMAGE_ID_COLUMN])

## Embedding loading

In [None]:
# Read the embeddings of the sampled images in parallel.
# Result: one row per SOP, one column per embedding dimension.
# DATASET and FM are the same settings used to filter the metadata above.
df = load_embeddings(sop_list, dataset=DATASET, FM=FM)

In [None]:
# Preview the frame returned by load_embeddings
df

## Example Application: Logistic Regression with Embeddings
This section demonstrates how to use the loaded vector embeddings for supervised learning tasks. We'll use logistic regression to classify medical findings based on the embedding features.

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read the finding-label table used as ground truth for supervised learning
print("Loading finding labels...")
label = pd.read_csv('/mnt/NAS3/CXR/EmoryCXRv2/TABLES/EmoryCXR_v2_FindingLabel_10162024.csv')

# Step 1: treat missing entries as 0.0 (assumes "not mentioned" means negative/absent)
label = label.fillna(value=0.0)
print(f"After filling NaN with 0.0: {label.shape}")

# Step 2: discard uncertain/unknown labels, which are coded as -1.
# They are first turned into NaN so that dropna() removes every row containing one.
label_before_cleaning = label.shape[0]
label = label.replace({-1: np.nan})
label = label.dropna()
print(f"After removing uncertain labels (-1): {label.shape}")
print(f"Removed {label_before_cleaning - len(label)} uncertain cases")

# Show the cleaned label data
label

### Linking Embeddings to Study-Level Identifiers
To connect our embeddings with the label data, we need to create a mapping between image-level identifiers (SOPs) and study-level identifiers (Accession Numbers). This is essential because labels are often assigned at the study level while embeddings exist at the image level.

In [None]:
# Create mapping between SOP (image-level) and AccessionNumber (study-level).
# `meta` names its image identifier IMAGE_ID_COLUMN, while the embedding frame
# calls it 'SOP', so the mapping column is renamed to 'SOP' before merging.
SOP_ACC_mapping = (
    meta[['AccessionNumber_anon', IMAGE_ID_COLUMN]]
    .rename(columns={IMAGE_ID_COLUMN: 'SOP'})
    .drop_duplicates()
)

# Drop an accession column left over from a previous run of this cell, so that
# re-running it does not create 'AccessionNumber_anon_x' / '_y' duplicates.
df = (
    df
    .drop(columns=['AccessionNumber_anon'], errors='ignore')
    .merge(SOP_ACC_mapping, on='SOP')
)
df

### Merging Embeddings with Labels
Now we'll combine our embedding features with the ground truth labels to create the final dataset for machine learning. This step links the vector representations with the clinical findings.

In [None]:
# Attach the study-level finding labels to every embedding row.
# Label columns left over from a previous run of this cell are dropped first, so
# re-running it does not create suffixed duplicates such as 'Cardiomegaly_x'.
# The identifier columns are never dropped.
label_value_cols = [
    col for col in label.columns
    if col not in ('AccessionNumber_anon', 'SOP')
]
df = (
    df
    .drop(columns=label_value_cols, errors='ignore')
    .merge(label, on='AccessionNumber_anon')
)
df

### Logistic Regression Classification Example
Now we'll demonstrate how to use the loaded embeddings for a supervised learning task. We'll build a logistic regression classifier to predict cardiomegaly (enlarged heart) from the embedding features.

In [None]:
# The embedding features are the integer-labelled columns (0..D-1) of df.
# Selecting them by label type instead of a hard-coded range(1024) keeps this
# cell correct for feature models with a different embedding length.
feature_cols = [
    col for col in df.columns
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool)
]
if not feature_cols:
    raise ValueError("No integer-labelled embedding columns found in df")

X = df[feature_cols]
y = df['Cardiomegaly']

# Proper train/test split with stratification
# NOTE(review): the split is per image; if one study/patient contributes several
# images they may land on both sides of the split - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Train on training data only
clf = LogisticRegression(
    random_state=0,
    max_iter=1000,
    solver='lbfgs',
    class_weight='balanced'  # Handle imbalanced classes
).fit(X_train, y_train)

# Evaluate on both sets for comparison
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)