In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
!pip install openTSNE


Mounted at /content/drive
Collecting openTSNE
  Downloading openTSNE-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Downloading openTSNE-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openTSNE
Successfully installed openTSNE-1.0.2


In [None]:
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap, TSNE as SKTSNE

In [None]:
# -- Configuration -----------------------------------------------------------
# Maps a dataset identifier to the root folder that is searched recursively
# for images. The identifier is also used to name the saved feature file and
# as the legend label in the plots.
# Entries under 'Real_Mammography_Data' are presumably the real datasets and
# entries under 'SMD_datasets' the synthetic ones (inferred from the folder
# names only -- confirm).
DATASET_PATHS: Dict[str, str] = {
    'VinDr': '/content/drive/My Drive/Mammography_data_RS/Real_Mammography_Data/cropped_sampled_vindr-mammo_images/cropped_sampled_vindr-mammo_images_resized/center_cropped/equalized_images',
    'DDSM':  '/content/drive/My Drive/Mammography_data_RS/Real_Mammography_Data/DDSM_all_images_cropped/DDSM_images_resized/DDSM_all_clean_all/center_cropped/equalized_images',
    'InBreast': '/content/drive/My Drive/Mammography_data_RS/Real_Mammography_Data/INbreast_cropped_DICOM_images/INbreast_cropped_DICOM_images_resized/center_cropped/equalized_images',
    'MIAS': '/content/drive/My Drive/Mammography_data_RS/Real_Mammography_Data/all-mias_cropped_images/all-mias_cropped_images_resized/center_cropped_clean_all_mias/equalized_images',
    'MSYNTH': '/content/drive/My Drive/Mammography_data_RS/SMD_datasets/All_images_Elena/Elena_images_resized/equalized_images',
    # NOTE(review): the folder name below has a space before '/center_cropped'.
    # Confirm it matches the actual Drive folder; if it does not, no images
    # will be found for this dataset.
    'HuggingFace': '/content/drive/My Drive/Mammography_data_RS/SMD_datasets/HF_synthetic_mammography_csaw /center_cropped',
    'Mammo_medigan': '/content/drive/My Drive/Mammography_data_RS/SMD_datasets/Mammo_medigan/medigan_images_resized/center_cropped'
}

# Directory where the per-dataset feature archives (.npz) are written and
# later read back from. It is created when this cell runs (including missing
# parents); an already existing directory is not an error.
FEATURES_DIR = Path('/content/drive/My Drive/Scorecard_Results/VGG16_Features')
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

# Size every image is resized to when it is loaded for feature extraction
# (passed as `target_size` to the image loader).
# NOTE(review): this is not VGG16's default ImageNet input size (224x224);
# presumably 512x512 is a deliberate choice that works because the model is
# built without its classification head -- confirm.
IMG_SIZE: Tuple[int, int] = (512, 512)
# File suffixes treated as images; compared against the lower-cased suffix,
# so matching is case-insensitive.
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.bmp'}

In [None]:
# -- Model Initialization ---------------------------------------------------
# Build VGG16 with ImageNet weights and without its top classification
# layers, so the network is used purely as a feature extractor.
# A different pre-trained backbone could be swapped in here.
model = VGG16(include_top=False, weights='imagenet')

# -- Feature Extraction ------------------------------------------------------
def extract_vgg16_features(img_path: Path) -> np.ndarray:
    """
    Load an image, preprocess it, and extract features using VGG16.

    The image is resized to IMG_SIZE while loading, converted to an array,
    given a leading batch axis, run through the VGG16 preprocessing function
    and then through the module-level `model`.

    Args:
        img_path: Path to the image file.
    Returns:
        1D numpy array of extracted features (the model output, flattened).
    """
    img = image.load_img(str(img_path), target_size=IMG_SIZE)
    arr = image.img_to_array(img)
    # The model works on batches, so wrap the single image in a batch of one.
    arr = np.expand_dims(arr, axis=0)
    arr = preprocess_input(arr)
    # verbose=0: this function is called once per image, and the default
    # progress bar would print one line per call, flooding the cell output.
    feats = model.predict(arr, verbose=0)
    return feats.flatten()



In [None]:
def save_dataset_features(name: str, paths: List[Path]) -> None:
    """
    Extract and save features for all images in the given dataset paths.

    Every root is searched recursively. Files whose lower-cased suffix is in
    IMAGE_EXTS are passed to `extract_vgg16_features`, and the results are
    written to `FEATURES_DIR / "<name>_features.npz"` with two arrays:
    `features` (one row per image) and `filenames` (the matching base names).
    If no image is found, a warning is printed and nothing is written; an
    existing file for `name` is left untouched in that case.

    Args:
        name: Dataset identifier (used for file naming).
        paths: List of root directories to search for images.
    """
    features_list = []
    filenames = []

    for root in paths:
        # rglob() yields entries in arbitrary, filesystem-dependent order.
        # Sorting makes the row order of the saved arrays reproducible.
        for img_file in sorted(Path(root).rglob('*')):
            if img_file.suffix.lower() not in IMAGE_EXTS:
                continue
            # A directory can carry an image-like suffix; only real files
            # may be handed to the image loader.
            if not img_file.is_file():
                continue
            features_list.append(extract_vgg16_features(img_file))
            filenames.append(img_file.name)

    if not features_list:
        print(f"⚠️ No images found for dataset '{name}' in paths: {paths}")
        return

    out_file = FEATURES_DIR / f"{name}_features.npz"
    np.savez_compressed(out_file,
                        features=np.stack(features_list),
                        filenames=filenames)
    print(f"✅ Saved {len(features_list)} feature vectors to {out_file}")




In [None]:
def load_all_features(features_dir: Optional[Path] = None) -> Tuple[np.ndarray, List[str]]:
    """
    Load features and labels from all .npz files in a features directory.

    Each archive contributes its `features` array. The dataset label is the
    file stem with a trailing `_features` removed and is repeated once per
    row. Archives are read in sorted file-name order so the result is the
    same on every run.

    Args:
        features_dir: Directory to read the archives from. Defaults to
            FEATURES_DIR when omitted.
    Returns:
        features: 2D numpy array, shape (n_samples, n_features).
        labels: List of dataset names corresponding to each sample.
    Raises:
        RuntimeError: If the directory contains no .npz file.
    """
    source_dir = FEATURES_DIR if features_dir is None else Path(features_dir)
    suffix = '_features'

    all_feats = []
    all_labels = []

    # glob() order is arbitrary; sort for a deterministic sample order.
    for npz_file in sorted(source_dir.glob('*.npz')):
        stem = npz_file.stem
        # Strip only the trailing suffix so that a dataset name that itself
        # contains '_features' is preserved.
        dataset = stem[:-len(suffix)] if stem.endswith(suffix) else stem
        # Only the numeric `features` array is read, so pickle support is not
        # needed (and stays disabled); the context manager closes the archive.
        with np.load(npz_file) as data:
            feats = data['features']  # shape (n_samples, feature_dim)
        all_feats.append(feats)
        all_labels.extend([dataset] * feats.shape[0])

    if not all_feats:
        raise RuntimeError(f"No feature files found in {source_dir}")

    return np.vstack(all_feats), all_labels

In [None]:
# -- Visualization Utilities ------------------------------------------------
def plot_embedding(X: np.ndarray, labels: List[str], title: str, method: str='PCA', components: int=2) -> None:
    """
    Reduce dimensionality and plot embeddings with a legend.

    The first two embedding dimensions are drawn as a scatter plot, one
    colour per distinct label (labels are drawn in sorted order).

    Args:
        X: Feature matrix (n_samples, n_features).
        labels: List of dataset labels per sample.
        title: Title for the plot.
        method: Dimensionality reduction method: 'PCA', 'Isomap', or 'TSNE'.
        components: Number of dimensions to reduce to (only the first two
            are plotted, so it must be at least 2).
    Raises:
        ValueError: If `method` is unknown, `components` is below 2, or
            `labels` and `X` differ in length.
    """
    # Validate before fitting: the reducers can be expensive, and the
    # original only failed afterwards with an IndexError while plotting.
    if components < 2:
        raise ValueError(f"components must be >= 2 for a 2D plot, got {components}")
    if len(labels) != len(X):
        raise ValueError(
            f"labels has {len(labels)} entries but X has {len(X)} rows")

    # Select reducer
    if method == 'PCA':
        # Fixed seed: PCA may use a randomized solver for large inputs, which
        # would otherwise make the plot differ between runs.
        reducer = PCA(n_components=components, random_state=42)
    elif method == 'Isomap':
        reducer = Isomap(n_components=components)
    elif method == 'TSNE':
        reducer = SKTSNE(n_components=components, random_state=42)
    else:
        raise ValueError(f"Unknown method: {method}")

    X_emb = reducer.fit_transform(X)

    label_arr = np.asarray(labels)
    fig, ax = plt.subplots(figsize=(10, 8))

    # Plot each dataset separately so each gets its own colour and legend entry
    for ds in sorted(set(labels)):
        mask = label_arr == ds
        ax.scatter(X_emb[mask, 0], X_emb[mask, 1], alpha=0.7, label=ds)

    ax.set_title(f"{method} Visualization - {title}")
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.legend(loc='best')
    ax.grid(True)
    plt.show()

In [None]:
# -- Main Workflow -----------------------------------------------------------
def main():
    """Run the pipeline: extract features, load them back, plot embeddings."""
    # Stage 1: compute and store one feature archive per configured dataset
    for dataset_name, dataset_root in DATASET_PATHS.items():
        save_dataset_features(dataset_name, [Path(dataset_root)])

    # Stage 2: read every stored archive back into a single matrix + labels
    features, labels = load_all_features()

    # Stage 3: draw one 2D embedding per reduction technique
    reduction_methods = ('PCA', 'Isomap', 'TSNE')
    for reduction in reduction_methods:
        plot_embedding(features, labels, title='All Datasets', method=reduction)


if __name__ == '__main__':
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step