In [17]:
import os
import sys
import json
from pathlib import Path
sys.path.insert(0, os.path.join(str(Path().resolve().parents[0]), "src"))

import deeplake
import numpy as np
import pandas as pd
from umap import UMAP
from tqdm import tqdm
import plotly.express as px

from utils import set_seed
from utils.constants import EMBEDDING_DIR, RESULTS_DIR

In [None]:
# Experiment configuration: encoder / augmentation pair analysed in this notebook.
encoder = "uni"
augmentation = "brightness"

# Seed the random number generators through the project helper before any
# sampling happens further down.
seed = 42
set_seed(seed)

# Folder that receives the separability results for this configuration.
_dest_parts = ("gleason_grading", encoder, "perturbation", augmentation, "separability")
dest_dir = os.path.join(RESULTS_DIR, *_dest_parts)
os.makedirs(dest_dir, exist_ok=True)

In [3]:
# Root folder holding the embeddings computed on perturbed inputs for this encoder.
perturbation_dir = os.path.join(EMBEDDING_DIR, "gleason-grading", encoder, "perturbations")
brightness_dir = os.path.join(perturbation_dir, augmentation, "performance")


def _perturbation_value(dataset_name):
    """Return the number after the last underscore, e.g. 'brightness_-0.8' -> -0.8."""
    return float(dataset_name.split("_")[-1])


# Sort numerically on the suffix; a plain string sort would misorder the
# negative and positive values.
perturbed_datasets = os.listdir(brightness_dir)
perturbed_datasets = sorted(perturbed_datasets, key=_perturbation_value)

perturbed_datasets[:5]

['brightness_-1.0',
 'brightness_-0.8',
 'brightness_-0.6',
 'brightness_-0.4',
 'brightness_-0.2']

In [4]:
# Pick the first entry of the sorted list (the lowest perturbation value) and
# open it read-only so the analysis cannot modify the stored embeddings.
first_dataset_name = perturbed_datasets[0]
test_dir = os.path.join(brightness_dir, first_dataset_name)
ds = deeplake.open_read_only(test_dir)

In [5]:
# Sanity check: print the dataset length and its column schema.
ds.summary()

Dataset length: 373726
Columns:
  embedding: embedding(1024, clustered)
  label    : int32
  file_key : int32




In [6]:
# Read the whole label column, then reduce it to the sorted set of distinct
# class ids as plain Python ints.
all_labels = ds["label"][:]
distinct_labels = np.unique(all_labels)
unique_labels = distinct_labels.tolist()

unique_labels

[0, 1, 2, 3, 4]

In [None]:
embeddings = []
labels = []
file_key = []

sample_percentage = 0.1
for label in tqdm(unique_labels, desc="Sampling dataset"):
    filtered = ds.query(f"SELECT * WHERE label == {label}")
    indices = np.random.randint(
        low=0, 
        high=len(filtered)-1, 
        size=int(len(filtered)*sample_percentage)
    )

    sampled = filtered[*indices]
    embeddings.extend(sampled["embedding"])
    labels.extend(sampled["label"])
    file_key.extend(sampled["file_key"])

Sampling dataset: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


In [8]:
# Assemble the sampled rows into one table: one row per sampled embedding.
df = pd.DataFrame(dict(embedding=embeddings, label=labels, file_key=file_key))

# Store labels as strings so the plot treats them as discrete categories
# instead of a continuous colour scale.
df["label"] = df["label"].map(str)

df.head()

Unnamed: 0,embedding,label,file_key
0,"[-1.0332096, -0.29899368, 1.1959273, 1.5469971...",0,264880
1,"[-1.0332096, -0.29899368, 1.1959273, 1.5469971...",0,249945
2,"[-1.0332096, -0.29899368, 1.1959273, 1.5469971...",0,287243
3,"[-1.0332096, -0.29899368, 1.1959273, 1.5469971...",0,303428
4,"[-1.0332096, -0.29899368, 1.1959273, 1.5469971...",0,293817


In [None]:
# 2-D UMAP projection of the sampled embeddings, coloured by class label.
# dest_dir is already built and created in the configuration cell near the top
# of the notebook, so it is not recomputed here.

# Stack the per-row embedding vectors into one (n_samples, n_features) matrix.
embedding_matrix = np.stack(df["embedding"].tolist(), axis=0)

# Pass the notebook seed explicitly: without random_state the layout changes
# from run to run, which makes the separability plots impossible to compare.
# NOTE: umap-learn runs single-threaded when random_state is set.
umap_2d = UMAP(n_components=2, random_state=seed)
proj_2d = umap_2d.fit_transform(X=embedding_matrix)

# Named columns give the figure readable axis titles and legend.
proj_df = pd.DataFrame(proj_2d, columns=["UMAP-1", "UMAP-2"])
proj_df["label"] = df["label"].to_numpy()

fig_2d = px.scatter(
    proj_df, x="UMAP-1", y="UMAP-2",
    color="label",
    title=f"UMAP projection of {encoder} embeddings ({perturbed_datasets[0]})",
    height=800, width=800
)

fig_2d.show()