In [1]:
import sys
sys.path.append("../src")

import os
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from utils import extract_coords, merge_patches, adjust_coords

  from .autonotebook import tqdm as notebook_tqdm
  check_for_updates()


In [2]:
model = "UNI"

img_dir = os.path.join("..", "..", "raw-data", "embeddings", "experiment-0", model)

os.path.isdir(img_dir)

True

In [3]:
len(os.listdir(img_dir))

127

In [4]:
label_dir = os.path.join("..", "data", "labels.csv")
df = pd.read_csv(label_dir)

df.head()

Unnamed: 0,id,grade
0,16425,1
1,16421,1
2,16223,1
3,16089,1
4,16026,1


In [5]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df["grade"]),
    y=df["grade"]
)

In [6]:
class_weights

array([0.79746835, 1.34042553])

In [7]:
img_ids = [Path(i).stem for i in os.listdir(img_dir)]

df = df[df["id"].isin(img_ids)]

df.shape

(121, 2)

In [8]:
df["grade"].value_counts()

grade
1     77
2+    44
Name: count, dtype: int64

In [8]:
df["embedding_path"] = df["id"].map(lambda x: os.path.join(img_dir, f"{x}.parquet"))

df.head()

Unnamed: 0,id,grade,embedding_path
0,16425,1,../../raw-data/embeddings/experiment-0/UNI/164...
1,16421,1,../../raw-data/embeddings/experiment-0/UNI/164...
2,16223,1,../../raw-data/embeddings/experiment-0/UNI/162...
3,16089,1,../../raw-data/embeddings/experiment-0/UNI/160...
4,16026,1,../../raw-data/embeddings/experiment-0/UNI/160...


In [9]:
all(df["embedding_path"].map(lambda x: os.path.isfile(x)))

True

In [10]:
test_path = df[df["id"] == "12447"]["embedding_path"].item()

test_df = pd.read_parquet(test_path)

test_df.head()

Unnamed: 0,coords,embedding,processed_coords
0,patch-45472-45696-24192-24416,"[0.41218102, 0.21434812, -0.43298548, -1.53898...","[45472, 45696, 24192, 24416]"
1,patch-45696-45920-24192-24416,"[0.9725843, 0.49554092, 0.84255534, -0.7825324...","[45696, 45920, 24192, 24416]"
2,patch-47712-47936-24192-24416,"[0.19120352, -1.0196893, 0.5372217, -1.2949024...","[47712, 47936, 24192, 24416]"
3,patch-47936-48160-24192-24416,"[0.003308189, -0.33312634, 0.84845734, -1.0282...","[47936, 48160, 24192, 24416]"
4,patch-48160-48384-24192-24416,"[0.7834015, -1.0366802, -0.24850303, -1.468561...","[48160, 48384, 24192, 24416]"


In [11]:
test_path

'../../raw-data/embeddings/experiment-0/UNI/12447.parquet'

In [12]:
len(test_df)

2036

In [13]:
np.vstack(test_df["embedding"].tolist()).shape

(2036, 1024)

In [14]:
len(os.listdir("../../raw-data/patches/experiment-0/12447")) == len (test_df)

True

In [18]:
train, test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df["grade"])

val, test = train_test_split(test, test_size=0.5, shuffle=True, stratify=test["grade"])

In [19]:
train["grade"].value_counts()

grade
1     53
2+    31
Name: count, dtype: int64

In [20]:
test["grade"].value_counts()

grade
1     12
2+     7
Name: count, dtype: int64

In [21]:
val["grade"].value_counts()

grade
1     12
2+     6
Name: count, dtype: int64

In [16]:
train

Unnamed: 0,id,grade,embedding_path
83,11785C,1,../../raw-data/embeddings/experiment-0/UNI/117...
25,14077,1,../../raw-data/embeddings/experiment-0/UNI/140...
4,16026,1,../../raw-data/embeddings/experiment-0/UNI/160...
41,13193,2+,../../raw-data/embeddings/experiment-0/UNI/131...
40,13267,1,../../raw-data/embeddings/experiment-0/UNI/132...
...,...,...,...
80,11845,1,../../raw-data/embeddings/experiment-0/UNI/118...
88,11727,2+,../../raw-data/embeddings/experiment-0/UNI/117...
32,13663,1,../../raw-data/embeddings/experiment-0/UNI/136...
23,14120,1,../../raw-data/embeddings/experiment-0/UNI/141...


In [17]:
test_embedding_path = train["embedding_path"].tolist()[0]

train = pd.read_parquet(test_embedding_path)
train["processed_coords"] = train["coords"].map(lambda x: extract_coords(x))
train = train.sort_values(by="processed_coords", key=lambda col: col.map(lambda x: (x[2], x[3], x[0], x[1])))
train["embedding"] = train["embedding"].map(lambda x: x.reshape(1, 1, 1024))
train["adjusted_coords"] = adjust_coords(train["processed_coords"].tolist(), 224, 1)

train.head()

Unnamed: 0,coords,embedding,processed_coords,adjusted_coords
0,patch-17472-17696-21280-21504,"[[[-0.107238434, -0.5433728, 0.48717967, -3.16...","(17472, 17696, 21280, 21504)","[78, 79, 95, 96]"
1,patch-17696-17920-21280-21504,"[[[-0.494741, -0.52670056, 0.7192134, -2.85197...","(17696, 17920, 21280, 21504)","[79, 80, 95, 96]"
2,patch-18368-18592-21280-21504,"[[[0.02677094, -0.4770287, 0.7068547, -3.08085...","(18368, 18592, 21280, 21504)","[82, 83, 95, 96]"
3,patch-18592-18816-21280-21504,"[[[0.040181085, -0.3175492, 0.62684464, -3.061...","(18592, 18816, 21280, 21504)","[83, 84, 95, 96]"
4,patch-18816-19040-21280-21504,"[[[-0.24719547, -0.5820347, 0.64646184, -2.925...","(18816, 19040, 21280, 21504)","[84, 85, 95, 96]"


In [18]:
merged = merge_patches(train["embedding"].tolist(), train["adjusted_coords"].tolist(), target_patch_size=1)

merged.shape

(384, 384, 1024)

In [19]:
train.iloc[0]["embedding"].shape

(1, 1, 1024)

In [20]:
merged[0][0].shape

(1024,)

In [21]:
np.unique(merged[0][0])

array([0.], dtype=float32)

In [22]:
np.array_equal(merged[226][95], train.iloc[0]["embedding"].squeeze())

False

In [23]:
def save_stitched_embeddings(src_files, dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    for file in src_files:
        filename = Path(file).stem

        df = pd.read_parquet(file)
        df["processed_coords"] = df["coords"].map(lambda x: extract_coords(x))
        df = df.sort_values(by="processed_coords", key=lambda col: col.map(lambda x: (x[2], x[3], x[0], x[1])))

        df["embedding"] = df["embedding"].map(lambda x: x.reshape(1, 1, 1024))
        df["adjusted_coords"] = adjust_coords(
            coords=df["processed_coords"].tolist(), src_patch_size=224, target_patch_size=1
            )

        merged = merge_patches(df["embedding"].tolist(), df["adjusted_coords"].tolist(), target_patch_size=1)
        np.save(os.path.join(dest_dir, f"{filename}.npy"), merged)


def save_isolated_embeddings(src_files, dest_dir):
    os.makedirs(dest_dir, exist_ok=True)
    for file in src_files:
        filename = Path(file).stem

        df = pd.read_parquet(file)
        stacked = np.vstack(df["embedding"].tolist())
        np.save(os.path.join(dest_dir, f"{filename}.npy"), stacked)

In [24]:
for i in tqdm(range(1, 6)):
    dest_dir = os.path.join("..", "data", model)
    stitched_dest_dir = os.path.join(dest_dir, "stitched", f"split-{i}")
    isolated_dest_dir = os.path.join(dest_dir, "isolated", f"split-{i}")

    train, test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df["grade"], random_state=i)
    val, test = train_test_split(test, test_size=0.5, shuffle=True, stratify=test["grade"], random_state=i)

    train_embedding_paths = train["embedding_path"].tolist()
    val_embedding_paths = val["embedding_path"].tolist()
    test_embedding_paths = test["embedding_path"].tolist()

    save_stitched_embeddings(train_embedding_paths, os.path.join(stitched_dest_dir, "train"))
    save_stitched_embeddings(val_embedding_paths, os.path.join(stitched_dest_dir, "val"))
    save_stitched_embeddings(test_embedding_paths, os.path.join(stitched_dest_dir, "test"))

    save_isolated_embeddings(train_embedding_paths, os.path.join(isolated_dest_dir, "train"))
    save_isolated_embeddings(val_embedding_paths, os.path.join(isolated_dest_dir, "val"))
    save_isolated_embeddings(test_embedding_paths, os.path.join(isolated_dest_dir, "test"))

100%|██████████| 5/5 [12:38<00:00, 151.66s/it]
