In [1]:
import sys
sys.path.append("../src")

import os
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from utils import extract_coords, merge_patches, adjust_coords

In [2]:
# Name of the embedding model to prepare data for. It selects the input
# sub-folder here and the output folder used when the trials are written.
model = "UNI"

# Embeddings live under <repo parent>/raw-data/embeddings/experiment-0/<model>.
embeddings_root = os.path.join("..", "..", "raw-data", "embeddings", "experiment-0")
img_dir = os.path.join(embeddings_root, model)

# Sanity check: the directory has to exist before anything else can run.
os.path.isdir(img_dir)

True

In [3]:
# Number of entries found in the embeddings directory.
len(os.listdir(img_dir))

127

In [4]:
# Path to the labels CSV (columns `id` and `grade`). Despite its name this
# variable points at a file, not a directory.
label_dir = os.path.join("..", "data", "labels.csv")

# Read ids as strings: they are compared with embedding file stems (always
# strings) and some carry a letter suffix, so pandas must never infer a
# numeric dtype for this column.
df = pd.read_csv(label_dir, dtype={"id": str})

df.head()

Unnamed: 0,id,grade
0,16425,1
1,16421,1
2,16223,1
3,16089,1
4,16026,1


In [5]:
# Ids for which an embeddings file exists (file name without its extension).
img_ids = [Path(i).stem for i in os.listdir(img_dir)]

# Keep only the labelled ids that have embeddings. `.copy()` turns the filtered
# selection into an independent frame, so adding columns to it afterwards does
# not raise pandas' SettingWithCopyWarning.
df = df[df["id"].isin(img_ids)].copy()

df.shape

(121, 2)

In [6]:
# Attach the location of each id's embeddings file: <img_dir>/<id>.parquet.
df["embedding_path"] = [
    os.path.join(img_dir, f"{sample_id}.parquet") for sample_id in df["id"]
]

df.head()

Unnamed: 0,id,grade,embedding_path
0,16425,1,../../raw-data/embeddings/experiment-0/UNI/164...
1,16421,1,../../raw-data/embeddings/experiment-0/UNI/164...
2,16223,1,../../raw-data/embeddings/experiment-0/UNI/162...
3,16089,1,../../raw-data/embeddings/experiment-0/UNI/160...
4,16026,1,../../raw-data/embeddings/experiment-0/UNI/160...


In [7]:
# Every path built above must point at an existing file.
all(os.path.isfile(path) for path in df["embedding_path"])

True

In [8]:
# Exploratory stratified 70/30 split. `random_state` pins the shuffle so the
# rows shown in the following cells are the same after Restart & Run All.
train, test = train_test_split(
    df, test_size=0.3, shuffle=True, stratify=df["grade"], random_state=42
)

In [9]:
# Class balance of the training split.
train["grade"].value_counts()

grade
1     53
2+    31
Name: count, dtype: int64

In [10]:
# Class balance of the held-out split; stratification should keep the ratio close to the training one.
test["grade"].value_counts()

grade
1     24
2+    13
Name: count, dtype: int64

In [11]:
# Display the training split (rows appear in shuffled order).
train

Unnamed: 0,id,grade,embedding_path
48,13054B,2+,../../raw-data/embeddings/experiment-0/UNI/130...
105,11226,1,../../raw-data/embeddings/experiment-0/UNI/112...
65,12327,2+,../../raw-data/embeddings/experiment-0/UNI/123...
4,16026,1,../../raw-data/embeddings/experiment-0/UNI/160...
46,13055,2+,../../raw-data/embeddings/experiment-0/UNI/130...
...,...,...,...
107,11189,2+,../../raw-data/embeddings/experiment-0/UNI/111...
26,13982,1,../../raw-data/embeddings/experiment-0/UNI/139...
34,13554,1,../../raw-data/embeddings/experiment-0/UNI/135...
114,10902,2+,../../raw-data/embeddings/experiment-0/UNI/109...


In [12]:
# Dry run of the preprocessing on one file: the first path of the training split.
test_embedding_path = train["embedding_path"].iloc[0]

# NOTE: `train` is rebound here. From this cell on it holds the per-patch table
# of that single file, not the training split.
train = pd.read_parquet(test_embedding_path)
train["processed_coords"] = train["coords"].map(extract_coords)

# Order the patches by coordinate components 2 and 3 first, then 0 and 1.
train = train.sort_values(
    by="processed_coords",
    key=lambda column: column.map(lambda c: (c[2], c[3], c[0], c[1])),
)

# Reshape every embedding vector to (1, 1, 1024) so it occupies one grid cell,
# and rescale the patch coordinates to a patch size of 1.
# NOTE(review): exact semantics of adjust_coords live in utils; confirm there.
train["embedding"] = train["embedding"].map(lambda vec: vec.reshape(1, 1, 1024))
train["adjusted_coords"] = adjust_coords(train["processed_coords"].tolist(), 1)

train.head()

Unnamed: 0,coords,embedding,processed_coords,adjusted_coords
0,patch-0-224-0-224,"[[[0.16639982, -0.027044892, -0.446473, -1.191...","(0, 224, 0, 224)","(0, 1, 0, 1)"
10065,patch-224-448-0-224,"[[[0.7690655, 0.50395703, -0.3032657, -0.68857...","(224, 448, 0, 224)","(1, 2, 0, 1)"
25410,patch-448-672-0-224,"[[[0.7690655, 0.50395703, -0.3032657, -0.68857...","(448, 672, 0, 224)","(2, 3, 0, 1)"
27225,patch-672-896-0-224,"[[[0.7690655, 0.50395703, -0.3032657, -0.68857...","(672, 896, 0, 224)","(3, 4, 0, 1)"
29040,patch-896-1120-0-224,"[[[0.7690655, 0.50395703, -0.3032657, -0.68857...","(896, 1120, 0, 224)","(4, 5, 0, 1)"


In [13]:
# Assemble the per-patch embeddings into a single array using their adjusted coordinates.
patch_embeddings = train["embedding"].tolist()
patch_positions = train["adjusted_coords"].tolist()
merged = merge_patches(patch_embeddings, patch_positions)

merged.shape

(182, 165, 1024)

In [14]:
# Shape of one patch embedding after the (1, 1, 1024) reshape.
train.iloc[0]["embedding"].shape

(1, 1, 1024)

In [15]:
# Shape of the merged array's entry at index [0][0].
merged[0][0].shape

(1024,)

In [16]:
# The merged array's [0][0] entry must equal the first sorted patch's embedding (squeezed to 1-D).
np.array_equal(merged[0][0], train.iloc[0]["embedding"].squeeze())

True

In [17]:
def copy_embeddings(src_files, dest_dir, embedding_dim=1024):
    """Convert patch-embedding parquet files into merged ``.npy`` arrays.

    For every source file the patch table is read, the patches are sorted by
    their parsed coordinates, each embedding is reshaped to
    ``(1, 1, embedding_dim)``, and the patches are merged into one array that
    is saved as ``<dest_dir>/<source stem>.npy``.

    Args:
        src_files: Iterable of paths to parquet files that contain a
            ``coords`` and an ``embedding`` column.
        dest_dir: Output directory; created (including parents) if missing.
            Existing files in it are left in place.
        embedding_dim: Length of one embedding vector. Defaults to 1024, the
            value previously hard-coded here; pass another value for models
            with a different embedding width.

    Returns:
        None. The result is the set of ``.npy`` files written to ``dest_dir``.

    Raises:
        ValueError: From ``reshape`` if an embedding does not hold exactly
            ``embedding_dim`` values.
    """
    os.makedirs(dest_dir, exist_ok=True)
    for file in src_files:
        filename = Path(file).stem

        # Named `patches` rather than `df` so it cannot be confused with the
        # notebook-level labels frame.
        patches = pd.read_parquet(file)
        patches["processed_coords"] = patches["coords"].map(extract_coords)

        # Order by coordinate components 2 and 3 first, then 0 and 1.
        patches = patches.sort_values(
            by="processed_coords",
            key=lambda col: col.map(lambda x: (x[2], x[3], x[0], x[1])),
        )

        # Each vector becomes a 1x1 cell so merge_patches can place it on the grid.
        patches["embedding"] = patches["embedding"].map(
            lambda x: x.reshape(1, 1, embedding_dim)
        )
        patches["adjusted_coords"] = adjust_coords(
            coords=patches["processed_coords"].tolist(), new_size=1
        )

        merged = merge_patches(
            patches["embedding"].tolist(), patches["adjusted_coords"].tolist()
        )
        np.save(os.path.join(dest_dir, f"{filename}.npy"), merged)

In [18]:
# Write five stratified train/val/test splits (roughly 70/15/15) to
# ../data/<model>/trial-<i>/<split>/.
for i in tqdm(range(1, 6)):
    dest_dir = os.path.join("..", "data", model, f"trial-{i}")

    # Seed both splits with the trial number. copy_embeddings never clears its
    # target folder, so an unseeded re-run would draw a different split and
    # leave the previous run's files behind, putting the same sample in more
    # than one split of a trial. With a fixed seed a re-run rewrites exactly
    # the same files. Folders filled by earlier unseeded runs should be
    # deleted once by hand.
    train, test = train_test_split(
        df, test_size=0.3, shuffle=True, stratify=df["grade"], random_state=i
    )
    val, test = train_test_split(
        test, test_size=0.5, shuffle=True, stratify=test["grade"], random_state=i
    )

    for split_name, split_df in (("train", train), ("val", val), ("test", test)):
        copy_embeddings(
            split_df["embedding_path"].tolist(),
            os.path.join(dest_dir, split_name),
        )

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [08:49<00:00, 105.96s/it]
