In [1]:
import os
import json

import torch
import deeplake
import numpy as np
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader

In [2]:
def _read_inverted_json(path):
    """Load the JSON object stored at ``path`` and return it with keys and values swapped."""
    with open(path, "r") as handle:
        forward = json.load(handle)
    return {value: key for key, value in forward.items()}


map_path = os.path.join("..", "data", "gleason-grading")

label_map_path = os.path.join(map_path, "label-map.json")
train_map_path = os.path.join(map_path, "train-file-map.json")
# The validation file map is read from the "test" file on disk.
val_map_path = os.path.join(map_path, "test-file-map.json")

# Each map is inverted right after loading, so lookups go from the stored JSON
# value back to the original JSON key.
label_map = _read_inverted_json(label_map_path)
train_file_map = _read_inverted_json(train_map_path)
val_file_map = _read_inverted_json(val_map_path)

In [3]:
# Root directory of the embedding datasets; the splits live in "train" and "test".
data_dir = os.path.join("..", "embeddings", "gleason-grading", "uni")
train_dir, val_dir = (os.path.join(data_dir, split) for split in ("train", "test"))

# Open both splits read-only; the "test" directory serves as the validation split here.
train_ds = deeplake.open_read_only(train_dir)
val_ds = deeplake.open_read_only(val_dir)


In [4]:
# Report how many rows each split holds.
train_size = len(train_ds)
val_size = len(val_ds)
print("len train:", train_size)
print("len val:", val_size)

len train: 899331
len val: 373726


In [23]:
# Row used for a manual spot check of the training split.
idx = 530000

# Fetch the row once and reuse it for all three fields instead of re-indexing
# the dataset for every field.
train_row = train_ds[idx]

print("Train:")
print("embedding shape: ", train_row["embedding"].shape)
# label and file_key are stored as integer ids; the maps translate them back to names.
print("label: ", label_map[train_row["label"].item()])
print("file key: ", train_file_map[train_row["file_key"].item()])

Train:
embedding shape:  (1024,)
label:  G4
file key:  P8811_D14_Scan1_G4_9_538_886.tif


In [48]:
# Row used for a manual spot check of the validation split.
idx = 310000

# Fetch the row once and reuse it for all three fields instead of re-indexing
# the dataset for every field.
val_row = val_ds[idx]

print("Validation:")
print("embedding shape: ", val_row["embedding"].shape)
# label and file_key are stored as integer ids; the maps translate them back to names.
print("label: ", label_map[val_row["label"].item()])
print("file key: ", val_file_map[val_row["file_key"].item()])

Validation:
embedding shape:  (1024,)
label:  Stroma
file key:  11366_18_S1_HE_Scan1_Stroma_3_8503_6945.tif


In [65]:
# Fraction of training rows belonging to each class.
# The total is computed once, outside the loop.
n_train = len(train_ds)

# Iterate over the class ids present in label_map rather than a hard-coded count.
for i in sorted(label_map):
    class_view = train_ds.query(
        f"""
            SELECT * WHERE label == {i}
            """
    )
    # len() of the query view is the number of matching rows, so the label
    # column does not have to be materialised just to be counted.
    print(label_map[i], "composition:", round(len(class_view) / n_train, 4))

Normal composition: 0.1466
Stroma composition: 0.168
G3 composition: 0.2479
G4 composition: 0.4057
G5 composition: 0.0319


In [79]:
# Display the label map (integer class id -> class name) for reference.
label_map

{0: 'Normal', 1: 'Stroma', 2: 'G3', 3: 'G4', 4: 'G5'}

In [81]:
# Select the training rows whose label is 1 ('Stroma' in label_map).
stroma_query = "SELECT * where label == 1"
stroma_view = train_ds.query(stroma_query)

# Wrap the view with the PyTorch adapter and take every row in one slice.
stroma_train = stroma_view.pytorch()[:]

stroma_train["file_key"]

array([748289, 748290, 748291, ..., 899334, 899335, 899336], dtype=int32)

In [None]:
# Translate every selected row's integer file key back to its file name.
# stroma_train["file_key"] is displayed above as a NumPy array, which has no
# .numpy() method; np.asarray handles an ndarray and would also accept a CPU tensor.
stroma_file_keys = np.asarray(stroma_train["file_key"]).tolist()
stroma_train_files = [train_file_map[key] for key in stroma_file_keys]

# Keep the full list in stroma_train_files and show only a short preview,
# so the notebook output is not flooded with every file name.
stroma_train_files[:10]