In [1]:
# Experiment: effect of nodule size in the embedding space, using the LIDC dataset.
# 1. Select two batches of nodules: the smallest 5% and the largest 5% by volume.
# 2. Get their embeddings and compute the difference between the two batches.
# 3. Select another batch of nodules (any batch will do).
# 4. Shift that batch's embeddings along the direction of the difference vector.


In [2]:
import os
import os.path as osp
import sys
# Put the project checkout on the import path at index 1; presumably the
# project-local packages imported in the following cells are resolved from it.
vae_code_dir = '/labs/gevaertlab/users/yyhhli/code/vae/'
sys.path.insert(1, vae_code_dir)

In [3]:
# import lidc dataset
from datasets import PATCH_DATASETS
from datasets.utils import sitk2tensor
# Look the dataset class up once; the two datasets differ only in `split`.
lidc_patch_dataset_cls = PATCH_DATASETS['LIDCPatchAugDataset']
lidc_train = lidc_patch_dataset_cls(root_dir=None, transform=sitk2tensor, split='train')
lidc_val = lidc_patch_dataset_cls(root_dir=None, transform=sitk2tensor, split='val')

  from .autonotebook import tqdm as notebook_tqdm
[04-12 17:38:45 | patch_ds:LIDCPatchAugDataset] patient split: train:699, val:88, test:88
[04-12 17:38:45 | patch_ds:LIDCPatchAugDataset] patient split: train:699, val:88, test:88


In [4]:
# get dataloaders
from torch.utils.data.dataloader import DataLoader
# Both loaders share the same settings; shuffle=False keeps the sample order
# fixed so exported embeddings line up with dataset order.
loader_kwargs = dict(batch_size=36, shuffle=False, drop_last=False, num_workers=4, pin_memory=False)
lidc_train_dataloader = DataLoader(dataset=lidc_train, **loader_kwargs)
lidc_val_dataloader = DataLoader(dataset=lidc_val, **loader_kwargs)

In [5]:
# import exporter
from evaluations.export import Exporter

# Map split name -> loader, then build the exporter for the "volume" task.
split_dataloaders = {"train": lidc_train_dataloader, "val": lidc_val_dataloader}
exporter = Exporter(
    log_name="VAE3D32AUG",
    version=60,
    dataloaders=split_dataloaders,
    task_names=["volume"],
)

[04-12 17:39:02 | instantiator:  <module>] Created a temporary directory at /tmp/tmpoqcxin4t
[04-12 17:39:02 | instantiator:    _write] Writing /tmp/tmpoqcxin4t/_remote_module_non_sriptable.py
[04-12 17:39:06 | patch_ds:LIDCPatchAugDataset] patient split: train:699, val:88, test:88
[04-12 17:39:06 | patch_ds:LNDbPatch32AugDataset] patient split: train:168, val:22, test:22
[04-12 17:39:06 | patch_stanfordradiogenomics:StanfordRadiogenomicsPatchAugDataset] patient split: train:100, test:43


In [6]:
# Run the export (about 75 s in the recorded run) and unpack its three
# results; each is indexed by split name further down ("train", ...).
exported = exporter.get_data()
embeddings, data_names, label_dict = exported

[04-12 17:39:06 | export:  Exporter] initializing embeddings
[04-12 17:39:10 | patch_ds:LIDCPatchAugDataset] patient split: train:699, val:88, test:88
[04-12 17:39:10 | patch_ds:LNDbPatch32AugDataset] patient split: train:168, val:22, test:22
[04-12 17:39:10 | patch_stanfordradiogenomics:StanfordRadiogenomicsPatchAugDataset] patient split: train:100, test:43


 | 75.0 secs.


In [7]:
# select smallest and largest 5% nodules
import numpy as np
volume = np.array(label_dict["volume"]['train']) # numpy array

# Fraction of the training nodules that goes into each extreme batch.
EXTREME_FRACTION = 0.05
n_extreme = int(len(volume) * EXTREME_FRACTION)

# Sort once (ascending volume) and slice both ends of the ordering.
# The upper slice uses an explicit start index instead of [-n_extreme:]:
# with n_extreme == 0 (fewer than 20 samples), [-0:] would select every
# nodule rather than none, and the two batches would no longer match in size.
volume_order = volume.argsort()
smallest_5_idx = volume_order[:n_extreme]
largest_5_idx = volume_order[len(volume_order) - n_extreme:]

In [8]:
# Mean volume gap between the two extreme batches; both index arrays are in
# ascending-volume order, so the subtraction pairs them rank by rank.
volume_gap = volume[largest_5_idx] - volume[smallest_5_idx]
volume_gap.mean()

7444.667388783828

In [9]:
# Keep only the training-split embeddings; the extreme-volume indices were
# computed from the training volumes, so they index into this collection.
embeddings_train = embeddings["train"]

In [14]:
# Rebind as an ndarray so rows can be selected with integer index arrays.
embeddings_train = np.array(embeddings_train)

In [15]:
# Sanity check: rows are training samples, columns are embedding dimensions.
embeddings_train.shape

(5304, 4096)

In [17]:
# get their embeddings
# Row-select the embeddings belonging to each extreme-volume batch.
smallest_5_embeddings = embeddings_train[smallest_5_idx, :]
largest_5_embeddings = embeddings_train[largest_5_idx, :]

In [19]:
# Peek at the first five sample names of the training split.
data_names["train"][:5]

['LIDC-IDRI-0001.84.Aug00',
 'LIDC-IDRI-0001.85.Aug00',
 'LIDC-IDRI-0001.86.Aug00',
 'LIDC-IDRI-0001.87.Aug00',
 'LIDC-IDRI-0002.88.Aug00']

In [23]:
# calculate the difference
# Per-dimension mean embedding of each batch; their difference is the
# direction pointing from the "small" batch towards the "large" batch.
smallest_5_embeddings_mean = np.mean(smallest_5_embeddings, axis=0)
largest_5_embeddings_mean = np.mean(largest_5_embeddings, axis=0)
diff_vector = np.subtract(largest_5_embeddings_mean, smallest_5_embeddings_mean)

In [25]:
# select another batch of nodules from val dataset
# Take a single batch from the validation loader. The loader was built with
# shuffle=False and batch_size=36, so this is the leading 36 validation
# samples (by index) on every run.
test_nodules = next(iter(lidc_val_dataloader))

In [28]:
# Number of samples in the first element of the batch (the tensor that is
# encoded and visualized below).
len(test_nodules[0])

36

In [29]:
# Shape of a single sample from the batch.
test_nodules[0][0].shape

torch.Size([1, 32, 32, 32])

In [56]:
# visualize the nodules
from utils.visualization import vis3d_tensor
# Output directory for every image written by this notebook.
vis_dir = "/labs/gevaertlab/users/yyhhli/code/vae/EDA/image_synthesize_experiment/results/"

# Save the unmodified validation batch as the visual baseline.
orig_save_path = osp.join(vis_dir, "test_nodules_orig.jpeg")
vis3d_tensor(test_nodules[0], save_path=orig_save_path)

In [32]:
# generate images
from evaluations.evaluator import ReconEvaluator
# Same run (log name and version) as the exporter that produced the embeddings.
evaluator = ReconEvaluator(
    vis_dir=vis_dir,
    log_name='VAE3D32AUG',
    version=60,
)


[04-12 18:37:25 | patch_ds:LIDCPatchAugDataset] patient split: train:699, val:88, test:88
[04-12 18:37:25 | patch_ds:LNDbPatch32AugDataset] patient split: train:168, val:22, test:22
[04-12 18:37:25 | patch_stanfordradiogenomics:StanfordRadiogenomicsPatchAugDataset] patient split: train:100, test:43


In [48]:
# get embeddings
# Encode the validation images; the result is indexed as embeds[0] and
# embeds[1] in the following cells.
# NOTE(review): this runs with autograd enabled -- consider torch.no_grad()
# if gradients are never needed here.
val_images = test_nodules[0]
embeds = evaluator.module.model.encode(val_images)

In [51]:
# Show the shapes of the two encoder outputs side by side.
first_shape, second_shape = embeds[0].shape, embeds[1].shape
print(first_shape, second_shape)

torch.Size([36, 2048]) torch.Size([36, 2048])


In [54]:
# corrupt the nodules embeddings
# torch is first used in this cell but was only imported in a later one, so a
# fresh top-to-bottom run failed here with NameError; import it where needed.
import torch

# diff_vector was built from the exported embeddings, which are wider than the
# tensors returned by encode(), so only its leading slice matching embeds[0]
# is applied.
# NOTE(review): presumably that leading part corresponds to embeds[0] (e.g.
# the latent mean) -- confirm against the exporter.
latent_dim = embeds[0].shape[1]
size_direction = torch.from_numpy(diff_vector[:latent_dim]).to(
    device=evaluator.module.device, dtype=torch.float32
)

# Shift every embedding in the batch towards the "large nodule" direction.
corrupted_embeddings = embeds[0] + size_direction

In [55]:
# Sanity check: the shifted embeddings should keep the shape of embeds[0].
corrupted_embeddings.shape

torch.Size([36, 2048])

In [58]:
import torch
# Generate images from the shifted embeddings (presumably by running them
# through the model's decoder -- confirm in ReconEvaluator.generate).
synth_img = evaluator.generate(corrupted_embeddings)

In [59]:
# Sanity check: one generated volume per input sample.
synth_img.shape

torch.Size([36, 1, 32, 32, 32])

In [60]:
# Save the volumes generated from the "enlarged" embeddings in the same
# directory as the baseline image.
corrupted_save_path = osp.join(vis_dir, "test_nodules_corrupted.jpeg")
vis3d_tensor(synth_img, save_path=corrupted_save_path)

In [61]:
# corrupt the nodules embeddings to the other direction
# Same size-direction slice as before, converted to a float tensor on CPU and
# then moved to the model's device; subtracting it moves the batch towards
# the "small nodule" end instead.
size_direction_cpu = torch.from_numpy(diff_vector[:2048]).type(torch.FloatTensor)
corrupted_embeddings_smaller = embeds[0] - size_direction_cpu.to(evaluator.module.device)

In [62]:
# Generate images from the embeddings shifted in the "smaller" direction.
synth_img_smaller = evaluator.generate(corrupted_embeddings_smaller)

In [None]:
# visualize
# Save the volumes generated from the "shrunk" embeddings.
smaller_save_path = osp.join(vis_dir, "test_nodules_corrupted_smaller.jpeg")
vis3d_tensor(synth_img_smaller, save_path=smaller_save_path)