In [2]:
import torch
import torchvision

from data.svodataset import SVODataset

def custom_collate(batch):
    subject_images = []
    verb_images = []
    object_images = []
    spatial_encodings = []
    subject_labels = []
    verb_labels = []
    object_labels = []
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in batch:
        subject_images.append(subject_image)
        verb_images.append(verb_image)
        object_images.append(object_image)
        spatial_encodings.append(spatial_encoding)
        subject_labels.append(subject_label)
        verb_labels.append(verb_label)
        object_labels.append(object_label)

    return subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels

dataset = SVODataset(
    name = 'Custom',
    data_root = 'Custom',
    csv_path = 'Custom/annotations/dataset_v4_2_train.csv',
    training = True
)

data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size = 16,
    shuffle = True,
    collate_fn = custom_collate
)

In [2]:
len(data_loader)

189

In [3]:
count = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        print(f"Iter {count}")
        if subject_image is not None:
            print(f'Subject image shape: {subject_image.shape}')
        if verb_image is not None:
            print(f'Verb image shape: {verb_image.shape}')
        if object_image is not None:
            print(f'Object image shape: {object_image.shape}')
        if spatial_encoding is not None:
            print(f'Spatial encoding shape: {spatial_encoding.shape}')
        if subject_label is not None:
            print(f'Subject label shape: {subject_label.shape}')
        if verb_label is not None:
            print(f'Verb label shape: {verb_label.shape}')
        if object_label is not None:
            print(f'Object label shape: {object_label.shape}')
        count += 1

Iter 0
Subject image shape: torch.Size([3, 746, 736])
Verb image shape: torch.Size([3, 746, 736])
Object image shape: torch.Size([3, 99, 265])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 1
Subject image shape: torch.Size([3, 607, 350])
Verb image shape: torch.Size([3, 613, 1068])
Object image shape: torch.Size([3, 554, 958])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 2
Subject image shape: torch.Size([3, 761, 471])
Verb image shape: torch.Size([3, 761, 471])
Object image shape: torch.Size([3, 94, 141])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 3
Subject image shape: torch.Size([3, 659, 781])
Verb image shape: torch.Size([3, 659, 781])
Spatial encoding shape: tor

In [4]:
test_dataset = SVODataset(
        name = 'Custom',
        data_root = 'Custom',
        csv_path = 'Custom/annotations/dataset_v4_2_val.csv',
        training = True
    )

In [7]:
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = 16,
    shuffle = True,
    collate_fn = custom_collate
)

In [8]:
len(test_loader)

56

In [13]:
count = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        if subject_image is None or verb_image is None or object_image is None or subject_label is None or verb_label is None or object_label is None:
            count += 1
print(f"Incomplete {count}")

Incomplete 776


In [2]:
count = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        if subject_image is None or verb_image is None or object_image is None:
            count += 1
print(f"Incomplete {count}")

Incomplete 776


In [20]:
min_dim = float('inf')
max_dim = -float('inf')
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        if subject_image is not None:
            # print(subject_image.shape)
            min_dim = min(min(subject_image.shape[-2:]), min_dim)
            max_dim = max(max(subject_image.shape), max_dim)
        if verb_image is not None:
            min_dim = min(min(verb_image.shape[-2:]), min_dim)
            max_dim = max(max(verb_image.shape), max_dim)
        if object_image is not None:
            min_dim = min(min(object_image.shape[-2:]), min_dim)
            max_dim = max(max(object_image.shape), max_dim)
        # if subject_label is not None:
        #     min_dim = min(min(subject_label.shape), min_dim)
        #     max_dim = max(max(subject_label.shape), max_dim)
        # if verb_label is not None:
        #     min_dim = min(min(verb_label.shape), min_dim)
        #     max_dim = max(max(verb_label.shape), max_dim)
        # if object_label is not None:
        #     min_dim = min(min(object_label.shape), min_dim)
        #     max_dim = max(max(object_label.shape), max_dim)
print("Min dim", min_dim)
print("max_dim", max_dim)

Min dim 4
max_dim 1332


In [21]:
max_subj = 0
max_verb = 0
max_obj = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        # if subject_image is not None:
        #     # print(subject_image.shape)
        #     min_dim = min(min(subject_image.shape[-2:]), min_dim)
        #     max_dim = max(max(subject_image.shape), max_dim)
        # if verb_image is not None:
        #     min_dim = min(min(verb_image.shape[-2:]), min_dim)
        #     max_dim = max(max(verb_image.shape), max_dim)
        # if object_image is not None:
        #     min_dim = min(min(object_image.shape[-2:]), min_dim)
        #     max_dim = max(max(object_image.shape), max_dim)
        if subject_label is not None:
            max_subj = max(max_subj, subject_label.max())
        if verb_label is not None:
            max_verb = max(max_verb, verb_label.max())
        if object_label is not None:
            max_obj = max(max_obj, object_label.max())

print(max_subj, max_verb, max_obj)

tensor(4) tensor(7) tensor(11)


In [22]:
max_subj = 0
max_verb = 0
max_obj = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in test_loader:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        # if subject_image is not None:
        #     # print(subject_image.shape)
        #     min_dim = min(min(subject_image.shape[-2:]), min_dim)
        #     max_dim = max(max(subject_image.shape), max_dim)
        # if verb_image is not None:
        #     min_dim = min(min(verb_image.shape[-2:]), min_dim)
        #     max_dim = max(max(verb_image.shape), max_dim)
        # if object_image is not None:
        #     min_dim = min(min(object_image.shape[-2:]), min_dim)
        #     max_dim = max(max(object_image.shape), max_dim)
        if subject_label is not None:
            max_subj = max(max_subj, subject_label.max())
        if verb_label is not None:
            max_verb = max(max_verb, verb_label.max())
        if object_label is not None:
            max_obj = max(max_obj, object_label.max())

print(max_subj, max_verb, max_obj)

tensor(4) tensor(7) tensor(11)


In [3]:
dataset_224 = SVODataset(
    name = 'Custom',
    data_root = 'Custom',
    csv_path = 'Custom/annotations/dataset_v4_2_train.csv',
    training = True,
    
)

data_loader_resnet = torch.utils.data.DataLoader(
    dataset_224,
    batch_size = 16,
    shuffle = True,
    collate_fn = custom_collate
)

In [4]:
count = 0
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader_resnet:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        print(f"Iter {count}")
        if subject_image is not None:
            print(f'Subject image shape: {subject_image.shape}')
        if verb_image is not None:
            print(f'Verb image shape: {verb_image.shape}')
        if object_image is not None:
            print(f'Object image shape: {object_image.shape}')
        if spatial_encoding is not None:
            print(f'Spatial encoding shape: {spatial_encoding.shape}')
        if subject_label is not None:
            print(f'Subject label shape: {subject_label.shape}')
        if verb_label is not None:
            print(f'Verb label shape: {verb_label.shape}')
        if object_label is not None:
            print(f'Object label shape: {object_label.shape}')
        count += 1
            # if subject_image is not None and verb_image is not None and object_image is not None:
            #     to_pil_image = torchvision.transforms.ToPILImage()
            #     subject_image_pil = to_pil_image(subject_image)
            #     verb_image_pil = to_pil_image(verb_image)
            #     object_image_pil = to_pil_image(object_image)
            #     subject_image_pil.save('subject_example_image224.jpg')
            #     verb_image_pil.save('verb_example_image224.jpg')
            #     object_image_pil.save('object_example_image224.jpg')
            #     # NOTE: These example images will look corrupted. That's because
            #     # they're standardized to have mean 0 std 1 for each color channel,
            #     # and PIL isn't aware of this, so it has no way of "unstandardizing"
            #     # them to get the raw pixel data. But you can generally still make
            #     # out the content of the images to some extent
            #     raise

Iter 0
Subject image shape: torch.Size([3, 134, 51])
Verb image shape: torch.Size([3, 156, 67])
Object image shape: torch.Size([3, 77, 55])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 1
Subject image shape: torch.Size([3, 107, 45])
Verb image shape: torch.Size([3, 123, 85])
Object image shape: torch.Size([3, 121, 84])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 2
Subject image shape: torch.Size([3, 47, 55])
Verb image shape: torch.Size([3, 53, 69])
Object image shape: torch.Size([3, 10, 10])
Spatial encoding shape: torch.Size([2, 36])
Subject label shape: torch.Size([])
Verb label shape: torch.Size([])
Object label shape: torch.Size([])
Iter 3
Subject image shape: torch.Size([3, 91, 42])
Verb image shape: torch.Size([3, 107, 61])
Object image shape: torch.Size([3, 36, 60]

KeyboardInterrupt: 

In [5]:
min_dim = float('inf')
max_dim = -float('inf')
for subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels in data_loader_resnet:
    for subject_image, verb_image, object_image, spatial_encoding, subject_label, verb_label, object_label in zip(subject_images, verb_images, object_images, spatial_encodings, subject_labels, verb_labels, object_labels):
        if subject_image is not None:
            # print(subject_image.shape)
            min_dim = min(min(subject_image.shape[-2:]), min_dim)
            max_dim = max(max(subject_image.shape), max_dim)
        if verb_image is not None:
            min_dim = min(min(verb_image.shape[-2:]), min_dim)
            max_dim = max(max(verb_image.shape), max_dim)
        if object_image is not None:
            min_dim = min(min(object_image.shape[-2:]), min_dim)
            max_dim = max(max(object_image.shape), max_dim)
        # if subject_label is not None:
        #     min_dim = min(min(subject_label.shape), min_dim)
        #     max_dim = max(max(subject_label.shape), max_dim)
        # if verb_label is not None:
        #     min_dim = min(min(verb_label.shape), min_dim)
        #     max_dim = max(max(verb_label.shape), max_dim)
        # if object_label is not None:
        #     min_dim = min(min(object_label.shape), min_dim)
        #     max_dim = max(max(object_label.shape), max_dim)
print("Min dim", min_dim)
print("max_dim", max_dim)

Min dim 1
max_dim 224
