In [1]:
import os
import glob
import torch
import numpy as np
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend to emit figures in the 'retina' (high-DPI) format.
%config InlineBackend.figure_format = 'retina'

## Explore dataset

The training and validation datasets are the same for both tasks, but the test sets are different. The test set of the tri-subject task appears to contain some duplicates, whereas the kinship test images appear to contain no duplicates.

Additionally, we explore the distribution of image dimensions across all three datasets (train, validation and test) for the kinship task.

In [3]:
# Glob patterns: person directories (MID*) for train/val, with the jpg images
# inside them; the test split is a flat folder of jpg images.
person_path_train = 'kinship_ver_t1/train-faces/*/MID*'
person_path_val = 'kinship_ver_t1/val-faces/*/MID*'
image_paths_train, image_paths_val = (
    os.path.join(person_pattern, '*.jpg')
    for person_pattern in (person_path_train, person_path_val)
)
image_paths_test = 'kinship_ver_t1/test-faces/*.jpg'

# Resolve every pattern to the list of matching paths on disk.
t_persons_paths, v_persons_paths = glob.glob(person_path_train), glob.glob(person_path_val)
t_images_paths, v_images_paths = glob.glob(image_paths_train), glob.glob(image_paths_val)
test_images_paths = glob.glob(image_paths_test)

# Report how many persons / images each split contains.
print(f'Train persons: {len(t_persons_paths)}')
print(f'Train images: {len(t_images_paths)}')
print(f'Val persons: {len(v_persons_paths)}')
print(f'Val images: {len(v_images_paths)}')
print(f'Test images: {len(test_images_paths)}')

Train persons: 3021
Train images: 15845
Val persons: 966
Val images: 5045
Test images: 5226


In [4]:
for name, images_paths in (
    ('Train', t_images_paths),
    ('Valid', v_images_paths),
    ('Test', test_images_paths),
):
    # Decode every image of the split and keep only its array shape.
    shapes = [io.imread(img_path).shape for img_path in images_paths]

    # Tally how many images share each distinct shape.
    sh_cnt = Counter(shapes)
    print(f'{name} dataset - shape values: {sh_cnt}')
    print('\n')

Train dataset - shape values: Counter({(124, 108, 3): 15839, (224, 224, 3): 6})


Valid dataset - shape values: Counter({(124, 108, 3): 4995, (224, 224, 3): 50})


Test dataset - shape values: Counter({(124, 108, 3): 5217, (224, 224, 3): 9})




The vast majority of images have shape 124x108; only 65 out of ~26k images have shape 224x224.
So we'll adopt 124x108 as the standard shape and resize the images in all datasets to it.

## Dataset creation

In [5]:
class FaceImagesDataset(Dataset):
    """Unlabelled dataset of images collected from a glob pattern.

    Each item is a single image read from disk, cast to ``float32`` and
    divided by 255 (which maps 8-bit pixel values into ``[0, 1]``), then
    passed through the optional ``transform``.

    Args:
        path_pattern: Glob pattern matching the image files to include.
        transform: Optional callable applied to the scaled image array;
            whatever it returns is what ``__getitem__`` returns.
    """

    def __init__(self, path_pattern, transform=None):
        self.transform = transform
        # glob order depends on the file system; sort so that a given index
        # always maps to the same file across runs and machines.
        self.root_paths = sorted(glob.glob(path_pattern))

    def __len__(self):
        """Return the number of files matched by the pattern."""
        return len(self.root_paths)

    def __getitem__(self, idx):
        """Load, scale and optionally transform the image at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = io.imread(self.root_paths[idx])
        # Cast before dividing: a plain `image / 255.` yields float64, which
        # ToTensor would pass through as a DoubleTensor instead of PyTorch's
        # default float32.
        image = image.astype(np.float32) / 255.

        if self.transform:
            image = self.transform(image)

        return image

In [6]:
# Common (height, width) that every image is resized to.
IMAGE_HEIGHT, IMAGE_WIDTH = 124, 108

# Preprocessing shared by all splits: array -> tensor, then resize.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((IMAGE_HEIGHT, IMAGE_WIDTH)),
])

# One dataset per split, all built with the same preprocessing.
train_dataset, val_dataset, test_dataset = (
    FaceImagesDataset(split_pattern, transform=transform)
    for split_pattern in (
        'kinship_ver_t1/train-faces/*/MID*/*.jpg',
        'kinship_ver_t1/val-faces/*/MID*/*.jpg',
        'kinship_ver_t1/test-faces/*.jpg',
    )
)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

Train dataset size: 15845
Validation dataset size: 5045
Test dataset size: 5226


In [None]:
BATCH_SIZE = 4
SHUFFLE = True  # applies to the training loader only

# Only the training data is shuffled. The datasets yield bare images with no
# file identifiers, so validation/test batches must keep dataset order to stay
# aligned with `val_dataset.root_paths` / `test_dataset.root_paths`.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Show example of batch from DataLoader

# Helper function to show a batch
def show_landmarks_batch(sample_batched):
    """Draw a batch of images as a single grid on the current matplotlib axes.

    Despite its name, this function draws no landmarks: it only tiles the
    batch with ``utils.make_grid`` and displays the result with ``plt.imshow``.

    Args:
        sample_batched: Batch of images handed to ``utils.make_grid``
            (presumably a ``(B, C, H, W)`` tensor from the DataLoader --
            TODO confirm against the caller).
    """
    grid = utils.make_grid(sample_batched)
    # The grid is channel-first; imshow expects the channel axis last.
    plt.imshow(grid.numpy().transpose((1, 2, 0)))

for i_batch, sample_batched in enumerate(train_dataloader):
    print(i_batch, sample_batched.size())

    # Keep drawing batches until the 4th one (index 3) comes up.
    if i_batch != 3:
        continue

    # Plot that batch, then stop iterating.
    plt.figure()
    show_landmarks_batch(sample_batched)
    plt.axis('off')
    plt.ioff()
    plt.show()
    break