In [40]:
import pandas as pd
import torch
import torchvision
import os
from tqdm import tqdm_notebook
import zipfile as zf
from torch.utils.data import DataLoader, Subset, Dataset, TensorDataset
from torchvision.io import read_image
from torchvision.transforms import v2
from sklearn.model_selection import train_test_split
import numpy as np

In [31]:
files = zf.ZipFile('/content/drive/MyDrive/archive.zip','r')
files.extractall()

In [32]:
path = 'UTKFace/'
data = pd.DataFrame({'file_path': [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]})
data['age'] = data['file_path'].apply(lambda x: int(x.split('_')[0]))
data['gender'] = data['file_path'].apply(lambda x: int(x.split('_')[1]))
# data['ethnicity'] = data['file_path'].apply(lambda x:  int(x.split('_')[2]))
data

Unnamed: 0,file_path,age,gender
0,68_0_2_20170116193700812.jpg.chip.jpg,68,0
1,58_0_0_20170111171747508.jpg.chip.jpg,58,0
2,26_1_1_20170112211649085.jpg.chip.jpg,26,1
3,40_0_0_20170104210228028.jpg.chip.jpg,40,0
4,22_1_4_20170117193958245.jpg.chip.jpg,22,1
...,...,...,...
23703,37_1_0_20170117140027921.jpg.chip.jpg,37,1
23704,27_0_0_20170117120152773.jpg.chip.jpg,27,0
23705,4_1_4_20170109191223274.jpg.chip.jpg,4,1
23706,61_0_1_20170117193727094.jpg.chip.jpg,61,0


In [33]:
class ImageDataset(Dataset):
    def __init__(self, img_dir, transform=None, target_transform=None):
        self.img_labels = pd.DataFrame(
            {'file_path': [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(path, f))]})
        self.img_labels['age'] = self.img_labels['file_path'].apply(lambda x: int(x.split('_')[0]))
        self.img_labels['gender'] = self.img_labels['file_path'].apply(lambda x: int(x.split('_')[1]))
        # self.img_labels['ethnicity'] = self.img_labels['file_path'].apply(lambda x:  int(x.split('_')[2]))
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label_age = self.img_labels.iloc[idx, 1]
        label_gender = self.img_labels.iloc[idx, 2]
        # label_ethnicity = self.img_labels.iloc[idx, 3]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label_age = self.target_transform(label_age)
            label_gender = self.target_transform(label_gender)
        return image, label_age, label_gender

In [34]:
class Dataset(Dataset):
    def __init__(self, paths, gender, age, dir_path = '', transform=None):
        super().__init__()
        self.paths = paths
        self.gender = gender
        self.age = age
        self.dir_path = dir_path
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, ind):

        img = read_image(self.dir_path+self.paths[ind])


        img = self.transform(img / 255)



        return img, self.gender[ind], self.age[ind]

In [35]:
dataset1 = Dataset(
    paths = data['file_path'].tolist(),
    dir_path = 'UTKFace/',
    gender = data['gender'].tolist(),
    age = data['age'].tolist(),
    transform=v2.Compose([
        v2.Resize((200, 200)),
        #v2.ToImage(),
        v2.ToTensor(),
        v2.ToDtype(torch.float, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
)



In [36]:
dataset1[0]



(tensor([[[-0.8164, -0.7993, -0.7822,  ..., -0.1999,  0.0741,  0.2453],
          [-0.8164, -0.7993, -0.7822,  ..., -0.2513,  0.0056,  0.1597],
          [-0.8164, -0.7993, -0.7822,  ..., -0.3027, -0.0801,  0.0569],
          ...,
          [-1.4843, -1.4843, -1.4843,  ..., -1.0390, -1.0390, -1.0390],
          [-1.4672, -1.4672, -1.4843,  ..., -1.0390, -1.0390, -1.0390],
          [-1.4672, -1.4672, -1.4672,  ..., -1.0390, -1.0390, -1.0390]],
 
         [[-0.6527, -0.6352, -0.6176,  ..., -0.1450,  0.1352,  0.3102],
          [-0.6527, -0.6352, -0.6176,  ..., -0.1975,  0.0651,  0.2227],
          [-0.6527, -0.6352, -0.6176,  ..., -0.2850, -0.0574,  0.0826],
          ...,
          [-1.2304, -1.2304, -1.2304,  ..., -0.7752, -0.7752, -0.7752],
          [-1.2129, -1.2129, -1.2304,  ..., -0.7752, -0.7752, -0.7752],
          [-1.2129, -1.2129, -1.2129,  ..., -0.7752, -0.7752, -0.7752]],
 
         [[-0.5495, -0.5321, -0.5147,  ..., -0.1312,  0.1651,  0.3393],
          [-0.5495, -0.5321,

In [37]:
dataset = ImageDataset(
    img_dir='UTKFace/',
    transform=v2.Compose([
        v2.Resize((200, 200), antialias=True),
        #v2.ToImage(),
        v2.ToTensor(),
        v2.ToDtype(torch.float, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
    target_transform=v2.Compose([
        torch.tensor,
        v2.ToDtype(torch.long)
    ])
)

In [38]:
dataset[0]

(tensor([[[-0.8164, -0.7993, -0.7822,  ..., -0.1999,  0.0741,  0.2453],
          [-0.8164, -0.7993, -0.7822,  ..., -0.2513,  0.0056,  0.1597],
          [-0.8164, -0.7993, -0.7822,  ..., -0.3027, -0.0801,  0.0569],
          ...,
          [-1.4843, -1.4843, -1.4843,  ..., -1.0390, -1.0390, -1.0390],
          [-1.4672, -1.4672, -1.4843,  ..., -1.0390, -1.0390, -1.0390],
          [-1.4672, -1.4672, -1.4672,  ..., -1.0390, -1.0390, -1.0390]],
 
         [[-0.6527, -0.6352, -0.6176,  ..., -0.1450,  0.1352,  0.3102],
          [-0.6527, -0.6352, -0.6176,  ..., -0.1975,  0.0651,  0.2227],
          [-0.6527, -0.6352, -0.6176,  ..., -0.2850, -0.0574,  0.0826],
          ...,
          [-1.2304, -1.2304, -1.2304,  ..., -0.7752, -0.7752, -0.7752],
          [-1.2129, -1.2129, -1.2304,  ..., -0.7752, -0.7752, -0.7752],
          [-1.2129, -1.2129, -1.2129,  ..., -0.7752, -0.7752, -0.7752]],
 
         [[-0.5495, -0.5321, -0.5147,  ..., -0.1312,  0.1651,  0.3393],
          [-0.5495, -0.5321,

In [41]:
train_data, test_data = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)
train_data

array([ 5096, 19586,  9835, ...,   860, 15795, 23654])

In [42]:
train_data = Subset(dataset, train_data)
test_data = Subset(dataset, test_data)
test_data

<torch.utils.data.dataset.Subset at 0x7c603eb738b0>

In [43]:
batch_size = 64
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)