In [1]:
import os
import tarfile
from shutil import copyfile

import gdown
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import torch
import torchvision
from PIL import Image
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import datasets, transforms

In [2]:
# Root folders of the two datasets; every path below is derived from them.
CELEBA_ROOT = '/content/celeba'
FLOWERS_ROOT = '/content/flowers102'

# CelebA image folder, CelebA partition list, and Flowers102 image folder.
celeba_images_path = f'{CELEBA_ROOT}/img_align_celeba'
partition_file_path = f'{CELEBA_ROOT}/list_eval_partition.txt'
flowers_images_path = f'{FLOWERS_ROOT}/jpg'

In [3]:
file_id = '1h_oR2jMJWO3JbIM63Oj2-2SjvnZkN3NI'
destination = 'celeba.zip'

gdown.download(f'https://drive.google.com/uc?id={file_id}', destination, quiet=False)

!unzip celeba.zip -d /content/celeba

Downloading...
From (original): https://drive.google.com/uc?id=1h_oR2jMJWO3JbIM63Oj2-2SjvnZkN3NI
From (redirected): https://drive.google.com/uc?id=1h_oR2jMJWO3JbIM63Oj2-2SjvnZkN3NI&confirm=t&uuid=84781b16-08bd-4a79-a37d-09f9d3e0f474
To: /content/celeba.zip
100%|██████████| 1.44G/1.44G [00:11<00:00, 131MB/s]


Archive:  celeba.zip
replace /content/celeba/img_align_celeba/000001.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
# Download the CelebA partition list into the CelebA folder.
file_id = '1ntYY__wPA50wd_zxwIw_h-TnfvWuXrKp'
destination = '/content/celeba/list_eval_partition.txt'

partition_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(partition_url, destination, quiet=False)

KeyboardInterrupt: 

In [None]:
# Fetch the Flowers102 archive from Google Drive and unpack it.
file_id = '18-Lq0PpRpfiNHV7CozqDQUbH1iTrtnMT'
destination = 'flowers102.tgz'

gdown.download(f'https://drive.google.com/uc?id={file_id}', destination, quiet=False)

# Absolute path: the later cells read from /content/flowers102, so the
# extraction target must not depend on the current working directory.
output_dir = '/content/flowers102'
os.makedirs(output_dir, exist_ok=True)

with tarfile.open(destination) as tar:
    # Use the 'data' extraction filter when the running Python provides it;
    # older interpreters without extraction filters fall back to the plain call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=output_dir, filter='data')
    else:
        tar.extractall(path=output_dir)

In [None]:
# Download the Flowers102 split definition (setid.mat) into the dataset folder.
file_id = '1ekVLhSiTgDiQAj76d3JxRrepq7nZvh_L'
destination = '/content/flowers102/setid.mat'

setid_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(setid_url, destination, quiet=False)

In [None]:
# Load the Flowers102 split definition. scipy.io is imported in the notebook's
# import cell rather than here, so all dependencies are visible in one place.
mat = scipy.io.loadmat('/content/flowers102/setid.mat')

print(mat.keys())

# Each key holds the image ids of one split; flatten() turns them into 1-D arrays.
train_ids = mat['trnid'].flatten()    # Training set
valid_ids = mat['valid'].flatten()    # Validation set
test_ids = mat['tstid'].flatten()     # Testing set

print("Training set IDs:", train_ids)
print("Validation set IDs:", valid_ids)
print("Testing set IDs:", test_ids)


In [None]:
# Create one output folder per Flowers102 split (no-op when it already exists).
for split_dir in ('train', 'val', 'test'):
    os.makedirs(f'/content/flowers102/{split_dir}', exist_ok=True)

def copy_images(set_ids, source_folder, target_folder):
    """Copy the images listed in ``set_ids`` from one folder to another.

    Each id is mapped to the file name ``image_<id zero-padded to 5 digits>.jpg``.
    Files missing from ``source_folder`` are reported on stdout and skipped.

    Args:
        set_ids: iterable of integer image ids.
        source_folder: folder that contains the source JPEG files.
        target_folder: existing folder that receives the copies.
    """
    for image_number in set_ids:
        jpg_name = f'image_{image_number:05d}.jpg'
        source_path = os.path.join(source_folder, jpg_name)

        # Guard clause: report the missing file and move on to the next id.
        if not os.path.exists(source_path):
            print(f"Fájl nem található: {source_path}")
            continue

        copyfile(source_path, os.path.join(target_folder, jpg_name))

# Distribute the Flowers102 images into their split folders, one split at a time.
for split_ids, split_folder in (
    (train_ids, '/content/flowers102/train'),
    (valid_ids, '/content/flowers102/val'),
    (test_ids, '/content/flowers102/test'),
):
    copy_images(split_ids, flowers_images_path, split_folder)

print("Képek sikeresen felosztva!")

In [None]:
# Pie chart of how many image ids each Flowers102 split contains.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell,
# so it is not imported again here.
train_size = len(train_ids)
valid_size = len(valid_ids)
test_size = len(test_ids)

sizes = [train_size, valid_size, test_size]
labels = ['Training', 'Validation', 'Testing']
colors = ['#ff9999','#66b3ff','#99ff99']

plt.figure(figsize=(7, 7))
# explode pulls every wedge slightly away from the centre.
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, explode=(0.05, 0.05, 0.05))

# Equal axis scaling keeps the pie circular.
plt.axis('equal')

plt.title('Flowers102 Dataset Distribution (Training, Validation, Testing)', fontsize=14)

plt.show()


In [None]:
# Shared preprocessing: resize to 64x64, convert to a tensor, then normalise
# with mean 0.5 and std 0.5.
IMAGE_SIZE = (64, 64)
preprocessing_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
# Wrap both dataset roots with ImageFolder, then batch them in shuffled groups of 64.
# NOTE(review): ImageFolder presumably takes its labels from the sub-folders of
# `root`. /content/flowers102 holds jpg/ plus the train/, val/ and test/ copies
# made by earlier cells, so flowers_data likely indexes every image twice with
# folder names as labels — confirm this is intended.
celeba_data = datasets.ImageFolder(root='/content/celeba', transform=transform)
flowers_data = datasets.ImageFolder(root='/content/flowers102', transform=transform)

celeba_loader = DataLoader(celeba_data, shuffle=True, batch_size=64)
flowers_loader = DataLoader(flowers_data, shuffle=True, batch_size=64)

In [None]:
# Pull one batch from each loader and print the image tensor shape.
celeba_data_iter = iter(celeba_loader)
celeba_batch = next(celeba_data_iter)
images, labels = celeba_batch
print(images.shape)  # e.g. torch.Size([64, 3, 64, 64])

flowers_data_iter = iter(flowers_loader)
flowers_batch = next(flowers_data_iter)
images, labels = flowers_batch
print(images.shape)  # e.g. torch.Size([64, 3, 64, 64])

In [None]:
# Map every CelebA file name to its integer partition id, read from the
# whitespace-separated "<image name> <partition>" lines of the partition file.
partition_dict = {}
with open(partition_file_path, 'r') as file:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in file:
        fields = line.split()
        if not fields:
            # A blank line (e.g. at the end of the file) has nothing to unpack
            # and would otherwise raise ValueError below.
            continue
        image_name, partition = fields
        partition_dict[image_name] = int(partition)

In [None]:
class CelebADataset(Dataset):
    """CelebA images restricted to one partition of the evaluation split.

    Args:
        root_dir: folder that contains the image files.
        partition_dict: mapping of image file name -> partition id.
        partition: partition id to keep (0 for train, 1 for val, 2 for test).
        transform: optional callable applied to each loaded PIL image.
    """

    def __init__(self, root_dir, partition_dict, partition, transform=None):
        self.root_dir = root_dir
        self.partition_dict = partition_dict
        self.partition = partition
        self.transform = transform

        # Keep only the images of the requested partition, in mapping order.
        self.image_names = [img for img, p in partition_dict.items() if p == partition]

    def __len__(self):
        """Return the number of images in the selected partition."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        ``image`` is the RGB image after ``transform`` (when one was given);
        ``label`` is the partition id of the image.
        """
        image_name = self.image_names[idx]
        image_path = os.path.join(self.root_dir, image_name)

        # Image.open is lazy and keeps the file open. Loading inside a context
        # manager releases the handle straight away, and convert('RGB') gives
        # every sample three channels so batches stack consistently.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        label = self.partition_dict[image_name]  # the partition number

        return image, label

In [None]:
# Build the train (0), validation (1) and test (2) datasets in one pass; the
# generator yields them in that order and they are unpacked into three names.
celeba_train_dataset, celeba_val_dataset, celeba_test_dataset = (
    CelebADataset(
        root_dir=celeba_images_path,
        partition_dict=partition_dict,
        partition=split_id,
        transform=transform,
    )
    for split_id in (0, 1, 2)
)

In [None]:
# Report how many images ended up in each CelebA split.
train_count = len(celeba_train_dataset)
val_count = len(celeba_val_dataset)
test_count = len(celeba_test_dataset)

print(f'Train dataset size: {train_count}')
print(f'Validation dataset size: {val_count}')
print(f'Test dataset size: {test_count}')

In [None]:
# Histogram of the partition ids: one bin centred on each of 0, 1 and 2.
partitions = list(partition_dict.values())
bin_edges = [-0.5, 0.5, 1.5, 2.5]
tick_positions = [0, 1, 2]
tick_names = ['Train', 'Val', 'Test']

plt.hist(partitions, bins=bin_edges, edgecolor='black')
plt.xticks(tick_positions, tick_names)
plt.title('Distribution of Images in Partitions')
plt.xlabel('Partitions')
plt.ylabel('Number of Images')
plt.show()

In [None]:
def check_image_validity(image_path):
    """Return True when PIL can open and verify the file at ``image_path``.

    An IOError or SyntaxError raised while opening or verifying yields False.
    """
    try:
        with Image.open(image_path) as candidate:
            candidate.verify()  # Verify that it is an image
    except (IOError, SyntaxError):
        return False
    else:
        return True

# Collect the names of all files in the CelebA image folder that fail verification.
invalid_images = []
for file_name in os.listdir(celeba_images_path):
    candidate_path = os.path.join(celeba_images_path, file_name)
    if not check_image_validity(candidate_path):
        invalid_images.append(file_name)
print(f'Invalid CelebA images: {len(invalid_images)}')


In [None]:
# Draw one shuffled batch of 64 for a visual check.
# NOTE(review): the iterator is named train_data_iter but is built from
# celeba_test_dataset — confirm which split is meant to be previewed.
preview_loader = DataLoader(celeba_test_dataset, batch_size=64, shuffle=True)
train_data_iter = iter(preview_loader)
images, labels = next(train_data_iter)

print(f'Images batch shape: {images.shape}')  # e.g. (64, 3, 64, 64)
print(f'Labels batch shape: {labels.shape}')  # e.g. (64,)

def imshow(img):
    """Display an image after mapping its values back with ``img / 2 + 0.5``.

    That mapping reverses the mean-0.5 / std-0.5 normalisation used by the
    notebook's transform. Assumes ``img`` is a channel-first tensor — the
    first axis is moved last before plotting; TODO confirm against callers.
    """
    restored = img / 2 + 0.5
    channels_last = restored.numpy().transpose((1, 2, 0))
    plt.imshow(channels_last)
    plt.show()

# Tile the first eight images of the batch into one grid and display it.
preview_grid = torchvision.utils.make_grid(images[:8])
imshow(preview_grid)

In [None]:
# One loader per CelebA split; only the training loader shuffles.
BATCH_SIZE = 64

celeba_train_loader = DataLoader(celeba_train_dataset, shuffle=True, batch_size=BATCH_SIZE)
celeba_val_loader = DataLoader(celeba_val_dataset, shuffle=False, batch_size=BATCH_SIZE)
celeba_test_loader = DataLoader(celeba_test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Fetch the first training batch and print the shapes of its two tensors.
data_iter = iter(celeba_train_loader)
first_batch = next(data_iter)
images, labels = first_batch

print(f'Images batch shape: {images.shape}')  # Should be (batch_size, channels, height, width)
print(f'Labels batch shape: {labels.shape}')  # Should be (batch_size,)