In [1]:
# !pip install opencv-python tqdm scikit-learn tensorflow

In [2]:
import os
import cv2
import numpy as np
from tensorflow.keras.datasets import cifar100
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import shutil
from zipfile import ZipFile

In [3]:
# Output locations for the generated dataset
local_base_dir = 'preprocessed_data'  # Root folder that receives every task's images
zip_file_path = 'preprocessed_data.zip'  # Archive written at the end of the notebook

# Restoration tasks and dataset partitions to generate
tasks = ['denoising', 'super_resolution', 'colorization', 'inpainting']
splits = ['train', 'val']

# Build <base>/<task>/<split>/{input,target} ahead of the save step
for task in tasks:
    for split in splits:
        for role in ('input', 'target'):
            os.makedirs(os.path.join(local_base_dir, task, split, role), exist_ok=True)

#### Load and Normalize CIFAR-100 Dataset


In [4]:
print("Loading CIFAR-100 dataset...")
# Only the pixel arrays are kept; the label arrays returned by load_data are discarded
train_part, test_part = cifar100.load_data(label_mode='fine')
x_train_full = train_part[0]
x_test = test_part[0]

# Pool the official train and test images, then normalize to [0, 1]
x_data = np.concatenate([x_train_full, x_test], axis=0).astype('float32')
x_data = x_data / 255.0

# Re-partition the pooled images: 80% training, 20% validation, fixed seed

print("Splitting data into training and validation sets...")
x_train, x_val = train_test_split(x_data, random_state=42, test_size=0.2)

Loading CIFAR-100 dataset...
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
[1m169001437/169001437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Splitting data into training and validation sets...


#### Define Data Augmentation Functions

In [5]:
def add_noise(images, noise_factor=0.1, rng=None):
    """Add zero-mean Gaussian noise to images and clip the result to [0, 1].

    Args:
        images: Array-like of image data. The clipping step assumes the values
            are scaled to [0, 1].
        noise_factor: Standard deviation of the added noise.
        rng: Optional ``numpy.random.Generator``. When given, the noise is drawn
            from it, which makes the output reproducible independently of any
            global state. When ``None`` the global NumPy random state is used
            (so ``np.random.seed`` still controls the result).

    Returns:
        Array with the same shape as ``images`` containing the noisy values,
        clipped to [0, 1].
    """
    images = np.asarray(images)
    if rng is None:
        noise = np.random.randn(*images.shape)
    else:
        noise = rng.standard_normal(images.shape)
    return np.clip(images + noise_factor * noise, 0., 1.)

def downsample_images(images, scale=2):
    """Create low-detail inputs for super-resolution.

    Each image is shrunk by ``scale`` with bicubic interpolation and then
    enlarged back to its original size, so the output has the same shape as
    the input but has lost fine detail.

    Args:
        images: Iterable of images whose first two dimensions are height and
            width. Float images are assumed to be scaled to [0, 1].
        scale: Integer factor by which height and width are divided.

    Returns:
        ``np.ndarray`` stacking the degraded images in input order.
    """
    downsampled_images = []
    for img in tqdm(images, desc="Downsampling images"):
        height, width = img.shape[:2]
        # Downscale
        low_res_img = cv2.resize(img, (width // scale, height // scale), interpolation=cv2.INTER_CUBIC)
        # Upscale back to original size
        low_res_img = cv2.resize(low_res_img, (width, height), interpolation=cv2.INTER_CUBIC)
        # Bicubic interpolation can overshoot near sharp edges. Integer images
        # are saturated by OpenCV, but float images are not, so bring them back
        # into [0, 1] before a later uint8 conversion can mangle the values.
        if np.issubdtype(low_res_img.dtype, np.floating):
            low_res_img = np.clip(low_res_img, 0., 1.)
        downsampled_images.append(low_res_img)
    return np.array(downsampled_images)

def convert_to_grayscale(images):
    """Strip the colour from RGB images while keeping three channels.

    Every image is quantised to 8-bit, reduced to a single luminance channel,
    expanded back to three identical channels and rescaled to float32 in
    [0, 1].
    """
    def _gray_roundtrip(rgb):
        # OpenCV colour conversion is applied to the 8-bit version of the image
        as_bytes = (rgb * 255).astype(np.uint8)
        luminance = cv2.cvtColor(as_bytes, cv2.COLOR_RGB2GRAY)
        replicated = cv2.cvtColor(luminance, cv2.COLOR_GRAY2RGB)
        return replicated.astype('float32') / 255.0

    return np.array(
        [_gray_roundtrip(rgb) for rgb in tqdm(images, desc="Converting to grayscale")]
    )

def create_masks(images, mask_size=8, rng=None):
    """Zero out one randomly placed square in each image for inpainting.

    Args:
        images: Iterable of arrays whose first two dimensions are height and
            width (a trailing channel dimension is optional).
        mask_size: Side length of the square mask in pixels. Must satisfy
            ``0 <= mask_size <= min(height, width)``.
        rng: Optional ``numpy.random.Generator`` used to pick mask positions.
            When ``None`` the global NumPy random state is used.

    Returns:
        ``np.ndarray`` stacking masked copies of the images; the inputs are
        left unmodified.

    Raises:
        ValueError: If ``mask_size`` is negative or larger than an image.
    """
    # Both callables treat the upper bound as exclusive.
    draw = np.random.randint if rng is None else rng.integers
    masked_images = []
    for img in tqdm(images, desc="Applying masks"):
        h, w = img.shape[:2]
        if not 0 <= mask_size <= min(h, w):
            raise ValueError(
                f"mask_size={mask_size} does not fit in an image of size {h}x{w}"
            )
        img_copy = img.copy()
        # +1 because the upper bound is exclusive: without it the mask could
        # never reach the right/bottom edge, and a mask as large as the image
        # would raise instead of covering it.
        x = draw(0, w - mask_size + 1)
        y = draw(0, h - mask_size + 1)
        img_copy[y:y + mask_size, x:x + mask_size] = 0  # Apply mask
        masked_images.append(img_copy)
    return np.array(masked_images)

#### Preprocess Data for Each Task

In [6]:
print("Preprocessing data for each task...")
# Seed NumPy's global generator so the random degradations (noise values and
# mask positions) are identical on every Restart & Run All. The value mirrors
# the random_state already used for the train/validation split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Denoising: noisy input -> clean target
x_train_noisy = add_noise(x_train)
x_val_noisy = add_noise(x_val)

# Super-Resolution: blurred (down- then up-sampled) input -> sharp target
x_train_low_res = downsample_images(x_train)
x_val_low_res = downsample_images(x_val)

# Colorization: grayscale input -> colour target
x_train_gray = convert_to_grayscale(x_train)
x_val_gray = convert_to_grayscale(x_val)

# Inpainting: input with a blanked square -> complete target
x_train_masked = create_masks(x_train)
x_val_masked = create_masks(x_val)

Preprocessing data for each task...


Downsampling images: 100%|██████████| 48000/48000 [00:02<00:00, 20979.15it/s]
Downsampling images: 100%|██████████| 12000/12000 [00:00<00:00, 26471.35it/s]
Converting to grayscale: 100%|██████████| 48000/48000 [00:01<00:00, 31790.00it/s]
Converting to grayscale: 100%|██████████| 12000/12000 [00:00<00:00, 37343.89it/s]
Applying masks: 100%|██████████| 48000/48000 [00:00<00:00, 58398.31it/s]
Applying masks: 100%|██████████| 12000/12000 [00:00<00:00, 60686.09it/s]


#### Save Preprocessed Images

In [7]:
def _to_bgr_uint8(image):
    """Convert a float image in [0, 1] to the uint8 layout ``cv2.imwrite`` expects."""
    # Round instead of truncating, and clip so out-of-range values saturate
    # rather than wrap around when cast to uint8.
    pixels = np.clip(np.rint(np.asarray(image, dtype=np.float64) * 255.0), 0, 255).astype(np.uint8)
    # The rest of this notebook treats images as RGB (see COLOR_RGB2GRAY), but
    # cv2.imwrite interprets 3-channel data as BGR, so swap before writing.
    if pixels.ndim == 3 and pixels.shape[2] == 3:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)
    return pixels


def save_images(input_images, target_images, task_name, split_name):
    """Write paired input/target images as PNG files.

    Image ``i`` is saved as ``<local_base_dir>/<task_name>/<split_name>/input/i.png``
    and ``.../target/i.png``.

    Args:
        input_images: Indexable collection of degraded images, floats in [0, 1].
        target_images: Indexable collection of the matching clean images; must
            hold at least ``len(input_images)`` items.
        task_name: Task sub-directory name (e.g. ``'denoising'``).
        split_name: Split sub-directory name (e.g. ``'train'``).

    Raises:
        OSError: If OpenCV reports that an image could not be written.
    """
    input_dir = os.path.join(local_base_dir, task_name, split_name, 'input')
    target_dir = os.path.join(local_base_dir, task_name, split_name, 'target')
    # Make sure the destinations exist even if the setup cell was skipped.
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs(target_dir, exist_ok=True)

    for i in tqdm(range(len(input_images)), desc=f"Saving {task_name} {split_name} images"):
        for directory, image in ((input_dir, input_images[i]), (target_dir, target_images[i])):
            path = os.path.join(directory, f'{i}.png')
            # cv2.imwrite signals failure through its return value only.
            if not cv2.imwrite(path, _to_bgr_uint8(image)):
                raise OSError(f"Failed to write image: {path}")

print("Saving preprocessed images...")
# One row per task: (folder name, degraded training inputs, degraded validation inputs).
# The clean x_train / x_val arrays are the targets for every task.
for task_name, train_inputs, val_inputs in (
    ('denoising', x_train_noisy, x_val_noisy),
    ('super_resolution', x_train_low_res, x_val_low_res),
    ('colorization', x_train_gray, x_val_gray),
    ('inpainting', x_train_masked, x_val_masked),
):
    save_images(train_inputs, x_train, task_name, 'train')
    save_images(val_inputs, x_val, task_name, 'val')

Saving preprocessed images...


Saving denoising train images: 100%|██████████| 48000/48000 [00:17<00:00, 2674.23it/s]
Saving denoising val images: 100%|██████████| 12000/12000 [00:04<00:00, 2667.32it/s]
Saving super_resolution train images: 100%|██████████| 48000/48000 [00:17<00:00, 2764.80it/s]
Saving super_resolution val images: 100%|██████████| 12000/12000 [00:04<00:00, 2754.44it/s]
Saving colorization train images: 100%|██████████| 48000/48000 [00:17<00:00, 2821.23it/s]
Saving colorization val images: 100%|██████████| 12000/12000 [00:04<00:00, 2845.44it/s]
Saving inpainting train images: 100%|██████████| 48000/48000 [00:17<00:00, 2725.40it/s]
Saving inpainting val images: 100%|██████████| 12000/12000 [00:04<00:00, 2713.70it/s]


#### Compress Preprocessed Data


In [8]:
def zip_data_directory(data_dir, zip_path):
    """Compress the contents of ``data_dir`` into a zip archive.

    Args:
        data_dir: Directory whose contents become the root of the archive.
        zip_path: Destination path. A trailing ``.zip`` extension is optional;
            the archive always ends in ``.zip``.

    Returns:
        The path of the archive that was written.
    """
    # shutil.make_archive appends the extension itself, so strip only a
    # trailing ".zip". A blanket str.replace would also remove ".zip" from
    # directory names earlier in the path and write to the wrong place.
    root, ext = os.path.splitext(zip_path)
    base_name = root if ext.lower() == '.zip' else zip_path
    archive_path = base_name + '.zip'
    shutil.make_archive(base_name, 'zip', data_dir)
    print(f"Data successfully compressed into {archive_path}")
    return archive_path

print("Compressing preprocessed data...")
# Bundle everything under the base directory into a single archive
zip_data_directory(data_dir=local_base_dir, zip_path=zip_file_path)

# Final status line for the notebook run
print("Data Preprocessing Completed Successfully!")

Compressing preprocessed data...
Data successfully compressed into preprocessed_data.zip
Data Preprocessing Completed Successfully!
