### Note: Run this after you download the full dataset if you don't already have it

# Set Up

In [1]:
import random
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
from PIL import ImageFilter
from torchvision.transforms.transforms import RandomAffine, Resize
from torchvision.datasets import ImageFolder
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import os

In [2]:
# Root folder of the dataset. CustomImageDataset lists this folder with
# os.listdir and uses the entry names as class names.
# Alternative location kept for reference (replace with wherever you store your dataset):
# data_path = "D:\\APS360 Data\\Subset\\" # Replace with wherever you store your dataset
data_path = "D:\\Training Data-20240401T010113Z-001\\Training Data\\"

In [3]:
# Names of the entries directly under data_path, in sorted order.
ls = os.listdir(data_path)
ls.sort()
print(ls)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'call', 'del', 'space', 'thumbsup']


In [4]:
def show_image(path):
    """Read the image file at ``path``, draw it with ``plt.imshow``, and return the pixel array."""
    pixels = mpimg.imread(path)
    plt.imshow(pixels)
    return pixels

# Define Custom Dataset and Transform

In [5]:
class CustomImageDataset(Dataset):
    """Folder-per-class image dataset read with OpenCV, with optional oversampling.

    Each sub-directory of ``root_dir`` is one class; every file found
    (recursively) beneath it becomes one ``(path, class_index)`` entry in
    ``self.samples``. Images are only decoded when indexed.
    """

    def __init__(self, root_dir, transform=None):
        """Index the files under ``root_dir``.

        Args:
            root_dir: Directory whose immediate sub-directories are the classes.
            transform: Optional callable applied to the RGB NumPy image
                returned by OpenCV in ``__getitem__``.
        """
        self.root_dir = root_dir
        self.transform = transform
        # Only directories count as classes: a stray file in the root would
        # otherwise become an empty class and shift every later label.
        self.classes = sorted(
            entry
            for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        self.samples = self._make_dataset()

    def _make_dataset(self):
        """Return a list of ``(file_path, class_index)`` in class, then path order."""
        instances = []
        for target_class in sorted(self.class_to_idx):
            class_index = self.class_to_idx[target_class]
            target_dir = os.path.join(self.root_dir, target_class)
            for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
                instances.extend(
                    (os.path.join(root, fname), class_index)
                    for fname in sorted(fnames)
                )
        return instances

    def __len__(self):
        """Return the number of samples, including any oversampled duplicates."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, class_index)``.

        The file is read with OpenCV, converted from BGR to RGB and passed
        through ``self.transform`` when one was given.

        Raises:
            OSError: If OpenCV cannot read the file at the sample's path.
        """
        path, target = self.samples[idx]
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise OSError(f"Could not read image file: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert image to RGB
        if self.transform:
            image = self.transform(image)
        return image, target

    @staticmethod
    def resize_and_pad_image(image, output_size=(256, 256)):
        """Zero-pad ``image`` to a square, resize it, and return a PIL image.

        Args:
            image: NumPy image array whose first two axes are height and width.
            output_size: Size passed to ``cv2.resize``.

        Returns:
            ``PIL.Image`` built from the padded, resized array.
        """
        h, w = image.shape[:2]
        diff = abs(h - w)
        # Split the padding so both sides together add exactly ``diff``
        # pixels; an odd difference puts the extra pixel on the right/bottom.
        pad_before = diff // 2
        pad_after = diff - pad_before

        if h > w:
            image = cv2.copyMakeBorder(
                image, 0, 0, pad_before, pad_after, cv2.BORDER_CONSTANT, value=[0, 0, 0]
            )
        elif h < w:
            image = cv2.copyMakeBorder(
                image, pad_before, pad_after, 0, 0, cv2.BORDER_CONSTANT, value=[0, 0, 0]
            )

        image = cv2.resize(image, output_size)

        return Image.fromarray(image)

    def balance_dataset(self, desired_count=2000):
        """Oversample every under-represented class up to ``desired_count`` samples.

        Randomly chosen existing entries of each small class are appended to
        ``self.samples`` (duplicates share the same file path), then the whole
        list is shuffled in place. Classes that already have at least
        ``desired_count`` samples are left as they are. Classes with no
        samples cannot be oversampled and are skipped with a warning.

        Args:
            desired_count: Minimum number of samples each class should have.

        Returns:
            ``self.samples`` (the same list object, modified in place).
        """
        import warnings

        # One pass over the samples gives the indices of every class.
        indices_by_class = {class_idx: [] for class_idx in range(len(self.classes))}
        for idx, (_, target) in enumerate(self.samples):
            indices_by_class[target].append(idx)

        for class_idx, class_indices in indices_by_class.items():
            if not class_indices:
                warnings.warn(
                    f"Class {self.classes[class_idx]!r} has no samples; skipping it",
                    stacklevel=2,
                )
                continue
            # A non-positive shortfall yields an empty range, so classes that
            # are already large enough are untouched.
            num_to_duplicate = desired_count - len(class_indices)
            for _ in range(num_to_duplicate):
                random_index = random.choice(class_indices)
                self.samples.append(self.samples[random_index])

        # Shuffle the balanced dataset
        random.shuffle(self.samples)

        return self.samples

In [6]:
# NOTE FOR YEONG: I added horizontal flipping here since our dataset
# is now impartial to flipping

def _resize_and_pad(pil_image):
    """Pad the image to a square and resize it via CustomImageDataset.resize_and_pad_image."""
    return CustomImageDataset.resize_and_pad_image(np.array(pil_image))


transform = transforms.Compose(
    [
        # Incoming sample is a NumPy array; convert it to a PIL Image first
        transforms.ToPILImage(),
        # Custom resize and pad (returns a PIL Image)
        transforms.Lambda(_resize_and_pad),
        # Data augmentation: the whole group below is applied together,
        # with probability p=0.15
        transforms.RandomApply(
            [
                # Random rotation and scaling
                transforms.RandomAffine(degrees=15, scale=(0.85, 1.15)),
                # Random blurring
                transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 2.0)),
                # Flip with probability 0.2 whenever the group is applied
                transforms.RandomHorizontalFlip(p=0.2),
            ],
            p=0.15,
        ),
        # PIL Image -> tensor
        transforms.ToTensor(),
        # Normalize each channel with mean 0.5 and std 0.5
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

In [7]:
# Fix the seeds of Python's `random` module (used by balance_dataset) and of
# PyTorch's global generator so that repeated runs are reproducible.
random.seed(a=42)
torch.manual_seed(seed=42)

<torch._C.Generator at 0x1d16f8b2b70>

# Let's Process Our Dataset

In [8]:
# Index every file under data_path; images are only decoded when a sample is requested.
dataset = CustomImageDataset(data_path, transform=transform)
len(dataset)

58088

In [9]:
# Oversample small classes in place up to 2000 samples each, then iterate
# the dataset in its stored order, one image per batch.
dataset.balance_dataset(2000)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
len(dataloader)

60000

In [10]:
# Run every sample through the transform pipeline once and keep the results.
# NOTE(review): every batch stays in memory; with 60000 float32 images of
# shape (1, 3, 256, 256) that is roughly 47 GB -- confirm this fits in RAM.
all_data, all_labels = [], []
i = 0

for data, labels in dataloader:
    all_data.append(data)
    all_labels.append(labels)

    # Progress report every 1000 images
    if not i % 1000:
        print("Passed the " + str(i) + "th image\n")
    i += 1

Passed the 0th image

Passed the 1000th image

Passed the 2000th image

Passed the 3000th image

Passed the 4000th image

Passed the 5000th image

Passed the 6000th image

Passed the 7000th image

Passed the 8000th image

Passed the 9000th image

Passed the 10000th image

Passed the 11000th image

Passed the 12000th image

Passed the 13000th image

Passed the 14000th image

Passed the 15000th image

Passed the 16000th image

Passed the 17000th image

Passed the 18000th image

Passed the 19000th image

Passed the 20000th image

Passed the 21000th image

Passed the 22000th image

Passed the 23000th image

Passed the 24000th image

Passed the 25000th image

Passed the 26000th image

Passed the 27000th image

Passed the 28000th image

Passed the 29000th image

Passed the 30000th image

Passed the 31000th image

Passed the 32000th image

Passed the 33000th image

Passed the 34000th image

Passed the 35000th image

Passed the 36000th image

Passed the 37000th image

Passed the 38000th image


In [None]:
# Check if your images are extractable from the dataset
print(len(all_data))
# Keep the sample as a torch tensor: the steps below (clamp, squeeze, permute,
# cpu) are tensor operations and do not work on a NumPy array.
image = all_data[2]

# Undo transforms.Normalize(mean=0.5, std=0.5); the (1, 3, 1, 1) shape
# broadcasts over a (batch, channel, height, width) tensor.
mean = torch.tensor([0.5, 0.5, 0.5]).view(1, 3, 1, 1)
std = torch.tensor([0.5, 0.5, 0.5]).view(1, 3, 1, 1)

image = std * image + mean
image = torch.clamp(image, 0, 1)
image = torch.squeeze(image)  # drop the batch dimension of size 1
image = image.permute(1, 2, 0)  # channels-first -> channels-last for imshow
image = image.cpu().numpy()
plt.imshow(image)

# Let's Save It As A Pytorch Tensor

In [32]:
import torch

# Join the per-image batches along the first (batch) dimension.
all_data_tensor = torch.cat(all_data)
all_labels_tensor = torch.cat(all_labels)

In [None]:
# Write the image and label tensors to the current working directory.
torch.save(obj=all_data_tensor, f='data_tensor.pth')
torch.save(obj=all_labels_tensor, f='labels_tensor.pth')

# Load It Back In and Train-Val-Test Split

In [None]:
torch.manual_seed(42)
total_size = len(dataset)
train_size = int(0.7 * total_size)  # 70% for training
val_size = int(0.2 * total_size)  # 20% for validation
# Whatever is left (about 10%) goes to testing, so the three sizes always sum to total_size
test_size = total_size - (train_size + val_size)

In [None]:
# NOTE(review): balance_dataset appends duplicate entries to dataset.samples,
# so copies of the same file can end up in different splits -- confirm this
# is acceptable for validation/testing.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    lengths=[train_size, val_size, test_size],
)

In [None]:
# Only the training loader reshuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)