In [7]:
import cv2
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import os
import torch.nn as nn
import torch.nn.functional as F

In [8]:
class CustomDataset(Dataset):
    """Image-classification dataset read from a ``root_dir/<class_name>/...`` tree.

    Every immediate sub-directory of ``root_dir`` is treated as one class.
    Class indices are assigned in sorted order of the directory names, and
    every file found (recursively) under a class directory becomes one sample.
    Images are decoded lazily with OpenCV when a sample is requested.
    """

    def __init__(self, root_dir, transform=None, extensions=None):
        """
        Args:
            root_dir (string): Directory with all the images, one
                sub-directory per class.
            transform (callable, optional): Optional transform to be applied
                on a sample. It receives the decoded RGB image array.
            extensions (str or iterable of str, optional): File-name suffixes
                (e.g. ``('.jpg', '.png')``) to keep, compared
                case-insensitively. ``None`` (the default) keeps every file,
                which is the historical behaviour.
        """
        self.root_dir = root_dir
        self.transform = transform
        if extensions is None:
            self.extensions = None
        else:
            # Accept a single suffix given as a bare string as well as an
            # iterable; str.endswith needs a tuple for multi-suffix matching.
            if isinstance(extensions, str):
                extensions = (extensions,)
            self.extensions = tuple(ext.lower() for ext in extensions)
        self.classes, self.class_to_idx = self._find_classes(self.root_dir)
        self.imgs = self._make_dataset()

    def _find_classes(self, dir):
        """
        Finds the class folders in a dataset.

        Returns:
            tuple: ``(classes, class_to_idx)`` where ``classes`` is the sorted
            list of sub-directory names of ``dir`` and ``class_to_idx`` maps
            each name to its position in that list.
        """
        classes = sorted(entry.name for entry in os.scandir(dir) if entry.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def _make_dataset(self):
        """
        Creates a list of samples with their class indices.

        Returns:
            list: ``(path, class_index)`` tuples, ordered by class name, then
            by walked directory, then by file name.
        """
        images = []
        for target in sorted(self.class_to_idx.keys()):
            d = os.path.join(self.root_dir, target)
            if not os.path.isdir(d):
                continue
            for root, _, fnames in sorted(os.walk(d)):
                for fname in sorted(fnames):
                    # Optional suffix filter: skips stray non-image files
                    # (e.g. OS metadata files) when the caller asked for it.
                    if self.extensions is not None and not fname.lower().endswith(self.extensions):
                        continue
                    path = os.path.join(root, fname)
                    images.append((path, self.class_to_idx[target]))
        return images

    def __len__(self):
        """Number of indexed samples."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, class_index)``.

        Raises:
            OSError: If OpenCV cannot read/decode the file at the stored path.
        """
        path, target = self.imgs[idx]
        # Using cv2 to read and process images
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # without this check cvtColor fails with an error that does not
            # mention which file was at fault.
            raise OSError(f"cv2.imread could not read image file: {path!r}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB

        if self.transform:
            img = self.transform(img)

        return img, target


In [9]:
# Example transform function that can be used with the CustomDataset
def transform(image):
    """Resize, randomly mirror, and tensorize a single image.

    Args:
        image: Image array accepted by ``cv2.resize``. The ``permute(2, 0, 1)``
            step needs three axes after resizing; presumably this is the
            H x W x 3 8-bit RGB array produced by CustomDataset (TODO confirm
            for other callers).

    Returns:
        A float tensor with the last axis of the resized array moved to the
        front and every value divided by 255.
    """
    # Bring every image to a fixed 224x224 spatial size.
    resized = cv2.resize(image, (224, 224))

    # Augmentation: flip horizontally when the uniform draw exceeds 0.5.
    mirror = np.random.rand() > 0.5
    if mirror:
        resized = cv2.flip(resized, 1)

    # Dividing by 255 maps 8-bit pixel values into [0, 1]. Small, uniformly
    # scaled inputs help numerical stability, and many pre-trained models
    # expect [0, 1] inputs (usually followed by their own mean/std
    # normalization, which is not applied here).
    channels_first = torch.from_numpy(resized).permute(2, 0, 1)
    return channels_first.float() / 255.0

In [10]:
# Usage example: index the class-per-folder tree under 'data/train' and apply
# `transform` to each image as it is loaded.
train_dataset = CustomDataset(transform=transform, root_dir='data/train')

# Serve the dataset as shuffled mini-batches of 32 samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)

# train_loader can now be iterated over inside the training loop.

## CNN Architecture explanation

The architecture of a Convolutional Neural Network (CNN) typically involves a series of convolutional layers, pooling layers, and fully connected layers. The specific numbers of each type of layer and their configurations can be chosen based on empirical evidence, computational resources, and the complexity of the task at hand. Here is the rationale for the architecture defined in the `SimpleCNN` cell below:

    Convolutional Layers:
        These layers are the core building blocks of a CNN. They apply a number of filters (also known as kernels) to the input to create feature maps. These feature maps capture spatial hierarchies and features (e.g., edges, textures) from the input images. Increasing the number of convolutional layers allows the network to learn more complex patterns. As we go deeper into the network, the filters can capture higher-level features (e.g., shapes, specific parts of an object).

    Number of Filters:
        The number of filters in each convolutional layer often increases with the depth of the network. This is because the complexity and the number of high-level features tend to increase as you move deeper, and more filters are needed to capture this complexity. Common practice is to double the number of filters after each pooling layer, which helps to balance the reduction in spatial dimensions.

    Pooling Layers:
        Pooling layers (in this case, max pooling) are used to reduce the spatial size of the representation, which decreases the number of parameters and computation in the network. This also helps to make the detection of features somewhat invariant to small shifts and distortions of the input, because the exact position of a feature within each pooling window is discarded. Pooling helps to control overfitting by providing an abstracted form of the representation.

    Fully Connected Layers:
        After the convolutional and pooling layers, the high-level reasoning in the neural network is done via fully connected layers. Neurons in a fully connected layer have full connections to all activations in the previous layer, as seen in regular Neural Networks. The last fully connected layer (which is often called the "output layer") has as many neurons as there are classes in the dataset for a classification task. Each neuron in this layer will correspond to a class score.

    Activation Functions:
        The ReLU activation function is used after each convolution operation to introduce non-linear properties to the system, allowing the network to learn more complex functions.

The architecture described here is a simple and somewhat generic model for illustrative purposes. In practice, the architecture might need adjustments. For instance, if the images are larger, you may need additional layers to further reduce the spatial dimensions before flattening for the fully connected layers. If there are more or fewer classes, or if the images are more or less complex, you would adjust the depth of the network and the number of neurons in the fully connected layers accordingly.

Additionally, state-of-the-art CNN architectures like ResNet, Inception, and DenseNet use a lot of other strategies like skip connections, depth-wise separable convolutions, and densely connected layers to improve performance. These designs are often the result of extensive experimentation and research to optimize network performance on large-scale image recognition tasks.

In [None]:
# Define a simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes = 4):
        super(SimpleCNN, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=3, out_channels=128, kernel_size=3, padding=1)