# [Custom Dataset (Cats & Dogs)](https://www.youtube.com/watch?v=ZoZHd0Zm3RY&list=PLLO3XdUcTFzwZNl2lpg_PTbUSLzHBZoK_&index=8)

## Import Packages

In [2]:
import os

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from skimage import io
from torchvision import transforms

## Data Preparation
[Kaggle Data](https://www.kaggle.com/c/dogs-vs-cats/overview)

In [3]:
# Build a CSV that lists every training image with its class label
# (cat = 0, dog = 1), ordered by label and then by the image number.

# Define the train folder path
train_dir = os.path.join('data', 'dogs-vs-cats', 'train')  # Update path accordingly

# Map the class prefix of a file name to its integer label
label_by_prefix = {'cat': 0, 'dog': 1}

# Initialize a list to hold file data
data = []

# Iterate through all files in the train folder
for file_name in os.listdir(train_dir):
    if not os.path.isfile(os.path.join(train_dir, file_name)):
        continue  # Skip sub-directories and other non-file entries

    # Accept only names of the exact form '<cat|dog>.<number>.jpg'.
    # A plain substring test ('cat' in file_name) would also match stray
    # files such as 'catalog.txt' and then crash while extracting the number.
    parts = file_name.split('.')
    if len(parts) != 3:
        continue
    prefix, num_text, extension = parts
    if prefix not in label_by_prefix or extension != 'jpg' or not num_text.isdecimal():
        continue  # Skip files that are not a numbered cat/dog image

    label = label_by_prefix[prefix]
    num_part = int(num_text)  # Numeric part, used only for ordering
    data.append({'id': file_name, 'label': label, 'num': num_part})

# Convert the data into a DataFrame.
# The columns are named explicitly so that sorting below still works when no
# matching file was found (an empty list would otherwise give no columns).
df = pd.DataFrame(data, columns=['id', 'label', 'num'])

# Sort the DataFrame by label (cats first, dogs second), and then by the numerical part
df_sorted = df.sort_values(by=['label', 'num']).drop(columns=['num'])  # Drop 'num' after sorting

# Save the sorted DataFrame to a CSV file
output_csv = os.path.join('data', 'dogs-vs-cats', 'cats_dogs_labels_sorted.csv')
df_sorted.to_csv(output_csv, index=False)  # Save without the index column

print(f"CSV file saved to {output_csv}")


CSV file saved to data/dogs-vs-cats/cats_dogs_labels_sorted.csv


In [4]:
class CatsAndDogsDataset(Dataset):
    """Dataset that pairs image files in a folder with labels listed in a CSV file."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Set up the dataset from an annotation file and an image folder.

        Parameters:
        csv_file (str): Path of the CSV file; its first column is read as the image
            file name and its second column as the label.
        root_dir (str): Folder that the image file names are resolved against.
        transform (callable, optional): Applied to every loaded image before it is returned.
        """
        # Annotation table: one row per sample.
        self.annotations = pd.read_csv(csv_file)
        # Kept as given; only used when a sample is requested.
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """
        Report how many samples the dataset holds.

        Returns:
        int: Row count of the annotation table.
        """
        return len(self.annotations.index)

    def __getitem__(self, idx):
        """
        Load the sample stored at position `idx` of the annotation table.

        Parameters:
        idx (int): Row position of the requested sample.

        Returns:
        tuple: (image, label). The image is the output of `transform` when one is set,
            otherwise the value returned by `io.imread`; the label is a torch.Tensor
            built from the integer in the second column.
        """
        # First column: file name, second column: label.
        file_name = self.annotations.iloc[idx, 0]
        raw_label = self.annotations.iloc[idx, 1]

        # Read the image from disk, then wrap the label as an integer tensor.
        image = io.imread(os.path.join(self.root_dir, file_name))
        y_label = torch.tensor(int(raw_label))

        # Without a transform the image is handed back exactly as loaded.
        if not self.transform:
            return image, y_label

        return self.transform(image), y_label

In [6]:
# Instantiate a CatsAndDogsDataset object using the sorted CSV file and the directory containing the images.
# The transform parameter is set to 'transforms.ToTensor()', which converts images to PyTorch tensors.
# NOTE(review): ToTensor alone does not resize images; batching with the default collate
# function presumably requires equally sized tensors - confirm the images share one size
# or add a resize step to the transform.
dataset = CatsAndDogsDataset(
    csv_file='data/dogs-vs-cats/cats_dogs_labels_sorted.csv',
    root_dir='data/dogs-vs-cats/train',
    transform=transforms.ToTensor()
)

# Print the total number of samples in the dataset by calling the __len__ method.
print(len(dataset))

# Split the dataset randomly into a training set (80%) and a test set (the remaining 20%).
# The sizes are derived from the actual dataset length because random_split rejects
# lengths that do not sum to len(dataset); for the full 25000-image set this gives
# 20000 training samples and 5000 test samples.
train_size = len(dataset) * 4 // 5  # Integer arithmetic avoids float rounding
test_size = len(dataset) - train_size
train_set, test_set = torch.utils.data.random_split(dataset, [train_size, test_size])

# Create a DataLoader for the training set to enable batch processing.
# The batch size is set to 16, and shuffling ensures that the samples in each batch are randomized.
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)

# Create a DataLoader for the test set.
# The batch size is also set to 16, and shuffling is enabled for randomized testing batches.
test_loader = DataLoader(test_set, batch_size=16, shuffle=True)

25000
