Implementing dataloader like torch:  


- it's iterable  
- not indexable however  
- no string representation  
- it takes every batch of data in tensor form: if we have 60000 `(1,28,28)` tensors, it returns a loader with 938 batches, each a tensor of dimensions `(64,1,28,28)` (the last one is `(32,1,28,28)`, since 60000 = 937×64 + 32). It stacks every 64 (`batch_size`) tensors into one tensor.  

In [8]:
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define transformations for the dataset.
# Only ToTensor is applied (PIL image -> PyTorch tensor); this matches the
# transform shown by the dataset repr later in this notebook. A pipeline with
# Normalize used to be assigned here first but was overwritten immediately and
# never took effect. Append `transforms.Normalize((0.5,), (0.5,))` to the list
# below if inputs normalized with mean 0.5 / std 0.5 are wanted.
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert PIL images to PyTorch tensors
])

In [26]:
# train_loader.__dict__

In [10]:
# MNIST train / test splits stored under ./mnist_data (download requested),
# with `transform` applied to every image.
train_dataset = datasets.MNIST(
    root='mnist_data',
    train=True,
    transform=transform,
    download=True,
)
test_dataset = datasets.MNIST(
    root='mnist_data',
    train=False,
    transform=transform,
    download=True,
)

# Batch both splits 64 samples at a time; only the training data is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
)

In [27]:
# Peek at the first batch only (the loop breaks after one iteration).
# Each item yielded by the loader is indexed as i[0] and i[1]; the output
# below shows their shapes, (64, 1, 28, 28) and (64,).
for i in train_loader:
    print(i[0].shape)  # the stacked images of the batch
    print(i[1].shape)  # one target per image in the batch
    break

torch.Size([64, 1, 28, 28])
torch.Size([64])


In [19]:
# len() of a loader counts batches, not samples (938 in the output below).
len(train_loader)

938

In [20]:
# Capacity if all 938 batches were full: 60032, i.e. 32 more than the 60000
# samples in the dataset, so one batch must be smaller than 64.
938*64

60032

In [22]:
# Deliberate failure: a DataLoader is not subscriptable, so this raises
# TypeError (see the output below). Batches can only be obtained by iterating.
train_loader[-1]

TypeError: 'DataLoader' object is not subscriptable

In [23]:
# Walk through a whole epoch and print the shapes of every batch whose first
# dimension is not the full batch size of 64.
# NOTE: `i` stays bound to the final batch after the loop ends; the next cell
# relies on that.
for i in train_loader:
    
    if i[0].shape[0] != 64:
        print(i[0].shape)
        print(i[1].shape)


torch.Size([32, 1, 28, 28])
torch.Size([32])


In [25]:
# `i` is left over from the loop in the previous cell, i.e. the last batch
# that was yielded; its first element holds 32 samples (output below).
len(i[0])

32

In [29]:
# The loader exposes the dataset it wraps; the repr below lists the number of
# datapoints, the root location, the split and the transform in use.
train_loader.dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: mnist_data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [34]:
# List the instance attributes of the loader to see which pieces of state it
# keeps (dataset, batch_size, sampler, collate_fn, ... in the output below).
train_loader.__dict__.keys()
# train_loader.collate_fn

dict_keys(['dataset', 'num_workers', 'prefetch_factor', 'pin_memory', 'pin_memory_device', 'timeout', 'worker_init_fn', '_DataLoader__multiprocessing_context', '_dataset_kind', 'batch_size', 'drop_last', 'sampler', 'batch_sampler', 'generator', 'collate_fn', 'persistent_workers', '_DataLoader__initialized', '_IterableDataset_len_called', '_iterator'])

In [1]:
import random
import numpy as np
seed=42
random.seed(seed)

from ..modules.tensor import *
from ..modules.dataset import *

class dataloader:
    '''DataLoader class
    --------------------------------
    Wraps a dataset in an object that is split into batches and is iterable,
    so it can be used in the training loop (and in the validation loop).

    Implementing dataloader like torch:

    - it's iterable
    - not indexable however
    - no string representation
    - ``len(loader)`` is the number of batches
    - every batch is returned as one tensor: with 60000 `(1,28,28)` samples and
      ``batch_size=64`` the loader yields 938 tensors of dimensions
      `(64,1,28,28)` (the last one is `(32,1,28,28)`). Each group of
      ``batch_size`` samples is stacked into a single tensor.

    Only the samples found in ``dataset.data`` are batched; targets/labels are
    not handled by this class.

    Parameters
    ----------
    dataset :
        Object supporting ``len()`` and exposing an indexable ``data``
        attribute whose items each have a ``.data`` array. It is not
        type-checked (duck typing).
    batch_size : int, default 1
        Number of samples per batch; must be a positive integer.
    shuffle : bool, default False
        When true, sample order is reshuffled (with the ``random`` module)
        every time iteration starts.

    Raises
    ------
    TypeError
        If ``batch_size`` is not an integer or ``shuffle`` is not a boolean.
    ValueError
        If ``batch_size`` is not strictly positive.

    <!> next step would be to make it do parallel processing (multiprocessing)
    to speed up the process (only start with it when everything works right)
    '''

    def __init__(self, dataset, batch_size=1, shuffle=False):
        # Assign through the public properties: writing the name-mangled
        # private attributes directly would bypass the checks in __setattr__.
        self.dataset = dataset  # also initialises num_samples
        self.batch_size = batch_size
        self.shuffle = shuffle

    # -- getters and setters --
    @property
    def dataset(self):
        '''The wrapped dataset.'''
        return self.__dataset

    @dataset.setter
    def dataset(self, dataset):
        self.__dataset = dataset
        # Keep the cached size in sync, otherwise iteration would use the
        # length of a previously assigned dataset.
        self.__num_samples = len(dataset)

    @property
    def batch_size(self):
        '''Number of samples per batch.'''
        return self.__batch_size

    @batch_size.setter
    def batch_size(self, batch_size):
        self.__batch_size = batch_size

    @property
    def shuffle(self):
        '''Whether the sample order is reshuffled at the start of each iteration.'''
        return self.__shuffle

    @shuffle.setter
    def shuffle(self, shuffle):
        self.__shuffle = shuffle

    @property
    def num_samples(self):
        '''Number of samples iterated over (refreshed whenever dataset is set).'''
        return self.__num_samples

    @num_samples.setter
    def num_samples(self, num_samples):
        self.__num_samples = num_samples

    def __setattr__(self, name, value):
        '''Validate public attributes before they are stored.

        ``batch_size`` must be a positive integer and ``shuffle`` a boolean.
        ``dataset`` is accepted as-is. Private (name-mangled) attributes and
        any other name are stored without checks.
        '''
        if name == 'batch_size':
            # bool is a subclass of int, so it has to be rejected explicitly.
            if isinstance(value, bool) or not isinstance(value, int):
                raise TypeError('batch_size must be an integer')
            if value <= 0:
                raise ValueError('batch_size must be a positive integer')
        elif name == 'shuffle':
            if not isinstance(value, bool):
                raise TypeError('shuffle must be a boolean')
        super().__setattr__(name, value)

    def __len__(self):
        '''Return the number of batches per pass (the last one may be partial).'''
        return (self.num_samples + self.batch_size - 1) // self.batch_size

    def __iter__(self):
        '''Yield the dataset one batch at a time.

        Every ``batch_size`` consecutive samples (in shuffled order when
        ``shuffle`` is true) are stacked into one tensor with a new leading
        batch dimension.

        example:
            * batch_size=64
            * the dataset holds 60000 items, each of shape `(1, 28, 28)`
            * each yielded item is a `(64, 1, 28, 28)` tensor made of 64
              dataset items; the final one is `(32, 1, 28, 28)`
        '''
        order = list(range(self.num_samples))
        if self.shuffle:
            random.shuffle(order)

        samples = self.dataset.data
        for start in range(0, self.num_samples, self.batch_size):
            batch_indices = order[start:start + self.batch_size]
            # Unwrap each selected sample to its underlying array. The samples
            # are addressed by dataset index here, never by position inside
            # the batch.
            batch_arrays = [samples[idx].data for idx in batch_indices]
            yield Tensor(np.stack(batch_arrays, axis=0))
            



ImportError: attempted relative import with no known parent package

In [42]:
import numpy as np

# Four length-10 integer vectors: 1..10, 11..20, 21..30 and 31..40.
a1, a2, a3, a4 = (np.arange(first, first + 10) for first in (1, 11, 21, 31))

# Combine the first two vectors with np.stack along axis=1: element k of a1 is
# paired with element k of a2, so the result has shape (10, 2).
np.stack([a1, a2], axis=1)

array([[ 1, 11],
       [ 2, 12],
       [ 3, 13],
       [ 4, 14],
       [ 5, 15],
       [ 6, 16],
       [ 7, 17],
       [ 8, 18],
       [ 9, 19],
       [10, 20]])