In [62]:
import numpy as np
from pprint import pprint
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset

## DataLoader

The most important argument of the DataLoader constructor is `dataset`, which specifies the dataset object to load data from. PyTorch supports two different types of datasets:

    map-style datasets
    iterable-style datasets.


#### Map-style datasets

    A map-style dataset is one that implements the __getitem__() and __len__() protocols, and represents a map from (possibly non-integral) indices/keys to data samples.
    For example, such a dataset, when accessed with dataset[idx], could read the idx-th image and its corresponding label from a folder on the disk.

#### Iterable-style datasets

    An iterable-style dataset is an instance of a subclass of IterableDataset that implements the __iter__() protocol

##### When input is array or list

In [68]:
"""
features : array => every row has the same length
labels : list => one label per feature row
"""
# Seven labels, one for each feature row defined below.
labels = [1, 2, 3, 4, 5, 6, 7]

# 7 x 3 integer feature matrix; every row has the same number of columns.
features = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
        [13, 14, 15],
        [16, 17, 18],
        [23, 17, 18],
    ]
)

In [69]:
## Method - 1
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Parameters
    ----------
    features : sized, indexable collection
        One entry per sample.
    labels : sized, indexable collection
        One label per sample, in the same order as ``features``.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` do not have the same length.
    """

    def __init__(self, features, labels):
        super().__init__()
        # __len__ is derived from `features` alone, so without this check a
        # mismatch would either silently drop trailing labels or fail with an
        # IndexError in the middle of iteration.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(feature, label)`` pair stored at ``index``."""
        return (self.features[index], self.labels[index])

    def __len__(self):
        """Return the number of samples (length of ``features``)."""
        return len(self.features)


# Show the raw inputs before they are wrapped in a Dataset.
print(f"Features : \n{features}")
print(f"Labels : \n{labels}")
print()

# Wrap the data in the custom map-style dataset and pull the first mini-batch.
ds = CustomDataset(features, labels)
dl = DataLoader(ds, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature, example_label, sep="\n")

Features : 
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [13 14 15]
 [16 17 18]
 [23 17 18]]
Labels : 
[1, 2, 3, 4, 5, 6, 7]

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([1, 2])


In [70]:
## Method - 2
"""
features : tensor
labels : tensor
description : Wrap the tensors in a TensorDataset object, then batch them with the DataLoader class.
"""
# torch.as_tensor accepts an ndarray, a list or an existing tensor, so this cell
# can be re-run without the copy-construct warning that torch.tensor(<tensor>)
# emits. The float32 dtype is stated explicitly instead of relying on the
# legacy torch.Tensor constructor.
features = torch.as_tensor(features, dtype=torch.float32)
labels = torch.as_tensor(labels)
print(f"Features : \n{features}")
print(f"Labels : \n{labels}")
print("")

# TensorDataset indexes all given tensors along their first dimension.
ds = TensorDataset(features, labels)
dl = DataLoader(ds, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature)
print(example_label)

Features : 
tensor([[ 1.,  2.,  3.],
        [ 4.,  5.,  6.],
        [ 7.,  8.,  9.],
        [10., 11., 12.],
        [13., 14., 15.],
        [16., 17., 18.],
        [23., 17., 18.]])
Labels : 
tensor([1, 2, 3, 4, 5, 6, 7])

tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([1, 2])


In [71]:
## Method - 3
"""
No Dataset object is created here: the (feature, label) pairs are passed to DataLoader directly.
"""


def data_collator(batch):
    """Debugging collate helper: print the raw batch and return it unchanged.

    The DataLoader call in this cell does not pass it as ``collate_fn``.
    """
    print(batch)
    return batch


# Convert back to NumPy only if an earlier cell left tensors in these names;
# anything else is used as-is, so the cell is safe to re-run.
features = features.numpy() if torch.is_tensor(features) else features
labels = labels.numpy() if torch.is_tensor(labels) else labels
print(f"Features : \n{features}")
print(f"Labels : \n{labels}")

print("")
# Pair every feature row with its label. The original zipped with `label`,
# a name this notebook never defines, which raises NameError on a fresh kernel.
data_list = list(zip(features, labels))
print(f"Input_data for loader : \n{data_list}")
print("")

# A plain list already provides __getitem__ and __len__, so DataLoader can
# consume it without a Dataset wrapper.
dl = DataLoader(data_list, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature)
print(example_label)

Features : 
[[ 1.  2.  3.]
 [ 4.  5.  6.]
 [ 7.  8.  9.]
 [10. 11. 12.]
 [13. 14. 15.]
 [16. 17. 18.]
 [23. 17. 18.]]
Labels : 
[1 2 3 4 5 6 7]

Input_data for loader : 
[(array([1., 2., 3.], dtype=float32), 1), (array([4., 5., 6.], dtype=float32), 2), (array([7., 8., 9.], dtype=float32), 3), (array([10., 11., 12.], dtype=float32), 4), (array([13., 14., 15.], dtype=float32), 5), (array([16., 17., 18.], dtype=float32), 6), (array([23., 17., 18.], dtype=float32), 7)]

tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([1, 2])


##### When input is dict

In [77]:
# Toy data bundled in a single dict: a 7 x 3 feature matrix and seven labels.
inputs = dict(
    features=np.array(
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12],
            [13, 14, 15],
            [16, 17, 18],
            [23, 17, 18],
        ]
    ),
    labels=[1, 2, 3, 4, 5, 6, 7],
)

In [79]:
## Method - 1
class CustomDataset(Dataset):
    """Map-style dataset backed by a dict with ``'features'`` and ``'labels'``.

    Parameters
    ----------
    inputs : dict
        Must contain the keys ``'features'`` and ``'labels'``, each holding a
        sized, indexable collection with one entry per sample.

    Raises
    ------
    KeyError
        If either key is missing from ``inputs``.
    ValueError
        If the two collections do not have the same length.
    """

    def __init__(self, inputs):
        super().__init__()
        # __len__ only looks at 'features', so validate up front instead of
        # dropping labels silently or failing with IndexError mid-iteration.
        n_features = len(inputs['features'])
        n_labels = len(inputs['labels'])
        if n_features != n_labels:
            raise ValueError(
                f"inputs['features'] and inputs['labels'] must have the same "
                f"length, got {n_features} and {n_labels}"
            )
        self.inputs = inputs

    def __getitem__(self, index):
        """Return the ``(feature, label)`` pair stored at ``index``."""
        return (self.inputs['features'][index], self.inputs['labels'][index])

    def __len__(self):
        """Return the number of samples (length of ``inputs['features']``)."""
        return len(self.inputs['features'])


# Show the dict that backs the dataset.
print(f"Inputs : \n{inputs}")
print()

# The dataset reads 'features' and 'labels' straight out of the dict.
ds = CustomDataset(inputs)
dl = DataLoader(ds, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature, example_label, sep="\n")

Inputs : 
{'features': array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [23, 17, 18]]), 'labels': [1, 2, 3, 4, 5, 6, 7]}

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([1, 2])


In [81]:
## Method - 2
class CustomDataset(Dataset):
    """Map-style dataset built from separate feature and label collections.

    Parameters
    ----------
    features : sized, indexable collection
        One entry per sample.
    labels : sized, indexable collection
        One label per sample, in the same order as ``features``.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` do not have the same length.
    """

    def __init__(self, features, labels):
        super().__init__()
        # The dataset length comes from `features` only; reject mismatched
        # inputs here rather than failing (or truncating) during iteration.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(feature, label)`` pair stored at ``index``."""
        return (self.features[index], self.labels[index])

    def __len__(self):
        """Return the number of samples (length of ``features``)."""
        return len(self.features)


# Show the dict the features and labels are taken from.
print(f"Inputs : \n{inputs}")
print()

# Unpack the dict into the two positional arguments the dataset expects.
ds = CustomDataset(inputs['features'], inputs['labels'])
dl = DataLoader(ds, batch_size=3)
example_feature, example_label = next(iter(dl))
print(example_feature, example_label, sep="\n")

Inputs : 
{'features': array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [23, 17, 18]]), 'labels': [1, 2, 3, 4, 5, 6, 7]}

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
tensor([1, 2, 3])


In [86]:
## Method - 3
# Show the dict the tensors are built from.
print(f"Inputs : \n{inputs}")
print()

# Convert both entries to tensors and let TensorDataset pair them up.
ds = TensorDataset(
    torch.tensor(inputs['features']),
    torch.tensor(inputs['labels']),
)
dl = DataLoader(ds, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature, example_label, sep="\n")

Inputs : 
{'features': array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [23, 17, 18]]), 'labels': [1, 2, 3, 4, 5, 6, 7]}

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([1, 2])


In [92]:
## Method - 4
# Show the dict the (feature, label) pairs are built from.
print(f"Inputs : \n{inputs}")
print()

# A list of (feature, label) tuples is passed to DataLoader as-is.
data = list(zip(inputs['features'], inputs['labels']))
dl = DataLoader(data, batch_size=2)
example_feature, example_label = next(iter(dl))
print(example_feature, example_label, sep="\n")

Inputs : 
{'features': array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18],
       [23, 17, 18]]), 'labels': [1, 2, 3, 4, 5, 6, 7]}

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([1, 2])


## Data Collator

    In all of the examples above, every feature has the same length. But if the items have different lengths, the methods above will not work.

In [99]:
# Toy tokenized samples of varying length, each paired with an integer label.
nlp_data = [
    {'tokenized_input': token_ids, 'label': label}
    for token_ids, label in [
        ([1, 4, 5, 9, 3, 2], 0),
        ([1, 7, 3, 14, 48, 7, 23, 154, 2], 0),
        ([1, 30, 67, 117, 21, 15, 2], 1),
        ([1, 17, 2], 0),
        ([1, 23, 2], 0),
    ]
]

In [100]:
## The code below is intentionally left commented out because running it raises an error:
## the `tokenized_input` lists in `nlp_data` differ in length, so the batching used in the
## earlier examples (which relied on equal-length items) does not work here.
# loader = DataLoader(nlp_data, batch_size=2, shuffle=False)
# batch = next(iter(loader))  

What can we do? There are two solutions:

1. Pad the whole dataset to the longest example.
2. Pad dynamically during batch creation.

The first solution might seem more straightforward — just expand all examples to the longest one. But there is an issue — we will waste memory and computing power (they are expensive on GPU!) processing padding, which does not influence the result. It is especially painful if we have a few long sequences in the data and most of the others are relatively short. In such a case, we are mostly processing padding instead of data!

If we pad the whole dataset to the longest sequence, there is a lot of wasted space! An alternative is to pad the data on the fly. When samples for the batch are selected, we pad only them to the longest one. If we additionally order the data by length, the padding will be minimal. If there are a few very long sequences, they will only influence their batches- not the whole dataset.

In [101]:
from torch.nn.utils.rnn import pad_sequence

In [106]:
def custom_collate(batch):
    """Print the raw batch exactly as the DataLoader hands it over; returns None."""
    print(batch)
    return None

# Route batching through custom_collate to see what a collate function receives.
loader = DataLoader(dataset=nlp_data, collate_fn=custom_collate, batch_size=2)
next(iter(loader))

[{'tokenized_input': [1, 4, 5, 9, 3, 2], 'label': 0}, {'tokenized_input': [1, 7, 3, 14, 48, 7, 23, 154, 2], 'label': 0}]


In [122]:
def custom_collate(batch):
    """Collate a list of sample dicts into a padded tensor plus a label list.

    Each item in ``batch`` is a dict with a ``'tokenized_input'`` list of ints
    and a ``'label'``. The token lists are turned into 1-D tensors and padded
    with ``pad_sequence`` (default arguments) to the longest sequence in this
    batch. The intermediate values are printed for inspection.

    Returns
    -------
    tuple
        ``(padded_inputs, labels)`` where ``labels`` is a plain Python list.
    """
    sequences = [torch.tensor(sample['tokenized_input']) for sample in batch]
    targets = [sample['label'] for sample in batch]

    print(f'Tokenized input : {sequences}')
    print(f'Label :  {targets}\n')

    # Pad only up to the longest sequence of the current batch.
    return (pad_sequence(sequences), targets)

# Show the raw samples first, then one dynamically padded batch.
pprint(nlp_data)
print()
loader = DataLoader(dataset=nlp_data, collate_fn=custom_collate, batch_size=2)
print("\n")
print(next(iter(loader)))

[{'label': 0, 'tokenized_input': [1, 4, 5, 9, 3, 2]},
 {'label': 0, 'tokenized_input': [1, 7, 3, 14, 48, 7, 23, 154, 2]},
 {'label': 1, 'tokenized_input': [1, 30, 67, 117, 21, 15, 2]},
 {'label': 0, 'tokenized_input': [1, 17, 2]},
 {'label': 0, 'tokenized_input': [1, 23, 2]}]



Tokenized input : [tensor([1, 4, 5, 9, 3, 2]), tensor([  1,   7,   3,  14,  48,   7,  23, 154,   2])]
Label :  [0, 0]

(tensor([[  1,   1],
        [  4,   7],
        [  5,   3],
        [  9,  14],
        [  3,  48],
        [  2,   7],
        [  0,  23],
        [  0, 154],
        [  0,   2]]), [0, 0])


Step by step:

For padding we use pad_sequence.

Collate function takes a single argument — a list of examples. In this case, it will be a list of dicts, but it also can be a list of tuples, etc. — depending on the dataset.

As the data comes in the format “list of dicts”, we need to traverse it and create separate lists for all inputs and labels. In the meantime, each tokenized_input is converted to a 1-D tensor (it was a list of ints).

Perform the padding.

The labels are kept as a plain Python list of ints here; wrap them with `torch.tensor(label)` if the model needs a tensor.

Return formatted batch.

Set our custom function in the loader.

As we can see, the batch is a tuple of the padded input tensor and the list of labels. Each column of the tensor is one example, padded with zeros only up to the longest sequence in that batch, so the amount of padding is minimal.