In [1]:
import os
# Point both the http_proxy and https_proxy environment variables at the proxy
# on localhost port 7890. The address is machine-specific: adjust or skip this
# cell when running elsewhere.
os.environ.update(
    dict.fromkeys(('http_proxy', 'https_proxy'), 'http://127.0.0.1:7890')
)

In [2]:
from torchvision.datasets import MNIST
from torchvision.datasets import CIFAR100
from torch.utils.data import ConcatDataset
import numpy as np

- dataloader 是对 dataset 的进一步封装；

## dataloader

In [2]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from torch.utils.data import DataLoader

# Load the sentence-embedding checkpoint by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two toy sentence pairs, one row per example: (first text, second text, label).
train_examples = [
    InputExample(texts=[first_text, second_text], label=pair_label)
    for first_text, second_text, pair_label in (
        ('This is a positive pair', 'Where the distance will be minimized', 1),
        ('This is a negative pair', 'Their distance will be increased', 0),
    )
]

[2024-02-06 20:48:42,066] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  return self.fget.__get__(instance, owner)()


In [14]:
# Wrap the example list in a shuffling loader that yields two examples per batch.
train_dataloader = DataLoader(
    train_examples,
    batch_size=2,
    shuffle=True,
)

### len

In [8]:
# Number of batches the loader yields per pass (not the number of samples).
len(train_dataloader)

1

### sampler

In [5]:
# Inspect the sampler object the loader uses to pick sample indices.
train_dataloader.sampler

<torch.utils.data.sampler.RandomSampler at 0x7fb09001d5a0>

### collate_fn: 指定如何将一批数据样本组合成一个批次(batch)

- list of `<x, y>` (`dataset.__getitem__`) => batch X tensor, batch y tensor

In [7]:
# Inspect the function the loader currently uses to merge a list of samples into a batch.
train_dataloader.collate_fn

<function torch.utils.data._utils.collate.default_collate(batch)>

```
# from dataset => dataloader
class _MapDatasetFetcher(_BaseDatasetFetcher):
    def fetch(self, possibly_batched_index):
        if self.auto_collation:
            if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
                data = self.dataset.__getitems__(possibly_batched_index)
            else:
                data = [self.dataset[idx] for idx in possibly_batched_index]
        else:
            data = self.dataset[possibly_batched_index]
        return self.collate_fn(data)
```

### next(iter(dataloader))

In [10]:
# Intentionally left disabled.
# NOTE(review): presumably the loader's current collate function cannot batch
# InputExample objects, so this call would fail at this point -- confirm.
# next(iter(train_dataloader))

In [15]:
# Replace the loader's collate function with the one provided by the model,
# then pull a single batch to look at its structure.
train_dataloader.collate_fn = model.smart_batching_collate
next(iter(train_dataloader))

([{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3893, 3940,  102],
           [ 101, 2023, 2003, 1037, 4997, 3940,  102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1]])},
  {'input_ids': tensor([[  101,  2073,  1996,  3292,  2097,  2022, 18478,  2094,   102],
           [  101,  2037,  3292,  2097,  2022,  3445,   102,     0,     0]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
           [1, 1, 1, 1, 1, 1, 1, 0, 0]])}],
 tensor([1, 0]))

## ConcatDataset

In [4]:
# Download (if not already present) and load the MNIST training split.
mnist_data = MNIST('./data/', train=True, download=True)
print('mnist: ', len(mnist_data))

# The dataset loaded here is CIFAR-100, so the variable is named to match.
cifar100_data = CIFAR100('./data', train=True, download=True)
print('cifar: ', len(cifar100_data))

# ConcatDataset chains the datasets end to end, in the order they are listed.
concat_data = ConcatDataset([mnist_data, cifar100_data])
print('concat_data: ', len(concat_data))

# Plain subscript indexing instead of calling __getitem__ directly.
# Index 133 is smaller than len(mnist_data), so it lands in the first dataset.
img, target = concat_data[133]
print(np.array(img).shape)
print(target)

mnist:  60000
Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:14<00:00, 11771577.02it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
cifar:  50000
concat_data:  110000
(28, 28)
9


## custom dataset

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import make_classification

In [6]:
# Generate a synthetic classification problem with 1000 samples.
# random_state is fixed so the same data is produced on every
# Restart-and-Run-All; without it each run draws different samples.
data, targets = make_classification(n_samples=1000, random_state=42)
print(len(data), len(targets))
# Peek at the first feature vector and its label.
print(data[0], targets[0])

1000 1000
[-0.50732797 -0.0859023   0.79519987 -0.941882    0.60729342  0.37327327
 -1.35511381  0.08934151  0.38568075 -0.82372423  0.21789479  1.14796323
  0.38797855  0.23849993 -1.66507864  0.39428038 -2.59608648 -0.97139603
 -0.32160851  0.16779007] 0


In [14]:
# Show which concrete dtypes the torch.float and torch.long aliases refer to.
print(torch.float, torch.long)

torch.float32 torch.int64


In [12]:
class CustomDataset(Dataset):
    """Map-style dataset pairing a feature array with a target array.

    Each item is returned as a dict so batches can be accessed by key.

    Args:
        data: Feature container exposing ``.shape`` and ``[idx, :]`` indexing
            (one row per sample).
        targets: Target container indexable by ``[idx]``, with one entry per
            row of ``data``.

    Raises:
        ValueError: If ``data`` and ``targets`` hold a different number of
            samples.
    """

    def __init__(self, data, targets):
        # Fail early: __len__ only looks at `data`, so a length mismatch would
        # otherwise surface later as an IndexError or as silently ignored targets.
        if data.shape[0] != len(targets):
            raise ValueError(
                f"data and targets must have the same number of samples, "
                f"got {data.shape[0]} and {len(targets)}"
            )
        self.data = data
        self.targets = targets

    def __len__(self):
        """Return the number of samples (rows of ``data``)."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{'x': float tensor, 'y': long tensor}``."""
        return {
            'x': torch.tensor(self.data[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx], dtype=torch.long)
        }

In [13]:
# Wrap the generated arrays in the custom dataset and display its first item.
dataset = CustomDataset(data, targets)
dataset[0]

{'x': tensor([-0.5073, -0.0859,  0.7952, -0.9419,  0.6073,  0.3733, -1.3551,  0.0893,
          0.3857, -0.8237,  0.2179,  1.1480,  0.3880,  0.2385, -1.6651,  0.3943,
         -2.5961, -0.9714, -0.3216,  0.1678]),
 'y': tensor(0)}

In [15]:
# Number of samples in the dataset, as reported by CustomDataset.__len__.
len(dataset)

1000

In [16]:
# Batch the custom dataset 32 samples at a time, loading with 4 workers.
train_dataloader = DataLoader(
    dataset,
    num_workers=4,
    batch_size=32,
)

In [19]:
# Fetch one batch and show the shape of its stacked 'x' features.
next(iter(train_dataloader))['x'].shape

torch.Size([32, 20])

In [22]:
# Iterate the loader the way a training loop would, unpacking each batch by key.
for batch in train_dataloader:
    batch_x = batch['x']
    batch_y = batch['y']
    print(batch_x.shape, batch_y.shape)
    # Only the first batch is needed for this demonstration.
    break

torch.Size([32, 20]) torch.Size([32])
