### 以生成器(Generator)自定義資料讀取器(Data Loader)

In [1]:
import random
import numpy as np

In [2]:
# 在進行讀取前，會先將資料做打亂(shuffle)，增加隨機性
def shuffle_aligned_list(data):
    """Random shuffle data (x and y)"""
    
    indice = list(range(len(data[0])))
    random.shuffle(indice) #inplace shuffle indice
    
    return [d[indice] for d in data]

# 定義data loader
def data_loader(data, batch_size, shuffle=True):
    """Generate batches of data.
    """
    if shuffle:
        # do data shuffling
        data = shuffle_aligned_list(data)

    batch_count = 0
    while True:
        if batch_count * batch_size + batch_size > len(data[0]):
            # if it's last batch, then reload data
            batch_count = 0

            if shuffle:
                data = shuffle_aligned_list(data)
                
        # get start and end indice
        start = batch_count * batch_size
        end = start + batch_size
        batch_count += 1
        
        yield [d[start:end] for d in data]

In [3]:
# generate fake data
x = np.array(range(1,101)).reshape((-1,5))
y = np.array(range(1,21)).reshape((-1,1))

In [4]:
# 定義batch size (一個batch有多少比資料)與epochs(一個epoch為跑完所有資料一次)
batch_size = 5
epochs = 10

# define data loader
batch_loader = data_loader([x,y], batch_size)

In [5]:
for i in range(epochs):
    for j in range(len(x)//batch_size): #一個epoch為跑完所有資料一次
        batch_x,batch_y = next(batch_loader) #generator用next讀取資料
        if int(i%2)==0:
            print('Epoch %d'%int(i/2))
            print(f'x: {batch_x}, y: {batch_y}')

Epoch 0
x: [[16 17 18 19 20]
 [91 92 93 94 95]
 [11 12 13 14 15]
 [31 32 33 34 35]
 [ 1  2  3  4  5]], y: [[ 4]
 [19]
 [ 3]
 [ 7]
 [ 1]]
Epoch 0
x: [[76 77 78 79 80]
 [81 82 83 84 85]
 [51 52 53 54 55]
 [46 47 48 49 50]
 [ 6  7  8  9 10]], y: [[16]
 [17]
 [11]
 [10]
 [ 2]]
Epoch 0
x: [[36 37 38 39 40]
 [61 62 63 64 65]
 [86 87 88 89 90]
 [56 57 58 59 60]
 [41 42 43 44 45]], y: [[ 8]
 [13]
 [18]
 [12]
 [ 9]]
Epoch 0
x: [[ 26  27  28  29  30]
 [ 21  22  23  24  25]
 [ 96  97  98  99 100]
 [ 71  72  73  74  75]
 [ 66  67  68  69  70]], y: [[ 6]
 [ 5]
 [20]
 [15]
 [14]]
Epoch 1
x: [[26 27 28 29 30]
 [21 22 23 24 25]
 [86 87 88 89 90]
 [ 1  2  3  4  5]
 [76 77 78 79 80]], y: [[ 6]
 [ 5]
 [18]
 [ 1]
 [16]]
Epoch 1
x: [[ 96  97  98  99 100]
 [ 16  17  18  19  20]
 [ 31  32  33  34  35]
 [ 46  47  48  49  50]
 [ 81  82  83  84  85]], y: [[20]
 [ 4]
 [ 7]
 [10]
 [17]]
Epoch 1
x: [[11 12 13 14 15]
 [71 72 73 74 75]
 [66 67 68 69 70]
 [41 42 43 44 45]
 [51 52 53 54 55]], y: [[ 3]
 [15]
 [14]
 [ 9

### 以Pytorch Dataset與DataLoader建構資料讀取器(Data Loader)

在Pytorch中許多的客製化建構，都會使用到**類別的繼承**，而這邊的Dataset也不例外。

Pytorch中的**torch.utils.data.Dataset**是一個代表dataset的抽象類別，我們自定義的dataset需要繼承此類別且覆寫以下兩種方法:

* __len__: 讓我們可以使用len(dataset)取的資料集大小
* __getitem__: 主要取得資料的方法，讓我們可以用dataset[i]的方式取得第i筆資料

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

In [6]:
class CustomDataset(Dataset):
    """self defined dataset"""
    
    def __init__(self, data):
        
        self.x = data[0]
        self.y = data[1]
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        
        x = torch.tensor(self.x[idx])
        y = torch.tensor(self.y[idx])
        
        return x, y

In [7]:
# generate fake data
x = np.array(range(1,101)).reshape((-1,5))
y = np.array(range(1,21)).reshape((-1,1))

In [8]:
# 建構自定義dataset
custom_dataset = CustomDataset([x,y])
# 使用index讀取資料
custom_dataset[0]

(tensor([1, 2, 3, 4, 5], dtype=torch.int32), tensor([1], dtype=torch.int32))

In [10]:
# 定義data loader
custom_dataloader = DataLoader(custom_dataset, batch_size=5, shuffle=True, num_workers=2)

# 讀取單筆資料
next(iter(custom_dataloader))

RuntimeError: DataLoader worker (pid(s) 5104, 16872) exited unexpectedly

In [11]:
epochs = 10

for i in range(epochs):
    for data in custom_dataloader:
        if int(i%2)==0:
            print('Epoch %d'%int(i/2))
            print(f'x: {data[0]}, y: {data[1]}')

Epoch 0
x: tensor([[81, 82, 83, 84, 85],
        [46, 47, 48, 49, 50],
        [61, 62, 63, 64, 65],
        [71, 72, 73, 74, 75],
        [76, 77, 78, 79, 80]]), y: tensor([[17],
        [10],
        [13],
        [15],
        [16]])
Epoch 0
x: tensor([[ 96,  97,  98,  99, 100],
        [ 66,  67,  68,  69,  70],
        [ 56,  57,  58,  59,  60],
        [  1,   2,   3,   4,   5],
        [  6,   7,   8,   9,  10]]), y: tensor([[20],
        [14],
        [12],
        [ 1],
        [ 2]])
Epoch 0
x: tensor([[26, 27, 28, 29, 30],
        [21, 22, 23, 24, 25],
        [16, 17, 18, 19, 20],
        [86, 87, 88, 89, 90],
        [11, 12, 13, 14, 15]]), y: tensor([[ 6],
        [ 5],
        [ 4],
        [18],
        [ 3]])
Epoch 0
x: tensor([[31, 32, 33, 34, 35],
        [41, 42, 43, 44, 45],
        [36, 37, 38, 39, 40],
        [91, 92, 93, 94, 95],
        [51, 52, 53, 54, 55]]), y: tensor([[ 7],
        [ 9],
        [ 8],
        [19],
        [11]])
Epoch 1
x: tensor([[  1,   