## PyTorch DataLoader

지금까지는 데이터를 전부 읽어서 한꺼번에 모델에 넣고 딥러닝 알고리즘을 적용하였다. 데이터가 그리 크지 않았기 때문이다. <br>
하지만 실제 프로젝트를 진행하다 보면 데이터 용량이 천문학적으로 커지게 된다. 이때 등장하는 개념이 **batch**(batch size)이다.


### Terminology 

###### epoch

one forward pass and one backward pass of all the training examples

###### batch_size

the number of training examples in one forward/backward pass. The higher the batch size, the more memory space you'll need.

###### iterations

number of iterations = number of passes, each pass using [batch size] number of examples. One pass = one forward pass + one backward pass.

(EX) If you have 1000 training examples and your batch size is 500, then it will take 2 iterations to complete 1 epoch.


#### 하지만 PyTorch에서 제공하는 DataLoader를 사용하면, original data를 randomly shuffle 하여 queue에 쌓고 iterable 한 data를 만드는 일련의 과정을 알아서 해주기 때문에 직접 할 필요가 없어진다.

### DataLoader

DataLoader를 사용하려면 우리가 Custom Dataset을 만들어야 한다.<br>
Custom Dataset을 만들 때는 두 가지만 구현해 주면 된다 (so simple)

1. `__getitem__(self, index)`
    
        # return one item on the given index
        
2. `__len__(self)`

        # return the data length


In [45]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import from_numpy, tensor
import numpy as np
from torch.autograd import Variable

In [52]:
# Custom Dataset (the object a DataLoader draws samples from)
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric file.

    Each row of the file is one sample: every column except the last is an
    input feature, and the last column is the target value.
    """

    def __init__(self, path='data/diabetes.csv'):
        """Read the whole file into memory and split it into inputs/targets.

        Args:
            path: Location of the comma-separated file to load. The default
                is the path that used to be hard-coded here.
        """
        # ndmin=2 keeps a single-row file two-dimensional; without it
        # np.loadtxt returns a 1-D array and the column slicing below fails.
        xy = np.loadtxt(path, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]
        self.x_data = torch.from_numpy(xy[:, 0:-1])
        # Indexing with the list [-1] keeps the target as a column, so each
        # sample's target has shape (1,), matching the model's output shape.
        self.y_data = torch.from_numpy(xy[:, [-1]])
        # Debug dump of what was loaded (kept so notebook output is unchanged).
        print(self.len, self.x_data, self.y_data)

    def __getitem__(self, index):
        """Return the ``(inputs, target)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows that were loaded."""
        return self.len

    
# Build the dataset once, then let DataLoader reshuffle it and hand out
# mini-batches of 32 samples.
dataset = DiabetesDataset()
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2,  # number of worker subprocesses used to load batches
)

759 tensor([[-0.2941,  0.4874,  0.1803,  ...,  0.0015, -0.5312, -0.0333],
        [-0.8824, -0.1457,  0.0820,  ..., -0.2072, -0.7669, -0.6667],
        [-0.0588,  0.8392,  0.0492,  ..., -0.3055, -0.4927, -0.6333],
        ...,
        [-0.4118,  0.2161,  0.1803,  ..., -0.2191, -0.8574, -0.7000],
        [-0.8824,  0.2663, -0.0164,  ..., -0.1028, -0.7686, -0.1333],
        [-0.8824, -0.0653,  0.1475,  ..., -0.0939, -0.7976, -0.9333]]) tensor([[0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
  

In [51]:
class Model(torch.nn.Module):
    """Three stacked linear layers, each followed by a sigmoid."""

    def __init__(self):
        """Create the 8->6, 6->4 and 4->1 linear layers and the sigmoid."""
        super().__init__()

        self.l1 = torch.nn.Linear(8, 6)
        self.l2 = torch.nn.Linear(6, 4)
        self.l3 = torch.nn.Linear(4, 1)

        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Pass ``x`` through l1, l2 and l3 in order, applying the sigmoid
        after every layer, and return the final activation."""
        activate = self.sigmoid
        hidden = x
        for layer in (self.l1, self.l2, self.l3):
            hidden = activate(layer(hidden))
        return hidden

# our model
model = Model()

# Construct the loss function and the optimizer.
# model.parameters() hands SGD the learnable weights and biases of the
# nn.Linear layers that are members of the model.
# reduction='mean' is the current spelling of the deprecated
# size_average=True: the loss is averaged over the elements of the batch.
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


# Show the shape of every learnable tensor, then how many there are.
params = list(model.parameters())
for param in params:
    print(param.size())
print(len(params))
print(params[0].size())  # weight of the first linear layer (l1)

# Training loop: 2 passes (epochs) over the loader, one update per batch.
for epoch in range(2):
    for i, (inputs, labels) in enumerate(train_loader):
        # Tensors are fed to the model directly; the autograd Variable
        # wrapper is deprecated and no longer needed.

        # Forward pass
        y_pred = model(inputs)

        # Compute and print loss
        loss = criterion(y_pred, labels)
        print(epoch, i, loss.item())

        # Zero gradients, perform a backward pass, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.Size([6, 8])
torch.Size([6])
torch.Size([4, 6])
torch.Size([4])
torch.Size([1, 4])
torch.Size([1])
6
torch.Size([6, 8])
0 0 0.8390055298805237
0 1 1.0275843143463135
0 2 0.9603310823440552
0 3 0.8564997315406799
0 4 0.8601959943771362
0 5 0.8405967354774475
0 6 0.7869867086410522
0 7 0.8097572922706604
0 8 0.7800126075744629
0 9 0.8221293091773987
0 10 0.7655919790267944
0 11 0.7249827980995178
0 12 0.78365558385849
0 13 0.7541194558143616
0 14 0.7375591993331909
0 15 0.7117654085159302
0 16 0.7125611305236816
0 17 0.6992925405502319
0 18 0.689765989780426
0 19 0.6903635263442993
0 20 0.6803292036056519
0 21 0.6946393847465515
0 22 0.6913390755653381
0 23 0.7008209228515625
1 0 0.6693582534790039
1 1 0.6620118021965027
1 2 0.6916953325271606
1 3 0.6646742224693298
1 4 0.6610709428787231
1 5 0.6712883114814758
1 6 0.678400993347168
1 7 0.6472041010856628
1 8 0.6682885885238647
1 9 0.6501188278198242
1 10 0.6095486283302307
1 11 0.6428996920585632
1 12 0.652522623538971
1 13 0.6754