## ~.Dataset() 和 ~.DataLoader()

在 PyTorch 中，`torch.utils.data` 模块提供了方便的数据集（`Dataset`）和数据迭代器（`DataLoader`）。
在每次训练中，我们可以使用这个迭代器按 batch size 逐批输出数据，并且可以在输出之前对数据做预处理或数据增强。

### ~.Dataset()

In [25]:
import torch
from torch.utils.data import Dataset

class TensorDataset(Dataset):
    """Expose a pair of aligned tensors as an indexable dataset.

    Subclasses ``Dataset`` and overrides ``__init__``, ``__getitem__`` and
    ``__len__`` so that a features tensor and a targets tensor can be
    consumed together: indexing returns one ``(x, y)`` pair and ``len()``
    returns the number of samples.

    Args:
        x: Feature tensor; its first dimension enumerates the samples.
        y: Targets, indexable with the same indices as ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, x, y):
        # Fail at construction time instead of with an IndexError in the
        # middle of an epoch when the two inputs are misaligned.
        if x.size(0) != len(y):
            raise ValueError(
                "x and y must contain the same number of samples, "
                "got {} and {}".format(x.size(0), len(y))
            )
        self.x = x
        self.y = y

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples (size of the first dimension of ``x``)."""
        return self.x.size(0)
# Try the class out: 120 two-feature samples (standard normal, halved and
# shifted so they centre on -2) paired with 120 standard-normal targets.
x = torch.randn(120, 2) / 2 - 2
y = torch.randn(120)
# Bundle both tensors into a single dataset object.
t_dataset = TensorDataset(x, y)
# Fetch one (features, target) pair by index.
t_dataset[1]


(tensor([-2.0204, -1.8964]), tensor(-0.2659))

In [8]:
t_dataset.__len__()

120

### ~.DataLoader()

DataLoader 将 Dataset 对象或者自定义数据类的对象封装成一个迭代器，迭代器可以按批迭代输出 dataset 的内容，并支持多进程加载、打乱（shuffle）、采样（sample）、批整理（collate）等多种操作。
`__init__()` 的主要输入参数：
1. dataset 要加载的数据集
2. batch_size 每个 batch 的样本数
3. shuffle 是否在每个 epoch 打乱数据
4. collate_fn 处理不同情况下输入dataset的封装，一般默认即可
5. batch_sampler 一般采取默认
6. sampler 与shuffle互斥，默认即可
7. num_workers 用于加载数据的子进程数量（0 表示在主进程中加载）

In [26]:
from torch.utils.data import DataLoader
# Wrap the dataset in an iterator that yields shuffled mini-batches.
tensor_dataloader = DataLoader(
    t_dataset,        # dataset to draw samples from
    batch_size=8,     # number of samples per yielded batch
    shuffle=True,     # reshuffle the sample order on every pass
    num_workers=0,    # 0 = load in the main process, no worker subprocesses
)

# Consume the loader with a plain for-loop; each step yields one batch.
for data, target in tensor_dataloader:
    print(data, target)


tensor([[-2.5042, -2.1659],
        [-1.8833, -1.1500],
        [-1.9991, -2.1198],
        [-1.8401, -1.9984],
        [-1.6414, -2.1670],
        [-1.9247, -1.8506],
        [-1.1528, -1.9911],
        [-1.7750, -1.8837]]) tensor([-1.5444,  1.9749, -0.2567, -0.6198,  0.3054,  0.2757, -0.2918, -0.6072])
tensor([[-1.8771, -1.9792],
        [-1.9391, -2.5095],
        [-2.2484, -1.9347],
        [-2.2756, -1.2855],
        [-2.6127, -1.9398],
        [-2.2896, -1.8757],
        [-1.9209, -1.5757],
        [-1.8822, -1.7244]]) tensor([-0.3374,  0.0069,  0.9390, -1.4069,  1.0069, -0.6879,  0.9389, -0.1247])
tensor([[-2.2115, -1.6545],
        [-1.8845, -1.9511],
        [-2.3268, -1.4210],
        [-2.5839, -2.7000],
        [-2.1112, -1.4714],
        [-2.0672, -3.4426],
        [-2.3766, -1.9899],
        [-2.0631, -2.6673]]) tensor([-2.1607,  0.7821, -0.3751,  0.3690,  0.9473, -0.3641,  0.6716, -1.9142])
tensor([[-2.5535, -1.9998],
        [-2.1357, -1.8139],
        [-1.9401, -2.0147]

### transforms
transforms 用于对数据集做预处理、数据增强等一系列操作，包括 `Compose`、`ToTensor` 等内置操作，也可以使用自定义操作。

In [None]:
# 看看别人是怎么用的
# from PIL import Image
# from torchvision import transforms
# from torch.utils.data import Dataset


# class MyDataset(Dataset):
#     def __init__(self, data_dir, transforms=None):
#         self.data_info = self.get_img_info(data_dir)
#         self.transforms = transforms

#     def __getitem__(self, item):
#         path_img, label = self.data_info[item]
#         image = Image.open(path_img).convert('RGB')
#         # 使用定义好的transforms，对数据进行处理
#         if self.transforms is not None:
#             image = self.transforms(image)

#         return image, label

#     def __len__(self):
#         return len(self.data_info)

# train_transforms = transforms.Compose([transforms.ToTensor(),
#                                        transforms.RandomHorizontalFlip(0.5)])
# train_dataset = MyDataset(data_dir, train_transforms)


In [11]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [31]:
# 定义模型
import torch.nn as nn
import torch.nn.functional as F
class Mynet(nn.Module):
    """Fully connected regressor: 2 -> 256 -> 128 -> 1 with ReLU activations."""

    def __init__(self):
        super().__init__()
        # Layers are registered in input-to-output order.
        self.fc1 = nn.Linear(2, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, x):
        """Map inputs whose last dimension is 2 to one prediction per sample.

        The trailing size-1 dimension produced by ``fc3`` is squeezed away.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        prediction = self.fc3(hidden)
        return prediction.squeeze(-1)

# Instantiate the network and print its layer summary.
model = Mynet()
print(model)
        


Mynet(
  (fc1): Linear(in_features=2, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)


In [28]:
# Mean-squared-error objective for the regression targets.
loss_fn = nn.MSELoss()
# Plain SGD over the parameters of the current `model` instance.
# NOTE: the optimizer keeps references to these exact parameters, so this
# line must be re-run whenever `model` is re-created.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [29]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader`` and print progress.

    For every batch the model output is compared with the targets through
    ``loss_fn``, gradients are back-propagated, and ``optimizer`` takes one
    step. On every second batch (indices 0, 2, 4, ...) a line of the form
    ``current<N> loss is <loss>`` is printed, where ``N`` is the number of
    samples consumed before that batch.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module being trained; it is switched to training mode.
        loss_fn: Callable taking ``(prediction, target)`` and returning a
            scalar loss tensor.
        optimizer: Optimizer bound to the parameters of ``model``.

    Returns:
        None. Results are reported on stdout only.
    """
    model.train()
    # Running count of samples processed so far. Multiplying the batch index
    # by the current batch length would misreport progress whenever the
    # final batch is smaller than the others.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        y_hat = model(X)
        loss = loss_fn(y_hat, y)

        # Clear gradients left over from the previous step before
        # back-propagating this batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 2 == 0:
            print('current{} loss is {}'.format(seen, loss.item()))
        seen += len(X)

In [32]:
# Run several full passes over the data, printing the pass index and a
# separator line before each one.
epoch = 5
for t in range(epoch):
    banner = str(t) + '\n---------------------'
    print(banner)
    train(tensor_dataloader, model, loss_fn, optimizer)

0
---------------------
current0 loss is 0.7304892539978027
current16 loss is 0.7407615780830383
current32 loss is 0.5027478337287903
current48 loss is 0.9335276484489441
current64 loss is 1.7129956483840942
current80 loss is 0.40431901812553406
current96 loss is 0.5802524089813232
current112 loss is 0.6539322733879089
1
---------------------
current0 loss is 0.5178716778755188
current16 loss is 0.4191153645515442
current32 loss is 0.8373433351516724
current48 loss is 0.6799297332763672
current64 loss is 0.34582436084747314
current80 loss is 1.3609826564788818
current96 loss is 0.9500986933708191
current112 loss is 1.084706425666809
2
---------------------
current0 loss is 0.5745450258255005
current16 loss is 1.5159958600997925
current32 loss is 0.44541501998901367
current48 loss is 1.9395828247070312
current64 loss is 1.1680681705474854
current80 loss is 0.22981217503547668
current96 loss is 0.18289926648139954
current112 loss is 0.5761668682098389
3
---------------------
current0 los