In [1]:
%matplotlib inline
import torch
import numpy as np
import matplotlib.pyplot as plt

# Fix the seed of PyTorch's global random number generator.
torch.manual_seed(123456)
# edgeitems=2: summarized tensor printouts show 2 items at each end of a dimension.
torch.set_printoptions(edgeitems=2)

# Emit a single empty line as the cell's only output.
print()




In [2]:
# 导入本文档所需的库
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

### 1. 先查看目录中的图片

请根据[02-torchvison.datasets-保存图片到文件中.ipynb](./02-torchvison.datasets-保存图片到文件中.ipynb)准备好图片。

In [3]:
# Check that the target directory actually contains images: count the matching .png files.
!ls ../../data/images/CIFAR10/**/*.png | wc -l

     100


In [4]:
# Show the image directory one level deep (-L 1).
!tree ../../data/images/CIFAR10/ -L 1

[01;34m../../data/images/CIFAR10/[0m
├── [01;34mairplane[0m
├── [01;34mautomobile[0m
├── [01;34mbird[0m
├── [01;34mcat[0m
├── [01;34mdeer[0m
├── [01;34mdog[0m
├── [01;34mfrog[0m
├── [01;34mhorse[0m
├── [01;34mship[0m
└── [01;34mtruck[0m

11 directories, 0 files


### 2. 使用ImageFolder

### 2.1 实例化数据转换实例

In [5]:
# Preprocessing pipeline that is handed to ImageFolder as its `transform`.
target_size = (16, 16)
channel_mean = [0.5, 0.5, 0.5]
channel_std = [0.5, 0.5, 0.5]

pipeline_steps = [
    # Resize the image; not strictly required here, included for practice.
    transforms.Resize(target_size),
    # Convert the PIL image into a Tensor.
    transforms.ToTensor(),
    # Normalize the image tensor with the given mean and std.
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
data_transforms = transforms.Compose(pipeline_steps)

In [6]:
# Display the composed pipeline to check its steps and their parameters.
data_transforms

Compose(
    Resize(size=(16, 16), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
)

### 2.2 加载图片数据

如果`ImageFolder`不传递`transform`，那么默认就是`None`

In [7]:
# Root folder of the image files, relative to this notebook.
images_dir = "../../data/images/CIFAR10"
# Build the dataset over that folder, passing the preprocessing pipeline as its transform.
images = datasets.ImageFolder(
    root=images_dir,
    transform=data_transforms,
)

In [8]:
# Display the dataset summary (number of datapoints, root location, transform).
images

Dataset ImageFolder
    Number of datapoints: 100
    Root location: ../../data/images/CIFAR10
    StandardTransform
Transform: Compose(
               Resize(size=(16, 16), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
           )

In [9]:
# Check the type of the dataset object.
type(images)

torchvision.datasets.folder.ImageFolder

In [10]:
# Number of samples in the dataset.
len(images)

100

In [11]:
# Indexing the dataset returns one sample; check what container it comes in.
type(images[0])

tuple

In [12]:
# Each sample is a pair: an image tensor and a label.
type(images[0][0]), type(images[0][1])

(torch.Tensor, int)

In [13]:
# Label of the sample at index 99, i.e. the last of the 100 samples.
images[99][1]

9

**看一下不传递`transform`**：

In [14]:
# Same image folder, but without a transform, to compare what the samples look like.
images2 = datasets.ImageFolder(root=images_dir)

In [15]:
# Check the type of the dataset object created without a transform.
type(images2)

torchvision.datasets.folder.ImageFolder

In [16]:
# Check what container a single sample comes in when no transform is set.
type(images2[0])

tuple

In [17]:
# Check the types of the image and the label of the first sample when no transform is set.
type(images2[0][0]), type(images2[0][1])

(PIL.Image.Image, int)

In [18]:
# Print the image object of the first sample loaded without a transform.
print(images2[0][0])

<PIL.Image.Image image mode=RGB size=32x32 at 0x17536DED0>


### 2.3 使用DataLoader

为了进行批处理和数据加载的并行化，通常会使用`DataLoader`。    
常用的参数：
- `dataset`: 必填，数据集实例
- `batch_size`: 每个批次的样本数量， 默认是1
- `shuffle`: `True`表示每次迭代前都打乱数据顺序
- `num_workers`: 默认`0`(数据将在主进程中加载), 使用多少个子进程来并发加载数据
- `collate_fn`: 一个可选的函数，用于将样本列表转换为小批量。默认情况下，它会堆叠张量
- `pin_memory`: 如果使用GPU，是否将张量复制到`CUDA`的固定内存中以加速数据传输，默认`False`
- `drop_last`: 如果数据集大小不能被`batch_size`整除，是否丢弃最后一个不完整的批次，默认是`False`
- `timeout`: 数据加载超时时间，单位为秒，防止加载数据的时候卡死，默认是`0`（无超时）

In [19]:
# Number of samples delivered per batch.
batch_size = 10
# Wrap the dataset in a loader that shuffles the data and loads it with 2 worker subprocesses.
data_loader = DataLoader(
    dataset=images,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

In [20]:
# Check the type of the loader object.
type(data_loader)

torch.utils.data.dataloader.DataLoader

`DataLoader`实例化的时候`batch_size=10`，那么当for执行的时候，每轮会迭代`ceil(len(dataset) / batch_size)`次（`drop_last=False`时向上取整），也就是`len(data_loader)`次

In [21]:
# Expected number of batches per pass over the dataset.
# NOTE: `/` is true division, so the result is a float; it equals the real batch
# count only when len(images) is an exact multiple of batch_size.
len(images) / batch_size

10.0

In [22]:
# Iterate over the whole dataset twice (two epochs) and print the labels of every batch.
num_epochs = 2
count = 0  # running total of batches seen across all epochs
for epoch in range(1, num_epochs + 1):
    # enumerate(..., start=1) supplies the 1-based batch index within the current epoch,
    # replacing the hand-maintained counter.
    for i, (imgs, labels) in enumerate(data_loader, start=1):
        count += 1
        print(f"epoch={epoch}, i = {i}, count={count}\tlabels:{labels}")
    print("")  # blank line between epochs
print(f"count = {count}")

epoch=1, i = 1, count=1	labels:tensor([6, 2, 1, 0, 3, 9, 1, 2, 0, 2])
epoch=1, i = 2, count=2	labels:tensor([3, 7, 4, 1, 5, 9, 8, 4, 2, 5])
epoch=1, i = 3, count=3	labels:tensor([7, 2, 2, 2, 9, 4, 7, 6, 1, 0])
epoch=1, i = 4, count=4	labels:tensor([3, 5, 4, 4, 2, 6, 4, 3, 0, 1])
epoch=1, i = 5, count=5	labels:tensor([1, 5, 2, 7, 3, 3, 7, 1, 3, 7])
epoch=1, i = 6, count=6	labels:tensor([3, 1, 1, 3, 2, 1, 0, 4, 1, 9])
epoch=1, i = 7, count=7	labels:tensor([3, 9, 7, 8, 9, 7, 5, 9, 0, 1])
epoch=1, i = 8, count=8	labels:tensor([5, 7, 7, 1, 6, 3, 2, 8, 4, 4])
epoch=1, i = 9, count=9	labels:tensor([9, 9, 4, 6, 3, 5, 9, 2, 6, 7])
epoch=1, i = 10, count=10	labels:tensor([3, 8, 6, 4, 2, 9, 1, 9, 1, 1])

epoch=2, i = 1, count=11	labels:tensor([7, 6, 2, 6, 1, 2, 9, 9, 2, 4])
epoch=2, i = 2, count=12	labels:tensor([5, 7, 3, 3, 3, 1, 3, 4, 6, 7])
epoch=2, i = 3, count=13	labels:tensor([1, 5, 1, 1, 7, 7, 3, 9, 9, 6])
epoch=2, i = 4, count=14	labels:tensor([8, 7, 3, 0, 0, 0, 3, 4, 7, 9])
epoch=2, i = 