# 6. TensorBoard

# 6.1 TensorBoard 使用

* TensorBoard 是一个可视化工具，可以把训练过程中记录的数据（标量、直方图、图像、计算图等）自动绘制成图表
* PyTorch 1.1.0 及以上的版本增加了对 TensorBoard 的支持
* 安装命令：`conda install tensorboard`

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from collections import OrderedDict
from collections import namedtuple
from itertools import product
import time
from IPython.display import clear_output
import json

# Widen the line width used when printing tensors.
torch.set_printoptions(linewidth=120)

# Print the installed library versions.
print(torch.__version__)
print(torchvision.__version__)
!tensorboard --version

1.6.0
0.7.0
TensorFlow installation not found - running with reduced feature set.
2.2.1


In [2]:
def get_num_corrent(preds, labels):
    """Return how many predictions match their labels.

    Args:
        preds: Tensor of per-class scores; the predicted class of each row
            is the index of its largest value along dim 1.
        labels: Tensor of target class indices compared element-wise with
            the predicted classes.

    Returns:
        The number of matching positions as a plain Python number.
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes.eq(labels)
    return hits.sum().item()

In [3]:
class Network(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first linear layer expects 12*4*4 flattened features per sample and
    the final layer produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

        # Fully connected classifier head.
        self.fc1 = nn.Linear(12 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 60)
        self.out = nn.Linear(60, 10)

    def forward(self, t):
        """Run the forward pass and return the raw (un-normalized) outputs."""
        # Each conv stage: convolution -> ReLU -> 2x2 max pooling.
        for conv in (self.conv1, self.conv2):
            t = F.max_pool2d(F.relu(conv(t)), kernel_size=2, stride=2)

        # Collapse everything except the batch dimension for the linear layers.
        t = t.flatten(start_dim=1)
        for hidden in (self.fc1, self.fc2):
            t = F.relu(hidden(t))

        # No activation on the output layer.
        return self.out(t)

In [4]:
# FashionMNIST training split stored under ./data/FashionMNIST/.
# download=True is passed so the data can be fetched when it is missing;
# the ToTensor transform is applied to every sample.
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST/',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor()
    ])
)

In [None]:
# Candidate batch sizes and learning rates for the hyperparameter sweep.
batch_size_list = [10, 100]
lr_list = [.01]

- PyTorch 的 `SummaryWriter` 提供了对 TensorBoard 的支持
- 常用的方法有
    - `add_scalar`：记录标量（如损失、准确率）
    - `add_histogram`：记录张量取值的分布（如权重、梯度）

In [None]:
# Train one fresh network per (batch_size, lr) combination and log every run
# to its own TensorBoard event file.
for batch_size in batch_size_list:
    for lr in lr_list:
        network = Network()
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)

        # One sample batch is used for the image grid and for tracing the graph.
        images, labels = next(iter(train_loader))
        grid = torchvision.utils.make_grid(images)

        comment = f'batch_size={batch_size} lr={lr}'
        print(comment)

        tb = SummaryWriter(comment=comment)
        # Close the writer of *every* run (the original closed only the last
        # one, after both loops), even if training raises.
        try:
            tb.add_image('images', grid)
            tb.add_graph(network, images)

            optimizer = optim.Adam(network.parameters(), lr=lr)

            for epoch in range(5):
                total_loss = 0
                total_correct = 0

                for batch in train_loader:
                    images, labels = batch

                    preds = network(images)
                    loss = F.cross_entropy(preds, labels)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    # cross_entropy returns the batch mean; scale by the real
                    # number of samples so a smaller final batch is not
                    # over-weighted.
                    total_loss += loss.item() * len(labels)
                    total_correct += get_num_corrent(preds, labels)

                tb.add_scalar('Loss', total_loss, epoch)
                tb.add_scalar('Number Correct', total_correct, epoch)
                tb.add_scalar('Accuracy', total_correct / len(train_set), epoch)

                # Distribution of every parameter and of its latest gradient.
                for name, weight in network.named_parameters():
                    tb.add_histogram(name, weight, epoch)
                    tb.add_histogram(f'{name}.grad', weight.grad, epoch)

                print(f'epoch: {epoch}, total_correct: {total_correct}, loss: {total_loss}')
        finally:
            tb.close()

# 6.2 超参数调试
* 设置不同的超参数以查看性能，配合TensorBoard使用

### 6.2.1 用product生成运行参数

In [None]:
# Hyperparameter grid: each key maps to the list of values to try.
parameters = {
    'lr': [.01, .001],
    'batch_size': [10, 100, 1000],
    'shuffle': [True, False],
}

In [None]:
# The value lists of the grid, in key order.
param_values = list(parameters.values())
param_values

In [None]:
# The hyperparameter names of the grid, in insertion order.
param_keys = list(parameters)
param_keys

In [None]:
# Iterate over the Cartesian product of the value lists: one tuple per
# combination of (lr, batch_size, shuffle).
for lr, batch_size, shuffle in product(*param_values):
    print(lr, batch_size, shuffle)

这里`product(*param_values)`在参数前面加了一个`*`，表示把传入的列表解包（unpack）成多个独立的位置参数。
- 如这里`param_values`的值是`[[0.01, 0.001], [10, 100, 1000], [True, False]]`，解包后相当于调用`product([0.01, 0.001], [10, 100, 1000], [True, False])`。
- `product`函数用于计算笛卡尔积，用法是`product([1, 2], [3, 4])`，结果依次为`(1, 3), (1, 4), (2, 3), (2, 4)`

### 6.2.2 RunBuilder类

In [5]:
class RunBuilder():
    """Expand a hyperparameter grid into a list of concrete run configurations."""

    @staticmethod
    def get_runs(params):
        """Return one ``Run`` named tuple per combination of parameter values.

        Args:
            params: Mapping from parameter name to an iterable of candidate
                values. The names become the fields of the ``Run`` tuple.

        Returns:
            A list with the Cartesian product of all value iterables, each
            combination wrapped in a ``Run`` named tuple.
        """
        Run = namedtuple('Run', params.keys())
        combinations = product(*params.values())
        return list(map(Run._make, combinations))

- `Run = namedtuple('Run', params.keys())`定义了一个命名元组类型，可以看下面的例子；命名元组除了可以通过索引访问，还可以通过字段名访问。
- `Run`的字段名来自`params.keys()`
- `Run`的字段值来自`*v`，每个`v`是笛卡尔积中的一个元素，即一组具体的参数组合

In [None]:
# Example named tuple: fields are reachable by position and by name.
Student = namedtuple('Student', 'name age email')
s1 = Student(name='zhangsan', age=18, email='zhangsan@gmail.com')

In [None]:
# Print the whole named tuple, then a single field accessed by name.
print(s1)
print(s1.name)

In [None]:
# Hyperparameter grid used to demonstrate RunBuilder.
params = {
    'lr': [.1, .01, .001],
    'batch_size': [10, 100, 1000],
}

In [None]:
# Expand the grid into the list of Run tuples and display it.
runs = RunBuilder.get_runs(params)
runs

In [None]:
# Each run prints as a whole tuple and exposes its values as attributes.
for run in runs:
    print(run, run.lr, run.batch_size)

### 6.2.3 RunManager

- RunManager可以让程序更可读

In [6]:
class RunManager():
    """Track, log and export the metrics of a sequence of training runs.

    Expected call order for each run::

        begin_run -> (begin_epoch -> track_loss / track_num_correct ... -> end_epoch)* -> end_run

    Per-epoch results are accumulated in ``run_data`` (one ordered dict per
    epoch), written to TensorBoard and displayed as a pandas DataFrame.
    """

    def __init__(self):
        # State of the epoch currently in progress.
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # State of the run currently in progress and results of all runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects supplied to begin_run.
        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self, run, network, loader):
        """Start a new run and open a TensorBoard writer for it.

        Args:
            run: Named tuple of the run's hyperparameters (its ``_asdict()``
                is merged into every result row and its string form is used
                as the writer comment).
            network: The model being trained.
            loader: The data loader that feeds the training loop.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        # One sample batch is logged as an image grid and used to trace the graph.
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        self.tb.add_graph(self.network, images)

    def end_run(self):
        """Close the run's TensorBoard writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start a new epoch: bump the counter and reset the accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finish the epoch: log metrics, store a result row, display the table."""
        # Imported here so the name also resolves when the code is not run
        # inside an IPython kernel, where `display` is not a builtin.
        from IPython.display import display

        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Averages over the whole dataset behind the loader.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # A parameter that has not received a gradient (e.g. a frozen
            # one) has grad None; skip it instead of failing.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch_duration"] = epoch_duration
        results["run_duration"] = run_duration
        # Append the run's hyperparameters as extra columns.
        results.update(self.run_params._asdict())
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        # Replace the previously displayed table with the updated one.
        clear_output(wait=True)
        display(df)

    def track_loss(self, loss, batch_size=None):
        """Add one batch's loss to the epoch total.

        Args:
            loss: Scalar loss tensor of the batch; it is multiplied by the
                batch size before being accumulated.
            batch_size: Number of samples in the batch. Defaults to the
                loader's configured ``batch_size``; pass the real batch
                length when the final batch may be smaller.
        """
        if batch_size is None:
            batch_size = self.loader.batch_size
        self.epoch_loss += loss.item() * batch_size

    def track_num_correct(self, preds, labels):
        """Add the number of correct predictions of one batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        """Return how many argmax predictions (along dim 1) equal ``labels``."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write all collected rows to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data,
            orient='columns').to_csv(f'{fileName}.csv')
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

使用RunManager和RunBuilder类可以使得程序更易扩展

In [7]:
# Sweep definition: every combination of these values becomes one run.
params = OrderedDict(
    lr = [.01],
    batch_size =[1000],
    shuffle = [True, False]
)
m = RunManager()
for run in RunBuilder.get_runs(params):
    
    # A fresh network, loader and optimizer are built for every run.
    network = Network()
    loader = torch.utils.data.DataLoader(train_set, batch_size=run.batch_size, shuffle=run.shuffle)
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images, labels = batch
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            # Clear old gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Accumulate the batch's loss and correct count for the epoch.
            m.track_loss(loss)
            m.track_num_correct(preds, labels)
            
        m.end_epoch()
    m.end_run()
# Writes resuls.csv and resuls.json with one row per (run, epoch).
m.save('resuls')

Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run_duration,lr,batch_size,shuffle
0,1,1,0.945715,0.640083,8.050407,8.922229,0.01,1000,True
1,1,2,0.466817,0.823183,8.03354,17.085777,0.01,1000,True
2,1,3,0.39431,0.853383,7.610352,24.822081,0.01,1000,True
3,1,4,0.341661,0.874033,7.667969,32.603677,0.01,1000,True
4,1,5,0.327739,0.8782,7.693508,40.407475,0.01,1000,True
5,2,1,1.024029,0.606633,7.504553,8.189383,0.01,1000,False
6,2,2,0.564603,0.781017,7.58576,15.877475,0.01,1000,False
7,2,3,0.461118,0.829017,7.771919,23.765024,0.01,1000,False
8,2,4,0.399428,0.8528,7.68716,31.575365,0.01,1000,False
9,2,5,0.361144,0.866467,7.588447,39.287372,0.01,1000,False


### 6.2.4 使用DataLoader的多进程功能加速神经网络训练

* 使用`DataLoader`类的可选参数`num_workers`可以加速神经网络的训练
* `num_workers`告诉`DataLoader`实例使用多少个子进程来加载数据（`0`表示在主进程中加载）
* 选择`num_workers`取值的最好方式是进行试验

In [None]:
# Sweep over batch size, shuffling and the number of data-loading workers to
# compare epoch durations.
params = OrderedDict(
    lr = [.01],
    batch_size = [1000, 2000],
    shuffle = [True, False],  # was a full-width comma, which is a SyntaxError
    num_workers = [0, 1, 2, 4, 8, 16]
)
m = RunManager()
for run in RunBuilder.get_runs(params):

    # A fresh network, loader and optimizer are built for every run.
    network = Network()
    # Fully qualified: the bare name `DataLoader` is never imported in this file.
    loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=run.batch_size,
        shuffle=run.shuffle,
        num_workers=run.num_workers,
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images, labels = batch
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            # Clear old gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate the batch's loss and correct count for the epoch.
            m.track_loss(loss)
            m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()
# Writes resuls.csv and resuls.json with one row per (run, epoch).
m.save('resuls')