<a href="https://colab.research.google.com/github/forMwish/MyDeepLearn/blob/master/d2l_7_7_DenseNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 准备

In [None]:
# Mount Google Drive and switch to the working directory
from google.colab import drive
import os

# Mount point for Google Drive inside the Colab VM.
gdrive_path = '/gdrive'
drive.mount(gdrive_path, force_remount=True)

# Work inside a dedicated folder on Drive. The chdir into MyDrive is kept as a
# separate step so that a failed mount still surfaces as an error here instead
# of silently creating a local directory tree.
os.chdir("%s/MyDrive"%gdrive_path)
os.makedirs("./d2l_7.7", exist_ok=True)  # no error if the folder already exists
os.chdir("./d2l_7.7")

# Install d2l
os.system("pip install d2l==0.17.5")

# Pin matplotlib to a version compatible with the d2l release installed above.
# "-y" answers pip's confirmation prompt, which cannot be answered interactively
# when pip is launched through os.system.
os.system("pip uninstall -y matplotlib")
os.system("pip install matplotlib==3.1.3")

In [None]:
# 其它配置
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Turn off jedi-based completion in the notebook (the original note says this
# is what makes tab completion work here).
%config Completer.use_jedi = False

# Prefer the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("use device:", device)

# Use matplotlib's default style; switch to the commented-out line below for a
# dark theme.
plt.style.use("default")
# plt.style.use("dark_background")

# Fix the PyTorch random seed (CPU, and every CUDA device when present).
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

# Fix the NumPy random seed.
np.random.seed(0)

# Fix the seed of Python's built-in random module.
random.seed(0)

# 2. 开始

In [None]:
import torchvision
from torchvision import transforms

def get_data_iter(batch_size):
    """Build the Fashion-MNIST training and test data loaders.

    The dataset is downloaded into the current directory if it is not there
    yet, and every image is converted with ``transforms.ToTensor()``.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects. Only
        the training loader shuffles its samples.
    """
    to_tensor = transforms.ToTensor()

    def make_loader(is_train):
        # The training split is shuffled, the test split keeps its order.
        dataset = torchvision.datasets.FashionMNIST(
            "./", train=is_train, download=True, transform=to_tensor)
        return torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=is_train, num_workers=2)

    train_loader = make_loader(True)
    test_loader = make_loader(False)
    return (train_loader, test_loader)

In [None]:
# 统计精确个数
def accuracy_num(y_hat, y):
    """Count the correct predictions in a classification batch.

    Args:
        y_hat: Either a 2-D tensor of per-class scores (one row per sample,
            more than one column) or a tensor of already predicted labels.
        y: Tensor of ground-truth labels.

    Returns:
        The number of predictions equal to their label, as a Python float.
    """
    # Per-class scores are reduced to a predicted label first.
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    # The comparison must run for both input forms; it used to live inside the
    # branch above, which left `cmp` undefined for label inputs.
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())

def parse_gradient(net:torch.nn.Module):
    """Summarise the current gradient of every parameter of a model.

    Parameters whose ``.grad`` is ``None`` (for example frozen parameters, or
    ones that did not take part in the last backward pass) are skipped instead
    of raising an error.

    Args:
        net: Model whose ``named_parameters()`` are inspected.

    Returns:
        A dict of four parallel lists, one entry per parameter that has a
        gradient:
            "layer":     parameter name as reported by ``named_parameters()``
            "grad_sum":  sum of the gradient entries (Python float)
            "grad_mean": mean of the gradient entries (Python float)
            "grad_var":  ``Tensor.var()`` of the gradient entries (Python float)
    """
    stats = {"layer": [], "grad_sum": [], "grad_mean": [], "grad_var": []}
    for name, param in net.named_parameters():
        grad = param.grad
        if grad is None:
            # Nothing to summarise for this parameter.
            continue
        stats["layer"].append(name)
        stats["grad_sum"].append(grad.sum().item())
        stats["grad_mean"].append(grad.mean().item())
        stats["grad_var"].append(grad.var().item())
    return stats

In [None]:
import time

class Timer:
    """Small stopwatch for measuring elapsed time in seconds.

    The timer starts running when it is created, so ``get()`` and ``restart()``
    are safe to call even if ``begin()`` was never called. Readings come from
    ``time.perf_counter()``, a monotonic clock meant for measuring intervals;
    ``start`` and ``end`` are therefore only meaningful relative to each other.
    """

    def __init__(self):
        self.start = time.perf_counter()
        self.end = self.start

    def begin(self):
        """Start (or reset) the stopwatch at the current instant."""
        self.start = time.perf_counter()

    def get(self):
        """Return the seconds elapsed since the stopwatch was last started.

        The start point is left untouched, so repeated calls keep measuring
        from the same instant.
        """
        self.end = time.perf_counter()
        return self.end - self.start

    def restart(self):
        """Return the seconds elapsed since the last start, then start again.

        The new start point is exactly the instant at which the elapsed time
        was read, so consecutive ``restart()`` calls measure back-to-back
        intervals without gaps.
        """
        self.end = time.perf_counter()
        elapsed = self.end - self.start
        self.start = self.end
        return elapsed

In [None]:
# 每个 epoch 显示训练结果（loss & acc）
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

class PlotFrames:
    """Plot several pandas DataFrames, one stacked subplot per frame.

    Mapping between data and figure:
        frame               -> one axes (subplot)
        column of the frame -> one curve; the column label becomes the curve
                               label, followed by the last, max and min value
    """

    def __init__(self, frame_num, figsize=(10, 10), title=None):
        """Create the figure.

        Args:
            frame_num: Number of subplots (one per frame passed to `update`).
            figsize: Figure size forwarded to ``plt.subplots``.
            title: Optional sequence of subplot titles. Titles are applied only
                when their count matches the number of frames given to
                `update`.
        """
        # squeeze=False always yields a 2-D array of axes, also for a single
        # subplot, so no special case is needed; keep the single column.
        self.fig, axes = plt.subplots(frame_num, 1, figsize=figsize, squeeze=False)
        self.axes = axes[:, 0]
        # A fresh list per instance avoids sharing one mutable default object.
        self.title = [] if title is None else list(title)

    def update(self, *frames:pd.DataFrame):
        """Redraw every subplot from the given frames and show the figure.

        Args:
            *frames: DataFrames in subplot order; frame ``i`` is drawn on axes
                ``i``. Each column is plotted against its row position.
        """
        use_titles = len(self.title) == len(frames)
        for i, frame in enumerate(frames):
            ax = self.axes[i]
            ax.cla()
            ax.grid()
            if use_titles:
                ax.set_title(self.title[i])

            for column in frame.columns:
                y = frame.loc[:, column].values
                x = np.arange(len(y))
                label = column
                ax.plot(x, y, label=f"{label} {y[-1]:10.3} max:{y.max():5.3} min:{y.min():5.3}")

            # One legend per axes is enough; skip it when nothing was drawn.
            if len(frame.columns) > 0:
                ax.legend()

        display.display(self.fig)
        # wait=True defers the clearing until new output arrives, so the
        # figure stays visible until the next update replaces it.
        display.clear_output(wait=True)

    def save(self, path):
        """Save the current figure to `path` via ``Figure.savefig``."""
        self.fig.savefig(path)

In [None]:
def _evaluate(net, data_iter, loss, device):
    """Return (summed loss, correct predictions, sample count) of `net` on `data_iter`."""
    loss_sum, correct, seen = 0.0, 0.0, 0
    net.eval()
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                # Needed for BERT fine-tuning, where a batch is a list of tensors.
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            batch_size = y.numel()
            # The criterion returns the batch mean; weight it by the batch
            # size so that dividing by the sample count gives a true mean.
            loss_sum += loss(y_hat, y).item() * batch_size
            correct += accuracy_num(y_hat.cpu(), y.cpu())
            seen += batch_size
    return loss_sum, correct, seen


def _safe_ratio(total, count):
    """Return total / count, or NaN when count is zero (empty data iterator)."""
    return total / count if count else float("nan")


def train(net, train_iter, test_iter, num_epochs, lr, device, save_csv, init=True):
    """Train a classifier with SGD and cross-entropy, plotting and saving per epoch.

    After every epoch the function
      * collects gradient statistics of the last training batch,
      * evaluates loss and accuracy on `test_iter`,
      * redraws the six diagnostic plots (gradient sum/mean/variance, loss,
        accuracy, timing) and saves the figure,
      * writes the metric history of all epochs so far to `save_csv`,
      * saves the whole model to ``<save_csv without extension>/<epoch>.pt``.

    Args:
        net: Model to train; moved to `device`.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` evaluation batches.
        num_epochs: Number of passes over `train_iter`.
        lr: SGD learning rate.
        device: Device on which model and batches are placed.
        save_csv: Path of the CSV file with the metric history. Its name
            without extension is also used for the figure and the checkpoint
            directory.
        init: When true, Xavier-uniform initialise the weights of all
            ``nn.Linear`` and ``nn.Conv2d`` layers before training.
    """
    def init_weight(m):
        # Exact type match on purpose: subclasses are left untouched, as before.
        if type(m) in (torch.nn.Linear, torch.nn.Conv2d):
            torch.nn.init.xavier_uniform_(m.weight)

    if init:
        net.apply(init_weight)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = torch.nn.CrossEntropyLoss()

    title = ["grad_sum", "grad_mean", "grad_var", "loss", "acc", "time"]
    grad_plot = PlotFrames(len(title), figsize=(40, 30), title=title)
    t = Timer()

    save_name = os.path.splitext(save_csv)[0]
    grad_keys = ("grad_sum", "grad_mean", "grad_var")
    metric_columns = ["train_loss", "train_accurate", "test_loss", "test_accurate"]
    # One row per finished epoch; the DataFrames are rebuilt from these lists,
    # which avoids DataFrame.append (removed in pandas 2.0).
    grad_rows = {key: [] for key in grad_keys}
    metric_rows = []
    time_rows = []

    for epoch in range(num_epochs):
        # ---- training pass ----
        net.train()
        t.begin()
        loss_sum, correct, seen = 0.0, 0.0, 0
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()

            batch_size = y.numel()
            # Batch-mean loss times batch size = summed loss of the batch.
            loss_sum += l.item() * batch_size
            correct += accuracy_num(y_hat.detach().cpu(), y.cpu())
            seen += batch_size
        t_train = t.restart()

        # Gradients still hold the values of the last training batch.
        grad_data = parse_gradient(net)
        t_grad = t.restart()

        # ---- evaluation pass ----
        test_loss_sum, test_correct, test_seen = _evaluate(net, test_iter, loss, device)
        t_test = t.get()

        # ---- bookkeeping ----
        for key in grad_keys:
            grad_rows[key].append(grad_data[key])
        metric_rows.append([
            _safe_ratio(loss_sum, seen),
            _safe_ratio(correct, seen),
            _safe_ratio(test_loss_sum, test_seen),
            _safe_ratio(test_correct, test_seen),
        ])
        time_rows.append([t_train, t_grad, t_test])

        grad_frames = [pd.DataFrame(grad_rows[key], columns=grad_data["layer"])
                       for key in grad_keys]
        table = pd.DataFrame(metric_rows, columns=metric_columns)
        loss_pd = pd.DataFrame({"train": table["train_loss"], "test": table["test_loss"]})
        acc_pd = pd.DataFrame({"train": table["train_accurate"], "test": table["test_accurate"]})
        time_pd = pd.DataFrame(time_rows, columns=["train", "grad", "test"])

        # Plot
        grad_plot.update(*grad_frames, loss_pd, acc_pd, time_pd)

        # Save figure, metric history (one row per epoch) and model checkpoint.
        grad_plot.save(save_name)
        table.to_csv(save_csv)

        os.makedirs(save_name, exist_ok=True)
        torch.save(net, os.path.join(save_name, f"{epoch}.pt"))



In [None]:
# DenseNet building blocks: conv block, dense block and transition layer
from torch import nn
from torch.nn import functional as F

def conv_block(input_channels, num_channels):
    """Build one BatchNorm -> ReLU -> 3x3 convolution unit.

    Args:
        input_channels: Number of channels of the incoming feature map.
        num_channels: Number of channels produced by the convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers. The convolution uses
        padding 1, so height and width are preserved.
    """
    layers = [
        nn.BatchNorm2d(input_channels),
        nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1),
    ]
    return nn.Sequential(*layers)
    
class DenseBlock(nn.Module):
    """Dense block: each conv unit sees the concatenation of all earlier outputs.

    Args:
        num_convs: Number of conv units in the block.
        input_channels: Channels of the tensor entering the block.
        num_channels: Channels added by every conv unit.
    """

    def __init__(self, num_convs, input_channels, num_channels):
        super().__init__()
        # Unit k receives the block input plus the outputs of the k units
        # before it, hence input_channels + k * num_channels input channels.
        units = [
            conv_block(num_channels*idx + input_channels, num_channels)
            for idx in range(num_convs)
        ]
        self.net = nn.Sequential(*units)

    def forward(self, X):
        """Run all units, concatenating each output onto the channel axis."""
        features = X
        for unit in self.net:
            new_features = unit(features)
            features = torch.cat((features, new_features), dim=1)
        return features

def transition_block(input_channels, num_channels):
    """Build a transition layer: BatchNorm -> ReLU -> 1x1 conv -> 2x2 average pool.

    Args:
        input_channels: Number of channels of the incoming feature map.
        num_channels: Number of channels after the 1x1 convolution.

    Returns:
        An ``nn.Sequential`` of the four layers. The stride-2 average pooling
        halves height and width.
    """
    stages = [
        nn.BatchNorm2d(input_channels),
        nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*stages)

In [None]:
from torch import nn
from d2l import torch as d2l


# Training hyper-parameters: SGD learning rate, number of epochs, batch size.
lr, num_epochs = 0.1, 50
batch = 256

# Stem: 7x7 stride-2 convolution on the single input channel, followed by
# batch norm, ReLU and a 3x3 stride-2 max pooling.
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

# num_channels tracks the current number of channels; growth_rate is the
# number of channels each conv unit of a dense block adds.
num_channels, growth_rate = 64, 32
num_convs_in_dense_blocks = [4, 4, 4, 4]
blks = []
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    blks.append(DenseBlock(num_convs, num_channels, growth_rate))
    # Number of output channels of the dense block just added.
    num_channels += num_convs * growth_rate
    # Between dense blocks (not after the last one), add a transition layer
    # that halves the number of channels.
    if i != len(num_convs_in_dense_blocks) - 1:
        blks.append(transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

# Full network: stem, dense/transition blocks, then batch norm, ReLU, global
# average pooling and a 10-way linear classifier.
net = nn.Sequential(
    b1, *blks,
    nn.BatchNorm2d(num_channels), nn.ReLU(),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(num_channels, 10))

# Shape check: push a random 2x1x224x224 batch through the top-level layers
# and print each output shape. Note that training below resizes images to 96.
x = torch.rand(size=(2,1, 224,224), dtype=torch.float32)
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__,'output shape: \t',x.shape)

# Load Fashion-MNIST through d2l (images resized to 96) and train; the
# commented-out line is the alternative loader defined in this notebook.
# train_iter, test_iter = get_data_iter(batch)
train_iter, test_iter = d2l.load_data_fashion_mnist(batch, resize=96)
train(net, train_iter, test_iter, num_epochs, lr, device, save_csv="test_lre-1.csv", init=True)