In [1]:
import contextlib
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
from d2l import torch as d2l
from ptflops import get_model_complexity_info
from torch import nn

from alexnet_CIFAR100 import CIFAR100
from alexnet_FashionMnist import FashionMnist
from googlenet_FashionMnist import Googlenet
from resnet_FashionMnist import resnet_FashionMnist
from train import train_func
from train_layers import train_layers
from vgg_FashionMnist import vgg

In [2]:
# Build one instance of every network that can be profiled / trained below.
alexnet_fashionmnist = FashionMnist()
alexnet_cifar100 = CIFAR100()
resnet_fashionmnist = resnet_FashionMnist()
# Previously this was built from resnet_FashionMnist(), so any "VGG" run would
# silently have measured ResNet instead.
# NOTE(review): assumes vgg() needs no arguments, like the other constructors
# in this cell -- confirm against vgg_FashionMnist.py.
vgg_fashionmnist = vgg()
googlenet_fashionmnist = Googlenet()

##### Using ptflops to estimate the model's computational cost (MACs) and parameter count

In [3]:
# Profile one network with ptflops and print its per-layer statistics.
#
# To profile another model, change `net` (and `input_res` for 3-channel data):
#   alexnet_fashionmnist   -> (1, 224, 224)
#   alexnet_cifar100       -> (3, 224, 224)
#   resnet_fashionmnist    -> (1, 224, 224)
#   vgg_fashionmnist       -> (1, 224, 224)
#   googlenet_fashionmnist -> (1, 224, 224)
net = googlenet_fashionmnist
input_res = (1, 224, 224)

# torch.cuda.device(0) can only be entered on a CUDA machine; fall back to a
# no-op context so the cell also runs on CPU/MPS hosts.
if torch.cuda.is_available():
    device_ctx = torch.cuda.device(0)
else:
    device_ctx = contextlib.nullcontext()

with device_ctx:
    macs, params = get_model_complexity_info(net, input_res, as_strings=True,
                                             print_per_layer_stat=True, verbose=True)

    # Identity check: we want to know whether this exact model object is selected.
    if net is resnet_fashionmnist:
        print(torch.backends.mps.is_built())
        print(torch.backends.mps.is_available())

    print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    print('{:<30}  {:<8}'.format('Number of parameters: ', params))

Sequential(
  5.98 M, 100.000% Params, 1.51 GMac, 99.594% MACs, 
  (0): Sequential(
    3.2 k, 0.054% Params, 41.75 MMac, 2.752% MACs, 
    (0): Conv2d(3.2 k, 0.054% Params, 40.14 MMac, 2.646% MACs, 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(0, 0.000% Params, 802.82 KMac, 0.053% MACs, )
    (2): MaxPool2d(0, 0.000% Params, 802.82 KMac, 0.053% MACs, kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (1): Sequential(
    114.94 k, 1.923% Params, 361.87 MMac, 23.856% MACs, 
    (0): Conv2d(4.16 k, 0.070% Params, 13.05 MMac, 0.860% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(0, 0.000% Params, 200.7 KMac, 0.013% MACs, )
    (2): Conv2d(110.78 k, 1.853% Params, 347.42 MMac, 22.903% MACs, 64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(0, 0.000% Params, 602.11 KMac, 0.040% MACs, )
    (4): MaxPool2d(0, 0.000% Params, 602.11 KMac, 0.040% MACs, kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=

**我们构造一个**高度和宽度都为224的**单通道数据，来观察每一层输出的形状**。
下面的输出对应上一个单元格中选定的网络（此处为GoogLeNet）；原教程在此处对照的是AlexNet的架构图。


In [4]:
# Push one random dummy image through `net`, one top-level child at a time,
# and print the output shape after each of them.
X = torch.randn(1, 1, 224, 224) # FashionMNIST

# X = torch.randn(1, 3, 224, 224) # CIFAR100

# This is only a shape check, so no gradients are needed: no_grad() avoids
# recording an autograd graph for the whole forward pass.
with torch.no_grad():
    for layer in net:
        X = layer(X)
        print(layer.__class__.__name__,'output shape:\t',X.shape)

Sequential output shape:	 torch.Size([1, 64, 56, 56])
Sequential output shape:	 torch.Size([1, 192, 28, 28])
Sequential output shape:	 torch.Size([1, 480, 14, 14])
Sequential output shape:	 torch.Size([1, 832, 7, 7])
Sequential output shape:	 torch.Size([1, 1024])
Linear output shape:	 torch.Size([1, 10])


## 读取数据集

尽管原文中AlexNet是在ImageNet上进行训练的，但本书在这里使用的是Fashion-MNIST数据集。因为即使在现代GPU上，训练ImageNet模型，同时使其收敛可能需要数小时或数天的时间。
将AlexNet直接应用于Fashion-MNIST的一个问题是，**Fashion-MNIST图像的分辨率**（$28 \times 28$像素）**低于ImageNet图像。**
为了解决这个问题，**我们将它们增加到$224 \times 224$**（通常来讲这不是一个明智的做法，但在这里这样做是为了有效使用AlexNet架构）。
这里需要使用`d2l.load_data_fashion_mnist`函数中的`resize`参数执行此调整。


In [5]:
def get_dataloader_workers():
    """Return the number of worker processes the data loaders should use.

    The value is fixed at 4.
    """
    num_workers = 4
    return num_workers
def load_data_cifar100(batch_size, resize=None):
    """Download the CIFAR-100 dataset and wrap it in train/test data loaders.

    Args:
        batch_size: Batch size handed to both data loaders.
        resize: Optional target size passed to ``transforms.Resize``; when
            falsy, the images are only converted to tensors.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles
        its samples, the test loader does not.
    """
    # Resize (if requested) must run before the tensor conversion.
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    pipeline = transforms.Compose(steps)

    def _cifar_split(is_train):
        # Both splits share the same root directory and transform pipeline.
        return torchvision.datasets.CIFAR100(
            root="../data", train=is_train, transform=pipeline, download=True)

    train_set = _cifar_split(True)
    test_set = _cifar_split(False)

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=get_dataloader_workers())
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=batch_size, shuffle=False,
        num_workers=get_dataloader_workers())
    return (train_loader, test_loader)

In [6]:
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224) # FashionMNIST

# train_iter, test_iter = load_data_cifar100(batch_size, resize=224) # CIFAR100

# Number of batches per epoch. len() on the loader gives this directly;
# iterating the whole training set just to count batches would load and
# resize every image once for nothing.
list_of_i = list(range(len(train_iter)))
print('the shape of the train_iter is:', np.array(list_of_i).shape)

# Show the tensor shape of the first 10 batches only. zip() stops as soon as
# range(10) is exhausted, so no extra batch is fetched.
for i, (X, y) in zip(range(10), train_iter):
    print('the shape of the', i, 'batch of the train_iter is:', X.shape)

the shape of the train_iter is: (469,)
the shape of the 0 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 1 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 2 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 3 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 4 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 5 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 6 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 7 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 8 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 9 batch of the train_iter is: torch.Size([128, 1, 224, 224])


In [7]:
# All measurement results are stored below the directory the notebook runs in.
# (`os` is imported in the import cell at the top of the notebook.)
working_dir = os.getcwd()
print('The working dir is: ', working_dir)

# Directory that receives the per-round results of the selected model.
train_data = os.path.join(working_dir, 'Data/Googlenet_data')
# train_data = os.path.join(working_dir, 'Googlenet_train_data')
print('The train_data dir is: ', train_data)

The working dir is:  /root/GreenAI/GPU/universal
The train_data dir is:  /root/GreenAI/GPU/universal/Data/Googlenet_data


In [8]:
# Keep the results directory both as a plain string and as a pathlib.Path;
# the Path form is used to create the per-round sub-directories.
train_data_str = f'{train_data}'
print('The train_data_str is: ', train_data_str)
train_data_path = Path(train_data_str)

The train_data_str is:  /root/GreenAI/GPU/universal/Data/Googlenet_data


In [9]:
# One output directory per measurement round: round_1 ... round_<NUM_ROUNDS>.
NUM_ROUNDS = 10
for i in range(1, NUM_ROUNDS + 1):
    dir_path = train_data_path / f'round_{i}'
    dir_path.mkdir(parents=True, exist_ok=True)


def _round_sort_key(path):
    """Sort key that orders 'round_<n>' folders numerically (round_2 before round_10).

    Folders without a numeric suffix are placed after the numbered ones and
    ordered by their path.
    """
    suffix = os.path.basename(path).rpartition('_')[2]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


# find out all the subfolders in the train_data
# os.scandir() yields entries in arbitrary order, so sort explicitly to process
# the rounds in a reproducible sequence; the `with` block closes the scan handle.
with os.scandir(train_data) as entries:
    subfolders = sorted((f.path for f in entries if f.is_dir()), key=_round_sort_key)
print('The folders in the train_data are: ', subfolders)

The folders in the train_data are:  ['/root/GreenAI/GPU/universal/Data/Googlenet_data/round_1', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_2', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_3', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_4', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_5', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_6', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_7', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_8', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_9', '/root/GreenAI/GPU/universal/Data/Googlenet_data/round_10']


In [10]:
lr, num_epochs = 0.01, 10
device = d2l.try_gpu()

# Network to benchmark. Alternatives built earlier in the notebook:
# alexnet_fashionmnist, resnet_fashionmnist, vgg_fashionmnist (FashionMNIST)
# and alexnet_cifar100 (CIFAR100 -- also needs the CIFAR100 data loaders).
benchmark_net = googlenet_fashionmnist # FashionMNIST for Googlenet

# NOTE(review): the same model object is passed to train_func in every round.
# Unless train_func re-initialises the weights itself, later rounds continue
# from the previous round's weights -- confirm against train.py.
for subfolder in subfolders:
    # `subfolders` already holds full paths to the round directories, so it is
    # used as-is; joining it onto train_data again is only correct by accident
    # when both paths are absolute.
    working_diri = subfolder
    print('The sub_folder dir is: ', working_diri)

    (Time_AllEpochs, TestAcc, TrainLoss, TrainAcc, TimeEpoch,
     Energy_AllEpochs, TrainTime, TTrainAccLoss) = train_func(
        benchmark_net, train_iter, test_iter, num_epochs, lr, device)

    # Save every returned measurement to the round directory as <name>.npy.
    round_results = {
        'Time_AllEpochs': Time_AllEpochs,
        'TestAcc': TestAcc,
        'TrainLoss': TrainLoss,
        'TrainAcc': TrainAcc,
        'TimeEpoch': TimeEpoch,
        'Energy_AllEpochs': Energy_AllEpochs,
        'TrainTime': TrainTime,
        'TTrainAccLoss': TTrainAccLoss,
    }
    for result_name, values in round_results.items():
        np.save(os.path.join(working_diri, f'{result_name}.npy'), values)
    
    

The sub_folder dir is:  /root/GreenAI/GPU/universal/Data/Googlenet_data/round_1
training on cuda:0
epoch 1


Epoch 1 completed: Avg Loss: 2.302415628051758, Avg Accuracy: 0.10278333333333334
test acc is 0.100000
epoch 1, time 40.764217 sec
epoch 2
Epoch 2 completed: Avg Loss: 2.301196041870117, Avg Accuracy: 0.1333
test acc is 0.100100
epoch 2, time 79.890367 sec
epoch 3
Epoch 3 completed: Avg Loss: 2.297764716084798, Avg Accuracy: 0.1633
test acc is 0.318600
epoch 3, time 119.226221 sec
epoch 4
Epoch 4 completed: Avg Loss: 2.279448267110189, Avg Accuracy: 0.26421666666666666
test acc is 0.299500
epoch 4, time 158.232580 sec
epoch 5
Epoch 5 completed: Avg Loss: 1.9377761805216471, Avg Accuracy: 0.32685
test acc is 0.456600
epoch 5, time 198.657584 sec
epoch 6
Epoch 6 completed: Avg Loss: 1.1397277678171793, Avg Accuracy: 0.55635
test acc is 0.615000
epoch 6, time 237.622959 sec
epoch 7
Epoch 7 completed: Avg Loss: 0.8867527978261311, Avg Accuracy: 0.6550166666666667
test acc is 0.708400
epoch 7, time 277.054525 sec
epoch 8
Epoch 8 completed: Avg Loss: 0.749682247543335, Avg Accuracy: 0.7148
t