In [1]:
import contextlib
import time

import numpy as np
import pandas as pd
import psutil
import torch
from d2l import torch as d2l
from ptflops import get_model_complexity_info
from torch import nn

from alexnet import AlexNet
from train_layers import train_layers
# from train import train_func

In [2]:
# Build the network under test (AlexNet class from the local `alexnet` module).
# The same instance is reused by the profiling, shape-check and training cells below.
alexnet = AlexNet()

##### Using ptflops to count the model's multiply–accumulate operations (MACs) and parameters

In [3]:
# Count MACs and parameters with ptflops for one single-channel 224x224 input.
# CUDA device 0 is selected only when CUDA is actually available, so this cell
# also runs on CPU-only machines instead of raising when entering the context.
device_ctx = torch.cuda.device(0) if torch.cuda.is_available() else contextlib.nullcontext()
with device_ctx:
    # `net` is an alias of `alexnet`; the shape-check cell iterates over it.
    net = alexnet
    macs, params = get_model_complexity_info(net, (1, 224, 224), as_strings=True,
                                             print_per_layer_stat=True, verbose=True)
    print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    print('{:<30}  {:<8}'.format('Number of parameters: ', params))
    # NOTE(review): as_strings=True is requested, so `params` is presumably a
    # formatted string rather than a number — confirm before doing arithmetic on it.
    alexnet_para_num = params

Sequential(
  46.76 M, 100.000% Params, 939.85 MMac, 99.883% MACs, 
  (0): Conv2d(11.71 k, 0.025% Params, 34.15 MMac, 3.630% MACs, 1, 96, kernel_size=(11, 11), stride=(4, 4), padding=(1, 1))
  (1): ReLU(0, 0.000% Params, 279.94 KMac, 0.030% MACs, )
  (2): MaxPool2d(0, 0.000% Params, 279.94 KMac, 0.030% MACs, kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(614.66 k, 1.314% Params, 415.51 MMac, 44.158% MACs, 96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (4): ReLU(0, 0.000% Params, 173.06 KMac, 0.018% MACs, )
  (5): MaxPool2d(0, 0.000% Params, 173.06 KMac, 0.018% MACs, kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Conv2d(885.12 k, 1.893% Params, 127.46 MMac, 13.546% MACs, 256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): ReLU(0, 0.000% Params, 55.3 KMac, 0.006% MACs, )
  (8): Conv2d(1.33 M, 2.839% Params, 191.16 MMac, 20.315% MACs, 384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (9): 

[**我们构造一个**]高度和宽度都为224的(**单通道数据，来观察每一层输出的形状**)。
它与 :numref:`fig_alexnet`中的AlexNet架构相匹配。


In [4]:
# Push one random single-channel 224x224 image through the network one layer
# at a time and report the tensor shape that each layer produces.
X = torch.randn(1, 1, 224, 224)
for layer in net:
    X = layer(X)
    print(type(layer).__name__, 'output shape:\t', X.shape)

Conv2d output shape:	 torch.Size([1, 96, 54, 54])
ReLU output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Conv2d output shape:	 torch.Size([1, 256, 26, 26])
ReLU output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 256, 12, 12])
ReLU output shape:	 torch.Size([1, 256, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 256, 5, 5])
Flatten output shape:	 torch.Size([1, 6400])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1,

## 读取数据集

尽管原文中AlexNet是在ImageNet上进行训练的，但本书在这里使用的是Fashion-MNIST数据集。因为即使在现代GPU上，训练ImageNet模型，同时使其收敛可能需要数小时或数天的时间。
将AlexNet直接应用于Fashion-MNIST的一个问题是，[**Fashion-MNIST图像的分辨率**]（$28 \times 28$像素）(**低于ImageNet图像。**)
为了解决这个问题，(**我们将它们增加到$224 \times 224$**)（通常来讲这不是一个明智的做法，但在这里这样做是为了有效使用AlexNet架构）。
这里需要使用`d2l.load_data_fashion_mnist`函数中的`resize`参数执行此调整。


In [5]:
# Load Fashion-MNIST with every image resized to 224x224.
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

# Number of mini-batches per epoch. Asking the loader for its length avoids
# decoding and resizing the entire training set just to count batches.
# Assumes train_iter supports len() (e.g. a torch DataLoader over a map-style
# dataset) — TODO confirm if the loader implementation changes.
num_batches = len(train_iter)
# Batch indices 0..num_batches-1, kept so code relying on this name still works.
list_of_i = list(range(num_batches))
print('the shape of the train_iter is:', (num_batches,))

# Show the tensor shape of the first 10 batches, then stop iterating.
for i, (X, y) in enumerate(train_iter):
    if i >= 10:
        break
    print('the shape of the', i, 'batch of the train_iter is:', X.shape)

the shape of the train_iter is: (469,)
the shape of the 0 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 1 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 2 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 3 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 4 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 5 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 6 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 7 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 8 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 9 batch of the train_iter is: torch.Size([128, 1, 224, 224])


## [**训练AlexNet**]

现在AlexNet可以开始被训练了。与 :numref:`sec_lenet`中的LeNet相比，这里的主要变化是使用更小的学习速率训练，这是因为网络更深更广、图像分辨率更高，训练卷积神经网络就更昂贵。


In [6]:
# Training hyper-parameters: learning rate and number of epochs.
lr = 0.01
num_epochs = 1
# Device chosen by d2l.try_gpu() (the recorded run reports "training on cuda:0").
device = d2l.try_gpu()

# Train with the per-layer instrumented loop and collect every metric it returns.
(Time_Layers, Time_AllEpochs, TestAcc, TrainLoss, TrainAcc,
 TimeEpoch, Energy_AllEpochs, TrainTime, Timport) = train_layers(
    alexnet, train_iter, test_iter, num_epochs, lr, device)
# Disabled alternative without the per-layer timings (its import is commented out above):
# Time_AllEpochs, TestAcc, TrainLoss, TrainAcc, TimeEpoch, Energy_AllEpochs, TrainTime, Timport= train_func(alexnet, train_iter, test_iter, num_epochs, lr, device)

training on cuda:0
epoch 1
round 0
time to device 0.005736 sec
time forward 0.164138 sec
loss time 0.018266 sec
backward time 0.108599 sec
optimizer time 0.007306 sec
training time in batch 0 cost 0.30516529083251953 sec
loss 2.297448, train acc 0.132812
round 1
time to device 0.004025 sec
time forward 0.000880 sec
loss time 0.000080 sec
backward time 0.001153 sec
optimizer time 0.000223 sec
training time in batch 1 cost 0.006602287292480469 sec
loss 2.299442, train acc 0.105469
round 2
time to device 0.003998 sec
time forward 0.000804 sec
loss time 0.000071 sec
backward time 0.001695 sec
optimizer time 0.000207 sec
training time in batch 2 cost 0.007006645202636719 sec
loss 2.300937, train acc 0.098958
round 3
time to device 0.004010 sec
time forward 0.000935 sec
loss time 0.000076 sec
backward time 0.001257 sec
optimizer time 0.000223 sec
training time in batch 3 cost 0.006720781326293945 sec
loss 2.302328, train acc 0.091797
round 4
time to device 0.003973 sec
time forward 0.000892 

In [7]:
def _print_first_row(header, labels, table):
    """Print `header`, then each label next to table[0, k], one entry per line.

    All values are looked up before anything is written, and the pieces are
    handed to a single print() call so the spacing matches a hand-written
    print(header, label0, value0, '\n', label1, value1, ...).
    """
    pieces = [header]
    for col, label in enumerate(labels):
        if col:
            pieces.append('\n')
        pieces.extend((label, table[0, col]))
    print(*pieces)


RULE = '*' * 50

# Forward time per layer type; columns 0-5 of the first row of Time_Layers.
_print_first_row(
    'Forward Layers Time: \n',
    ['Conv2d time: ', 'ReLU time: ', 'MaxPool2d time: ',
     'Linear time: ', 'Dropout time: ', 'Flatten time: '],
    Time_Layers)
print(RULE)

# Time per training stage; columns 0-5 of the first row of Time_AllEpochs.
_print_first_row(
    'Time_AllEpochs: \n',
    ['Time to Device time: ', 'Forward time: ', 'Calculate Loss time: ',
     'Backward time: ', 'Optimize time: ', 'Test time: '],
    Time_AllEpochs)
print(RULE)

# Remaining metrics are printed whole, each followed by a separator line.
for label, value in [
    ('Train Time of each epoch:', TrainTime),
    ('Import data to ndarray time:', Timport),
    ('TestAcc:', TestAcc),
    ('TrainLoss:', TrainLoss),
    ('TrainAcc:', TrainAcc),
    ('TimeEpoch:', TimeEpoch),
]:
    print(label, value)
    print(RULE)

# Energy record, plus the sum and the length of its [0, 0] entry.
first_epoch_energy = Energy_AllEpochs[0, 0]
print('Energy_AllEpochs:', Energy_AllEpochs, '\n',
      'Total Energy:', np.sum(first_epoch_energy), '\n',
      'The time of the first epoch:', len(first_epoch_energy))

Time_AllEpochs: 
 Time to Device time:  [1.88681507] 
 Forward time:  [0.57212591] 
 Calculate Loss time:  [0.05389261] 
 Backward time:  [0.74353361] 
 Optimize time:  [0.10294962] 
 Test time:  [2.78524184]
**************************************************
Train Time of each epoch: [3.4555859565734863]
**************************************************
Import data to ndarray time: [0]
**************************************************
TestAcc: [0.7477]
**************************************************
TrainLoss: [[2.297447681427002, 2.299442410469055, 2.3009370962778726, 2.30232834815979, 2.302603816986084, 2.3029493490854898, 2.302363395690918, 2.302022308111191, 2.3014929029676647, 2.301532769203186, 2.301142454147339, 2.3009710709253945, 2.3006096803225002, 2.3002463238579884, 2.3000536918640138, 2.299908086657524, 2.29983291906469, 2.299415535397, 2.2994698599765173, 2.299320328235626, 2.299118405296689, 2.2990413145585493, 2.298825678618058, 2.2986112336317697, 2.2985249614715

In [8]:
# import os
# working_dir = os.getcwd()
# working_dir

# # find out the parent directory
# parent_dir1 = os.path.dirname(working_dir)
# print(parent_dir1)

# parent_dir2 = os.path.dirname(parent_dir1)
# print(parent_dir2)

# parent_dir3 = os.path.dirname(parent_dir2)
# print(parent_dir3)

# data_folder = os.path.join(parent_dir3, 'data')
# print(data_folder)

In [9]:
# find the second_part folder
# second_part_dir = os.path.join(data_folder, 'epoch_20SGD_GPU')
# second_part_dir

In [10]:
# Disabled cell: would persist the collected metrics as .npy files under second_part_dir.
# NOTE(review): `second_part_dir`, `os`, and the arrays named below (time_forward,
# time_round, test_acc, train_acc, train_l, time_epoch, energy_data_epoch) are not
# assigned in the cells visible here; the training cell produces Time_Layers,
# TestAcc, TrainLoss, ... instead. Reconcile the names before re-enabling.
# # save the data as .npy file
# np.save(os.path.join(second_part_dir, 'time_forward.npy'), time_forward)
# np.save(os.path.join(second_part_dir, 'time_round.npy'), time_round)
# np.save(os.path.join(second_part_dir, 'test_acc.npy'), test_acc)
# np.save(os.path.join(second_part_dir, 'train_acc.npy'), train_acc)
# np.save(os.path.join(second_part_dir, 'train_l.npy'), train_l)
# np.save(os.path.join(second_part_dir, 'time_epoch.npy'), time_epoch)
# np.save(os.path.join(second_part_dir, 'energy_epoch.npy'), energy_data_epoch)