In [1]:
import torch
from torch import nn
import torchvision
import torchvision.transforms as transforms
from d2l import torch as d2l
import numpy as np
import pandas as pd
from ptflops import get_model_complexity_info
from train import train_func
from train_nosync import train_func_nosync
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os

In [2]:
# Locate the `Model` directory that sits next to the notebook's working
# directory and make its per-dataset sub-folders importable.

# Working directory of the running kernel.
current_path = os.getcwd()
print('The current path is:', current_path)

# Its parent is treated as the project root.
parent_path = Path(current_path).parent
print('The parent path is:', parent_path)

# <project root>/Model holds one sub-folder of model definitions per dataset.
model_folder_path = os.path.join(parent_path, 'Model')
print('The model path is:', model_folder_path)

# Model definitions for FashionMNIST and for CIFAR100.
model_fashion_path = os.path.join(model_folder_path, 'FashionMNIST')
model_cifar_path = os.path.join(model_folder_path, 'CIFAR100')

# Register both folders on sys.path. The membership check keeps the cell
# idempotent: re-running it must not stack duplicate entries.
for model_dir in (model_fashion_path, model_cifar_path):
    if model_dir not in sys.path:
        sys.path.append(model_dir)

The current path is: /root/GreenAI/Cloud/4090/code
The parent path is: /root/GreenAI/Cloud/4090
The model path is: /root/GreenAI/Cloud/4090/Model


In [3]:
from AlexNet_F import alexnet_f
from GoogLeNet_F import googlenet_f
from ResNet_F import resnet_f
from VGG_F import vgg_f
from AlexNet_C import alexnet_c

In [4]:
# Block configuration handed to vgg_f() below.
# NOTE(review): presumably each pair is (number of conv layers, output channels)
# for one VGG block - confirm against VGG_F.
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
# One network instance per architecture. The `_fashion` / `_cifar` suffix follows
# the `_f` / `_c` suffix of the constructor each instance is built from.
# Keep the construction order as is: no seed is set, so it determines how the
# global RNG is consumed (assuming the constructors initialise weights randomly).
alexnet_fashion = alexnet_f()
googlenet_fashion = googlenet_f()
resnet_fashion = resnet_f()
vgg_fashion = vgg_f(conv_arch)
alexnet_cifar = alexnet_c()

##### Using ptflops to compute the model's MACs (multiply–accumulate operations) and parameter count

In [5]:
# Profile a single network with ptflops: per-layer statistics are printed by
# get_model_complexity_info itself, followed by the two summary lines below.
with torch.cuda.device(0):
    # Network to profile and the (channels, height, width) of its input.
    # To profile one of the FashionMNIST networks instead (alexnet_fashion,
    # resnet_fashion, vgg_fashion, googlenet_fashion), assign it to `net`
    # and set input_res = (1, 224, 224).
    net = alexnet_cifar
    input_res = (3, 224, 224)

    macs, params = get_model_complexity_info(net, input_res, as_strings=True,
                                             print_per_layer_stat=True, verbose=True)

    # Identity check: "is the ResNet instance the one being profiled?".
    # nn.Module does not define __eq__, so `is` states the intent explicitly.
    # NOTE(review): the MPS diagnostics are only printed for ResNet - confirm
    # this is still wanted on a CUDA machine.
    if net is resnet_fashion:
        print(torch.backends.mps.is_built())
        print(torch.backends.mps.is_available())

    print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    print('{:<30}  {:<8}'.format('Number of parameters: ', params))

Sequential(
  47.16 M, 100.000% Params, 1.01 GMac, 99.891% MACs, 
  (0): Conv2d(34.94 k, 0.074% Params, 101.9 MMac, 10.098% MACs, 3, 96, kernel_size=(11, 11), stride=(4, 4), padding=(1, 1))
  (1): ReLU(0, 0.000% Params, 279.94 KMac, 0.028% MACs, )
  (2): MaxPool2d(0, 0.000% Params, 279.94 KMac, 0.028% MACs, kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(614.66 k, 1.303% Params, 415.51 MMac, 41.177% MACs, 96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (4): ReLU(0, 0.000% Params, 173.06 KMac, 0.017% MACs, )
  (5): MaxPool2d(0, 0.000% Params, 173.06 KMac, 0.017% MACs, kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Conv2d(885.12 k, 1.877% Params, 127.46 MMac, 12.631% MACs, 256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): ReLU(0, 0.000% Params, 55.3 KMac, 0.005% MACs, )
  (8): Conv2d(1.33 M, 2.815% Params, 191.16 MMac, 18.944% MACs, 384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (9): R

[**我们构造一个**]高度和宽度都为224的(**单通道数据，来观察每一层输出的形状**)（下面的代码针对CIFAR-100改用三通道输入）。
它与 :numref:`fig_alexnet`中的AlexNet架构相匹配。


In [6]:
# Push one random sample through `net` layer by layer and report the output
# shape after every layer.
# X = torch.randn(1, 1, 224, 224) # FashionMNIST

X = torch.randn(1, 3, 224, 224) # CIFAR100
# Only the shapes are of interest here, so skip autograd bookkeeping: without
# no_grad() every layer would record a graph that is never used.
with torch.no_grad():
    for layer in net:
        X = layer(X)
        print(layer.__class__.__name__,'output shape:\t',X.shape)

Conv2d output shape:	 torch.Size([1, 96, 54, 54])
ReLU output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Conv2d output shape:	 torch.Size([1, 256, 26, 26])
ReLU output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 256, 12, 12])
ReLU output shape:	 torch.Size([1, 256, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 256, 5, 5])
Flatten output shape:	 torch.Size([1, 6400])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1,

## 读取数据集

尽管原文中AlexNet是在ImageNet上进行训练的，但本书在这里使用的是Fashion-MNIST数据集。因为即使在现代GPU上，训练ImageNet模型，同时使其收敛可能需要数小时或数天的时间。
将AlexNet直接应用于Fashion-MNIST的一个问题是，[**Fashion-MNIST图像的分辨率**]（$28 \times 28$像素）(**低于ImageNet图像。**)
为了解决这个问题，(**我们将它们增加到$224 \times 224$**)（通常来讲这不是一个明智的做法，但在这里这样做是为了有效使用AlexNet架构）。
这里需要使用`d2l.load_data_fashion_mnist`函数中的`resize`参数执行此调整；本笔记本实际加载的是CIFAR-100，因此改用下面定义的`load_data_cifar100`函数及其同名的`resize`参数。


In [7]:
def get_dataloader_workers():
    """Return the number of worker processes used by each DataLoader.

    The value is fixed at 4.
    """
    worker_count = 4
    return worker_count
def load_data_cifar100(batch_size, resize=None, root="../data"):
    """Download CIFAR-100 if necessary and wrap it in train/test data loaders.

    Args:
        batch_size: Number of samples per batch, used for both loaders.
        resize: Optional size passed to ``transforms.Resize``; the resize is
            applied before ``ToTensor``. A falsy value skips resizing.
        root: Directory in which torchvision stores / looks up the dataset.
            Defaults to the previously hard-coded ``"../data"``.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles
        its samples, the test loader does not.
    """
    # Build the transform pipeline: optional resize first, then tensor conversion.
    steps = []
    if resize:
        steps.append(transforms.Resize(resize))
    steps.append(transforms.ToTensor())
    trans = transforms.Compose(steps)

    # download=True fetches the archive only when it is not already under `root`.
    cifar_train = torchvision.datasets.CIFAR100(
        root=root, train=True, transform=trans, download=True)
    cifar_test = torchvision.datasets.CIFAR100(
        root=root, train=False, transform=trans, download=True)

    num_workers = get_dataloader_workers()
    train_loader = torch.utils.data.DataLoader(
        cifar_train, batch_size, shuffle=True, num_workers=num_workers)
    test_loader = torch.utils.data.DataLoader(
        cifar_test, batch_size, shuffle=False, num_workers=num_workers)
    return train_loader, test_loader

In [8]:
batch_size = 128

# FashionMNIST alternative:
# train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
train_iter, test_iter = load_data_cifar100(batch_size, resize=224) # CIFAR100

# Number of batches per epoch. len() on the DataLoader gives this directly,
# without decoding and resizing every image just to count the batches.
num_batches = len(train_iter)
list_of_i = list(range(num_batches))
print('the shape of the train_iter is:', (num_batches,))

# Show the tensor shape of the first 10 batches, then stop iterating.
for i, (X, y) in enumerate(train_iter):
    if i >= 10:
        break
    print('the shape of the', i, 'batch of the train_iter is:', X.shape)

Files already downloaded and verified
Files already downloaded and verified
the shape of the train_iter is: (391,)
the shape of the 0 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 1 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 2 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 3 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 4 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 5 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 6 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 7 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 8 batch of the train_iter is: torch.Size([128, 3, 224, 224])
the shape of the 9 batch of the train_iter is: torch.Size([128, 3, 224, 224])


In [9]:
# Results are written below <project root>/Data, one sub-folder per dataset.
data_folder_path = os.path.join(parent_path, 'Data')
print(f'The data folder path is: {data_folder_path}')

# Per-dataset result directories (FashionMNIST first, CIFAR100 second).
fashion_data_path, cifar_data_data = (
    os.path.join(data_folder_path, dataset_name)
    for dataset_name in ('FashionMNIST', 'CIFAR100')
)
print(f'The FashionMNIST train_data dir is: {fashion_data_path}')
print(f'The CIFAR100 train_data dir is: {cifar_data_data}')

The data folder path is: /root/GreenAI/Cloud/4090/Data
The FashionMNIST train_data dir is: /root/GreenAI/Cloud/4090/Data/FashionMNIST
The CIFAR100 train_data dir is: /root/GreenAI/Cloud/4090/Data/CIFAR100


In [10]:
# Dataset whose result directory this run writes into.
current_dataset_path = cifar_data_data
print(f'The current using model data path is: {current_dataset_path}')

# Sub-folder for the architecture being trained.
current_model_path = os.path.join(current_dataset_path, 'alexnet')
print(f'The current using model path is: {current_model_path}')

# Sub-folder for the training pattern; use 'nosync' instead of 'sync' for the
# no-sync runs.
pattern_path = os.path.join(current_model_path, 'sync')
print(f'The current using pattern path is: {pattern_path}')

The current using model data path is: /root/GreenAI/Cloud/4090/Data/CIFAR100
The current using model path is: /root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet
The current using pattern path is: /root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync


In [11]:
# Keep the output directory both as a plain string and as a Path object.
train_data_str = str(pattern_path)
train_data_path = Path(train_data_str)
print(f'The train_data_str is:  {train_data_str}')

The train_data_str is:  /root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync


In [12]:
# Number of repeated training rounds; each round gets its own output folder.
NUM_ROUNDS = 10

# Create round_1 ... round_NUM_ROUNDS (existing folders are left untouched).
round_dirs = [train_data_path / f'round_{i}' for i in range(1, NUM_ROUNDS + 1)]
for dir_path in round_dirs:
    dir_path.mkdir(parents=True, exist_ok=True)

# Use exactly the folders created above, in round order. os.scandir() yields
# entries in arbitrary order and would also pick up any unrelated directory
# that happens to live in train_data_path.
subfolders = [str(dir_path) for dir_path in round_dirs]
print('The folders in the train_data are: ', subfolders)

The folders in the train_data are:  ['/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_1', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_2', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_3', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_4', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_5', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_6', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_7', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_8', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_9', '/root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_10']


In [13]:
lr, num_epochs = 0.01, 10
device = d2l.try_gpu()

# This run trains AlexNet on CIFAR100 with the synchronised trainer.
# For other experiments, call train_func_nosync instead of train_func and/or
# pass one of the FashionMNIST networks (alexnet_fashion, resnet_fashion,
# vgg_fashion, googlenet_fashion) as the first argument; all other arguments
# stay the same.
# NOTE(review): the same alexnet_cifar instance is passed in every round.
# Confirm that train_func re-initialises the weights, otherwise later rounds
# continue from the previous round's parameters.
for subfolder in subfolders:
    working_diri = subfolder
    print('The sub_folder dir is: ', working_diri)

    (Time_AllEpochs, TestAcc, TrainLoss, TrainAcc,
     TimeEpoch, Energy_AllEpochs, TrainTime, TTrainAccLoss) = train_func(
        alexnet_cifar, train_iter, test_iter, num_epochs, lr, device) # CIFAR100 for Alexnet

    # Persist every returned array as <round folder>/<name>.npy.
    round_results = {
        'Time_AllEpochs': Time_AllEpochs,
        'TestAcc': TestAcc,
        'TrainLoss': TrainLoss,
        'TrainAcc': TrainAcc,
        'TimeEpoch': TimeEpoch,
        'Energy_AllEpochs': Energy_AllEpochs,
        'TrainTime': TrainTime,
        'TTrainAccLoss': TTrainAccLoss,
    }
    for metric_name, values in round_results.items():
        np.save(os.path.join(working_diri, f'{metric_name}.npy'), values)
    
    

The sub_folder dir is:  /root/GreenAI/Cloud/4090/Data/CIFAR100/alexnet/sync/round_1


training on cuda:0
epoch 1
Epoch 1 completed: Avg Loss: 4.603879885253907, Avg Accuracy: 0.01072
test acc is 0.014400
epoch 1, time 29.481923 sec
epoch 2
Epoch 2 completed: Avg Loss: 4.585683992919922, Avg Accuracy: 0.01468
test acc is 0.026800
epoch 2, time 57.816724 sec
epoch 3
Epoch 3 completed: Avg Loss: 4.409966425170898, Avg Accuracy: 0.03084
test acc is 0.052200
epoch 3, time 88.968757 sec
epoch 4
Epoch 4 completed: Avg Loss: 4.238674783325195, Avg Accuracy: 0.04892
test acc is 0.081800
epoch 4, time 117.252536 sec
epoch 5
Epoch 5 completed: Avg Loss: 4.104903151245117, Avg Accuracy: 0.06802
test acc is 0.109100
epoch 5, time 144.565315 sec
epoch 6
Epoch 6 completed: Avg Loss: 3.971388832397461, Avg Accuracy: 0.09216
test acc is 0.126700
epoch 6, time 173.976818 sec
epoch 7
Epoch 7 completed: Avg Loss: 3.8511103021240234, Avg Accuracy: 0.11146
test acc is 0.152700
epoch 7, time 202.012824 sec
epoch 8
Epoch 8 completed: Avg Loss: 3.7314320123291016, Avg Accuracy: 0.13234
test acc