In [1]:
import torch
from torch import nn
import torchvision
import torchvision.transforms as transforms
# from alexnet_FashionMnist import FashionMnist
# from alexnet_CIFAR100 import CIFAR100
# from resnet_FashionMnist import resnet_FashionMnist
# from googlenet_FashionMnist import Googlenet
# from vgg_FashionMnist import vgg
from d2l import torch as d2l
import numpy as np
import pandas as pd
from ptflops import get_model_complexity_info
from train import train_func
from train_nosync import train_func_nosync
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os

In [2]:
# Locate the project's Model directory and make its sub-folders importable.

# Directory the notebook kernel is currently running in.
current_path = os.getcwd()
print('The current path is:', current_path)

# Project root: one level above the working directory.
parent_path = Path(current_path).parent
print('The parent path is:', parent_path)

# Folder that holds the model definitions.
model_folder_path = os.path.join(parent_path, 'Model')
print('The model path is:', model_folder_path)

# Per-dataset sub-folders of the model folder.
model_fashion_path = os.path.join(model_folder_path, 'FashionMNIST')
model_cifar_path = os.path.join(model_folder_path, 'CIFAR100')

# Register both folders on sys.path so modules inside them can be imported.
# Entries already present are skipped: re-running this cell would otherwise
# append the same directories again on every execution.
for model_path in (model_fashion_path, model_cifar_path):
    if model_path not in sys.path:
        sys.path.append(model_path)

The current path is: /root/GreenAI/Cloud/4090/code
The parent path is: /root/GreenAI/Cloud/4090
The model path is: /root/GreenAI/Cloud/4090/Model


In [3]:
from AlexNet_F import alexnet_f
from GoogLeNet_F import googlenet_f
from ResNet_F import resnet_f
from VGG_F import vgg_f
from AlexNet_C import alexnet_c

In [4]:
# Block specification handed to vgg_f: five (a, b) pairs. Judging by the
# ptflops summary printed by the profiling cell, each pair looks like
# (number of conv layers, output channels) for one block -- TODO confirm
# against the vgg_f definition.
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
# Build one instance of every network used in this notebook. The order is
# kept as-is: if the constructors draw random initial weights, reordering
# them would change which random numbers each model receives.
alexnet_fashion = alexnet_f()
googlenet_fashion = googlenet_f()
resnet_fashion = resnet_f()
vgg_fashion = vgg_f(conv_arch)  # the only constructor that takes an argument
alexnet_cifar = alexnet_c()

##### Use ptflops to count the model's multiply-accumulate operations (MACs) and parameters

In [5]:
# Models that can be profiled, each paired with the (channels, height, width)
# input resolution passed to ptflops. Pick one by key instead of
# (un)commenting near-identical blocks.
profile_candidates = {
    'alexnet_fashion': (alexnet_fashion, (1, 224, 224)),
    'alexnet_cifar': (alexnet_cifar, (3, 224, 224)),
    'resnet_fashion': (resnet_fashion, (1, 224, 224)),
    'vgg_fashion': (vgg_fashion, (1, 224, 224)),
    'googlenet_fashion': (googlenet_fashion, (1, 224, 224)),
}

# `net` is reused by the shape-inspection cell further down.
net, profile_input_res = profile_candidates['vgg_fashion']

# Make CUDA device 0 the current device while ptflops runs.
with torch.cuda.device(0):
    # With as_strings=True ptflops returns human-readable strings; the
    # per-layer table is printed as a side effect of the call.
    macs, params = get_model_complexity_info(net, profile_input_res, as_strings=True,
                                            print_per_layer_stat=True, verbose=True)

    # Identity check: we want to know whether the selected object *is* the
    # ResNet instance, not whether some __eq__ considers the two equal.
    if net is resnet_fashion:
        print(torch.backends.mps.is_built())
        print(torch.backends.mps.is_available())

    print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
    print('{:<30}  {:<8}'.format('Number of parameters: ', params))

Sequential(
  128.81 M, 100.000% Params, 7.57 GMac, 99.821% MACs, 
  (0): Sequential(
    640, 0.000% Params, 38.54 MMac, 0.508% MACs, 
    (0): Conv2d(640, 0.000% Params, 32.11 MMac, 0.424% MACs, 1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(0, 0.000% Params, 3.21 MMac, 0.042% MACs, )
    (2): MaxPool2d(0, 0.000% Params, 3.21 MMac, 0.042% MACs, kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (1): Sequential(
    73.86 k, 0.057% Params, 929.66 MMac, 12.262% MACs, 
    (0): Conv2d(73.86 k, 0.057% Params, 926.45 MMac, 12.219% MACs, 64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(0, 0.000% Params, 1.61 MMac, 0.021% MACs, )
    (2): MaxPool2d(0, 0.000% Params, 1.61 MMac, 0.021% MACs, kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (2): Sequential(
    885.25 k, 0.687% Params, 2.78 GMac, 36.648% MACs, 
    (0): Conv2d(295.17 k, 0.229% Params, 925.65 MMac, 12.209% MACs, 128, 256, kernel_size=(3,

我们构造一个高度和宽度都为224的单通道数据，来观察每一层输出的形状。
输出的各层形状应与上面选中的网络`net`的架构相匹配（原文此处指AlexNet架构，见图`fig_alexnet`；本次运行选中的是VGG）。


In [6]:
# Push one random dummy image through `net`, one top-level child at a time,
# and print the output shape after each child.
X = torch.randn(1, 1, 224, 224)  # FashionMNIST: single-channel 224x224 input
# X = torch.randn(1, 3, 224, 224)  # CIFAR100: three-channel 224x224 input

# Only the shapes are inspected here, so autograd is switched off to avoid
# recording a computation graph for every layer.
with torch.no_grad():
    for layer in net:
        X = layer(X)
        print(layer.__class__.__name__, 'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 112, 112])
Sequential output shape:	 torch.Size([1, 128, 56, 56])
Sequential output shape:	 torch.Size([1, 256, 28, 28])
Sequential output shape:	 torch.Size([1, 512, 14, 14])
Sequential output shape:	 torch.Size([1, 512, 7, 7])
Flatten output shape:	 torch.Size([1, 25088])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 10])


## 读取数据集

尽管原文中AlexNet是在ImageNet上进行训练的，但本书在这里使用的是Fashion-MNIST数据集。因为即使在现代GPU上，训练ImageNet模型，同时使其收敛可能需要数小时或数天的时间。
将AlexNet直接应用于Fashion-MNIST的一个问题是，Fashion-MNIST图像的分辨率（$28 \times 28$像素）低于ImageNet图像。
为了解决这个问题，我们将它们增加到$224 \times 224$（通常来讲这不是一个明智的做法，但在这里这样做是为了有效使用AlexNet架构）。
这里需要使用`d2l.load_data_fashion_mnist`函数中的`resize`参数执行此调整。


In [7]:
def get_dataloader_workers():
    """Return how many worker processes the data loaders should use.

    The value is fixed at 4.
    """
    worker_count = 4
    return worker_count
def load_data_cifar100(batch_size, resize=None, root="../data"):
    """Download the CIFAR-100 dataset and wrap it in train/test data loaders.

    Args:
        batch_size: Batch size used for both loaders.
        resize: Optional size passed to ``transforms.Resize``. When falsy,
            no resize step is added to the transform pipeline.
        root: Directory handed to ``torchvision.datasets.CIFAR100`` as the
            dataset root. Defaults to ``"../data"``, the previously
            hard-coded location.

    Returns:
        A ``(train_loader, test_loader)`` tuple of
        ``torch.utils.data.DataLoader`` objects. The training loader is
        created with ``shuffle=True``, the test loader with ``shuffle=False``.
    """
    # When requested, the resize step is placed ahead of ToTensor.
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    trans = transforms.Compose(steps)

    # download=True is passed for both splits, as before.
    cifar_train = torchvision.datasets.CIFAR100(
        root=root, train=True, transform=trans, download=True)
    cifar_test = torchvision.datasets.CIFAR100(
        root=root, train=False, transform=trans, download=True)

    train_loader = torch.utils.data.DataLoader(
        cifar_train, batch_size, shuffle=True,
        num_workers=get_dataloader_workers())
    test_loader = torch.utils.data.DataLoader(
        cifar_test, batch_size, shuffle=False,
        num_workers=get_dataloader_workers())
    return (train_loader, test_loader)

In [8]:
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)  # FashionMNIST
# train_iter, test_iter = load_data_cifar100(batch_size, resize=224)  # CIFAR100

# Number of batches per epoch. len() on the loader gives this directly, so the
# whole training set no longer has to be loaded and resized just to count.
# list_of_i keeps its previous contents: the batch indices 0 .. n-1.
list_of_i = list(range(len(train_iter)))
print('the shape of the train_iter is:', np.array(list_of_i).shape)

# Print the tensor shape of the first 10 training batches, then stop.
for i, (X, y) in enumerate(train_iter):
    if i >= 10:
        break
    print('the shape of the', i, 'batch of the train_iter is:', X.shape)

the shape of the train_iter is: (469,)
the shape of the 0 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 1 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 2 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 3 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 4 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 5 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 6 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 7 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 8 batch of the train_iter is: torch.Size([128, 1, 224, 224])
the shape of the 9 batch of the train_iter is: torch.Size([128, 1, 224, 224])


In [9]:
# save the data to data folder
data_folder_path = os.path.join(parent_path, 'Data')
print('The data folder path is:', data_folder_path)

# find out the train_data path
fashion_data_path = os.path.join(data_folder_path, 'FashionMNIST')
cifar_data_data = os.path.join(data_folder_path, 'CIFAR100')
print('The FashionMNIST train_data dir is:', fashion_data_path)
print('The CIFAR100 train_data dir is:', cifar_data_data)

The data folder path is: /root/GreenAI/Cloud/4090/Data
The FashionMNIST train_data dir is: /root/GreenAI/Cloud/4090/Data/FashionMNIST
The CIFAR100 train_data dir is: /root/GreenAI/Cloud/4090/Data/CIFAR100


In [10]:
# find the current using data path
current_dataset_path = fashion_data_path
# current_dataset = cifar_train_data
print('The current using model data path is:', current_dataset_path)

# find the current using model
current_model_path = os.path.join(current_dataset_path, 'vgg')
print('The current using model path is:', current_model_path)

# find the train pattern, sync or nosync
pattern_path = os.path.join(current_model_path, 'sync')
# pattern_path = os.path.join(current_model_path, 'nosync')
print('The current using pattern path is:', pattern_path)

The current using model data path is: /root/GreenAI/Cloud/4090/Data/FashionMNIST
The current using model path is: /root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg
The current using pattern path is: /root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync


In [11]:
# Keep the results directory in two forms: a Path object for the pathlib
# operations that follow, and a plain string for display.
train_data_path = Path(str(pattern_path))
train_data_str = str(pattern_path)
print('The train_data_str is: ', train_data_str)

The train_data_str is:  /root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync


In [12]:
# Create one output directory per repetition: round_1 ... round_10.
# exist_ok=True makes this safe to re-run.
for i in range(1, 11):
    dir_path = train_data_path / f'round_{i}'
    dir_path.mkdir(parents=True, exist_ok=True)

# Collect every sub-directory of the results folder.
# os.scandir yields entries in arbitrary order, so sort them to process the
# rounds deterministically. Ordering by (length, text) places round_2 before
# round_10, which a plain string sort would not. The `with` block closes the
# directory iterator as soon as the listing is done.
with os.scandir(train_data_path) as entries:
    subfolders = sorted(
        (entry.path for entry in entries if entry.is_dir()),
        key=lambda folder: (len(folder), folder),
    )
print('The folders in the train_data are: ', subfolders)

The folders in the train_data are:  ['/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_1', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_2', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_3', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_4', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_5', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_6', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_7', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_8', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_9', '/root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_10']


In [13]:
lr, num_epochs = 0.01, 10
device = d2l.try_gpu()

# File stems of the saved arrays, in the order train_func returns its results.
metric_names = (
    'Time_AllEpochs', 'TestAcc', 'TrainLoss', 'TrainAcc',
    'TimeEpoch', 'Energy_AllEpochs', 'TrainTime', 'TTrainAccLoss',
)

# One full training run per round directory.
# To benchmark another configuration, swap the model argument
# (alexnet_fashion, resnet_fashion, googlenet_fashion, alexnet_cifar) and/or
# call train_func_nosync instead of train_func.
# NOTE(review): the same vgg_fashion instance is passed in every round; unless
# train_func re-initialises the weights, later rounds start from an already
# trained model -- confirm against train_func.
for subfolder in subfolders:
    working_diri = subfolder
    print('The sub_folder dir is: ', working_diri)

    (Time_AllEpochs, TestAcc, TrainLoss, TrainAcc,
     TimeEpoch, Energy_AllEpochs, TrainTime, TTrainAccLoss) = train_func(
        vgg_fashion, train_iter, test_iter, num_epochs, lr, device)  # FashionMNIST for VGG

    # Save each returned object as <metric name>.npy inside the round directory.
    metric_values = (
        Time_AllEpochs, TestAcc, TrainLoss, TrainAcc,
        TimeEpoch, Energy_AllEpochs, TrainTime, TTrainAccLoss,
    )
    for metric_name, metric_value in zip(metric_names, metric_values):
        np.save(os.path.join(working_diri, metric_name + '.npy'), metric_value)
    
    

The sub_folder dir is:  /root/GreenAI/Cloud/4090/Data/FashionMNIST/vgg/sync/round_1


training on cuda:0
epoch 1
Epoch 1 completed: Avg Loss: 1.7167102532704672, Avg Accuracy: 0.3792333333333333
test acc is 0.699400
epoch 1, time 90.839314 sec
epoch 2
Epoch 2 completed: Avg Loss: 0.6445926860173543, Avg Accuracy: 0.7570666666666667
test acc is 0.819400
epoch 2, time 180.664310 sec
epoch 3
Epoch 3 completed: Avg Loss: 0.49773944390614827, Avg Accuracy: 0.8159833333333333
test acc is 0.819700
epoch 3, time 271.087088 sec
epoch 4
Epoch 4 completed: Avg Loss: 0.4343259724934896, Avg Accuracy: 0.8393666666666667
test acc is 0.857800
epoch 4, time 361.661833 sec
epoch 5
Epoch 5 completed: Avg Loss: 0.3919973711649577, Avg Accuracy: 0.8549
test acc is 0.862100
epoch 5, time 451.788730 sec
epoch 6
Epoch 6 completed: Avg Loss: 0.3653864227294922, Avg Accuracy: 0.8656333333333334
test acc is 0.869100
epoch 6, time 542.123398 sec
epoch 7
Epoch 7 completed: Avg Loss: 0.34253708985646564, Avg Accuracy: 0.87495
test acc is 0.871900
epoch 7, time 632.474572 sec
epoch 8
Epoch 8 complet