# NNI NAS实践

>参考文档：https://nni.readthedocs.io/en/latest/tutorials/hello_nas.html

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import nni
from nni.nas.nn.pytorch import LayerChoice, ModelSpace, MutableDropout, MutableLinear

## 定义模型变体

假设基础模型定义如下：

In [2]:
class Net(ModelSpace):  # should inherit ModelSpace rather than nn.Module
    """Fixed baseline CNN: two convolutions, one max-pool, two linear layers.

    Every layer here is a concrete ``torch.nn`` module, so this class describes
    exactly one architecture. The classifier emits log-probabilities over 10
    classes.
    """

    def __init__(self):
        super().__init__()
        # Convolutional stem: 1 -> 32 -> 64 channels, 3x3 kernels, stride 1.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 9216 == 64 * 12 * 12; presumably sized for 1x28x28 inputs -- TODO confirm.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(self.conv2(activated), 2)
        # Dropout is applied before flattening everything except the batch dim.
        flat = torch.flatten(self.dropout1(pooled), 1)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)

上面的基础模型只是一个具体的模型，而不是模型空间。NNI 提供了 API 和原语，用来描述基础模型可以如何变化，从而构建一个包含多个候选模型的模型空间。基于上述基础模型，可以定义如下的模型空间。

In [3]:
class DepthwiseSeparableConv(nn.Module):
    """Depthwise 3x3 convolution followed by a pointwise 1x1 convolution.

    Args:
        in_ch: Number of input channels; also used as the group count of the
            depthwise convolution, so each channel is filtered separately.
        out_ch: Number of output channels produced by the pointwise convolution.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        # groups == in_channels makes this convolution operate per channel.
        self.depthwise = nn.Conv2d(
            in_channels=in_ch,
            out_channels=in_ch,
            kernel_size=3,
            groups=in_ch,
        )
        # The 1x1 convolution mixes channels and maps in_ch -> out_ch.
        self.pointwise = nn.Conv2d(
            in_channels=in_ch,
            out_channels=out_ch,
            kernel_size=1,
        )

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise convolution."""
        per_channel = self.depthwise(x)
        return self.pointwise(per_channel)


class MyModelSpace(ModelSpace):
    """Model space derived from the baseline CNN with three searchable choices.

    Choices (by label):
        ``conv2``: a regular ``nn.Conv2d(32, 64, 3, 1)`` or a
            ``DepthwiseSeparableConv(32, 64)``.
        ``dropout``: rate of the first dropout layer, one of 0.25, 0.5, 0.75.
        ``feature``: width of the hidden linear layer, one of 64, 128, 256.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        # LayerChoice is used to select a layer between Conv2d and DwConv.
        conv2_candidates = [
            nn.Conv2d(32, 64, 3, 1),
            DepthwiseSeparableConv(32, 64),
        ]
        self.conv2 = LayerChoice(conv2_candidates, label='conv2')
        # nni.choice is used to select a dropout rate.
        # The result can be used as parameters of `MutableXXX`.
        dropout_rate = nni.choice('dropout', [0.25, 0.5, 0.75])  # choose dropout rate from 0.25, 0.5 and 0.75
        self.dropout1 = MutableDropout(dropout_rate)
        self.dropout2 = nn.Dropout(p=0.5)
        # The same choice object feeds fc1's output size and fc2's input size,
        # so both layers always agree on the hidden width.
        hidden_width = nni.choice('feature', [64, 128, 256])
        self.fc1 = MutableLinear(9216, hidden_width)
        self.fc2 = MutableLinear(hidden_width, 10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(self.conv2(activated), 2)
        flat = torch.flatten(self.dropout1(pooled), 1)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)


# Build the model space once; it is handed to NasExperiment later in the notebook.
model_space = MyModelSpace()
# Bare expression as the last line of the cell: shows the module repr as the cell output.
model_space

MyModelSpace(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): LayerChoice(
    label='conv2'
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): DepthwiseSeparableConv(
      (depthwise): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), groups=32)
      (pointwise): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (dropout1): MutableDropout(p=Categorical([0.25, 0.5, 0.75], label='dropout'))
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): MutableLinear(in_features=9216, out_features=Categorical([64, 128, 256], label='feature'))
  (fc2): MutableLinear(in_features=Categorical([64, 128, 256], label='feature'), out_features=10)
)

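`LayerChoice`（从 `nni.nas.nn.pytorch` 导入，而不是 `torch.nn`）接收一个候选模块列表（本例中为两个），每个采样得到的模型会从中选择一个。它可以像普通 PyTorch 模块一样使用。`nni.choice()` 的返回值作为 `MutableDropout` 的参数，即从候选值中选出的 dropout 率；同一个 `feature` 选择同时用于 `fc1` 的输出维度和 `fc2` 的输入维度，从而保证两层的形状一致。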

## 选择探索策略

In [4]:
import nni.nas.strategy as strategy
# Exploration strategy passed to NasExperiment below; uses NNI's Random strategy.
search_strategy = strategy.Random()  # dedup=False if deduplication is not wanted

In [5]:
import nni

from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader


def train_epoch(model, device, train_loader, optimizer, epoch):
    """Run one training pass over ``train_loader`` and log the loss periodically.

    Args:
        model: Module to train; switched to training mode by this function.
        device: Device that each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches; ``len()`` of the
            loader and of its ``dataset`` attribute are used for progress logging.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index, used only in the log line.
    """
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(inputs)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        # Only every 10th batch (starting with the first) is logged.
        if step % 10 != 0:
            continue
        samples_seen = step * len(inputs)
        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))


def test_epoch(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
          correct, len(test_loader.dataset), accuracy))

    return accuracy


def evaluate_model(model):
    """Train ``model`` on MNIST for 3 epochs and report test accuracy to NNI.

    After each epoch the test accuracy is sent with
    ``nni.report_intermediate_result``; the accuracy of the last epoch is sent
    with ``nni.report_final_result``. MNIST is read from (and downloaded to)
    ``data/mnist``.

    Args:
        model: The model to train and evaluate; moved to CUDA when available,
            otherwise kept on CPU.
    """
    # By v3.0, the model will be instantiated by default.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model.to(device)

    # The optimizer is created after the model has been moved to its device.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Convert images to tensors, then normalize with mean 0.1307 and std 0.3081.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_set = MNIST('data/mnist', download=True, transform=preprocess)
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    test_set = MNIST('data/mnist', download=True, train=False, transform=preprocess)
    test_loader = DataLoader(test_set, batch_size=64)

    for epoch in range(3):
        # One pass over the training data, then one pass over the test data.
        train_epoch(model, device, train_loader, optimizer, epoch)
        accuracy = test_epoch(model, device, test_loader)
        # Intermediate result for this epoch. Result can be float or dict.
        nni.report_intermediate_result(accuracy)

    # The final result is the accuracy measured after the last epoch.
    nni.report_final_result(accuracy)

In [6]:
from nni.nas.evaluator import FunctionalEvaluator
# Wrap the plain function evaluate_model so it can be passed to NasExperiment as the evaluator.
evaluator = FunctionalEvaluator(evaluate_model)

## 启动实验

In [7]:
from nni.nas.experiment import NasExperiment
# Combine the three pieces defined above: model space, evaluator and search strategy.
exp = NasExperiment(model_space, evaluator, search_strategy)

[2024-09-23 23:59:24] [32mConfig is not provided. Will try to infer.[0m
[2024-09-23 23:59:24] [32mUsing execution engine based on training service. Trial concurrency is set to 1.[0m
[2024-09-23 23:59:24] [32mUsing simplified model format.[0m
[2024-09-23 23:59:24] [32mUsing local training service.[0m


In [8]:
# Experiment limits; these must be set before exp.run() is called.
exp.config.max_trial_number = 3   # spawn 3 trials at most
exp.config.trial_concurrency = 1  # will run 1 trial concurrently
# NOTE(review): evaluate_model still picks CUDA when torch.cuda.is_available();
# confirm whether NNI hides GPUs from trials when this is 0.
exp.config.trial_gpu_number = 0   # will not use GPU

NAS 实验的可视化方式与普通超参数调优实验相同。例如，在浏览器中打开 localhost:8083，其中 8083 是在 `exp.run` 中设置的端口。

In [9]:
# Start the experiment; the web portal is served on the given port.
exp.run(port=8083) # after launching, you need to wait a while

[2024-09-23 23:59:25] [32mCreating experiment, Experiment ID: [36m0ixvwofj[0m
[2024-09-23 23:59:25] [32mStarting web server...[0m
[2024-09-23 23:59:25] [31mERROR: rest request GET http://localhost:8083/api/v1/nni/check-status failed: 502 [0m
[2024-09-23 23:59:26] [32mSetting up...[0m
[2024-09-23 23:59:26] [32mWeb portal URLs: [36mhttp://127.0.0.1:8083 http://10.109.253.71:8083 http://172.21.0.1:8083 http://172.17.0.1:8083 http://172.20.0.1:8083[0m
[2024-09-23 23:59:26] [32mSuccessfully update searchSpace.[0m
[2024-09-23 23:59:26] [32mCheckpoint saved to /home/chenyuli/nni-experiments/0ixvwofj/checkpoint.[0m
[2024-09-23 23:59:26] [32mExperiment initialized successfully. Starting exploration strategy...[0m
[2024-09-24 00:03:13] [32mWaiting for models submitted to engine to finish...[0m
[2024-09-24 00:04:29] [32mExperiment is completed.[0m
[2024-09-24 00:04:29] [32mSearch process is done. You can put an `time.sleep(FOREVER)` here to block the process if you want to 

True

## 导出顶级模型

In [11]:
# Print each exported top model; with formatter='dict' every entry maps a
# choice label (e.g. 'conv2', 'dropout', 'feature') to the selected value.
for model_dict in exp.export_top_models(formatter='dict'):
    print(model_dict)

{'conv2': 0, 'dropout': 0.75, 'feature': 256}
