In [67]:
import os
import torch
import torch.nn as nn
# import torch.nn.parallel as par
import torch.distributed as dist
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.multiprocessing as mp

import torch.distributed.autograd as dist_autograd
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.optim import DistributedOptimizer
from torch.utils.data.distributed import DistributedSampler

# import argparse
from tqdm import tqdm  # Progress Bar 출력

In [68]:
# parser = argparse.ArgumentParser(description='test parser') 
# parser.add_argument('--rank', type=int, default=0, help='rank')
# parser.add_argument('--world_size', type=int, default=8, help='world_size')
# parser.add_argument('--init_method', type=str, default='tcp://localhost:23456', help='init_method')

# opt = parser.parse_args()  

In [69]:
def setup(rank, world_size):
    # os.environ['MASTER_ADDR'] = '127.0.0.1'
    # os.environ['MASTER_PORT'] = '8080'
    
    # initialize the process group
    dist.init_process_group(
        backend="nccl", 
        init_method='tcp://localhost:23456',
        rank=rank, 
        world_size=world_size
    )
    
    # print(torch.cuda.is_available())
    # print(torch.cuda.get_device_name())
    # print(torch.cuda.device_count())
    # print(dist.get_rank())
    # print(torch.backends.cudnn.enabled)
    # print(torch.__version__)

In [70]:
def cleanup():
    dist.destroy_process_group()

In [71]:
class AlexNet(nn.Module):
    def __init__(self):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),

            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),

            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p = 0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        output = self.classifier(x)
        return output

In [72]:
def train(model, trainloaer, loss_fn, optimizer, device):
    model.train()
    correct = 0
    running_size = 0
    running_loss = 0

    prograss_bar = tqdm(trainloaer)

    for i, data in enumerate(prograss_bar):
        inputs, labels = data[0].to(device), data[1].to(device)

        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        _, pred = output.max(dim=1)
        correct += pred.eq(labels).sum().item()
        
        running_loss += loss.item() * inputs.size(0) # FIXME:
        running_size += inputs.size(0)

    loss = running_loss / running_size
    acc = correct / running_size
    return loss, acc

In [73]:
def test(model, testloader, device):
    model.eval()
    correct = 0

    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = model(images)

            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()

    acc = correct / len(testloader.dataset)
    return acc

In [74]:
def main_worker(rank, world_size):


    # 특정 GPU 번호로 device 설정
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # args = parser.parse_args()
    # print(f"Running basic DDP example on rank {rank}.")
    
    # 데이터셋 불러오기
    transform = transforms.Compose([
        transforms.Resize(size=(227, 227)),
        transforms.ToTensor(), #이미지를 pytorch tensors 타입으로 변형, 0.0~1.0 으로 변환
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # rgb, -1~1로 변환
    ])
    
    rank = 0
    world_size = 8
    batch_size = 64
    setup(rank, world_size)
    
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_sampler = DistributedSampler(trainset)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size/world_size, shuffle=True, sampler=train_sampler)
    
    testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    test_sampler = DistributedSampler(testset)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size/world_size, shuffle=False, sampler=test_sampler)

    # 모델 초기화, ddp, 하이퍼파라미터 설정
    # device_id = rank % torch.cuda.device_count()
    model = AlexNet().to(rank)
    ddp_model = DDP(model, device_ids=[0])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(ddp_model.parameters(), lr=0.001)

    # train, test
    num_epochs = 5

    for epoch in range(num_epochs):
        train_loss, train_acc = train(ddp_model, trainloader, criterion, optimizer, rank)   
        test_acc = test(ddp_model, testloader, rank)

        print(f'epoch {epoch+1:02d}, train loss: {train_loss:.5f}, train acc: {train_acc:.5f}, test accuracy: {test_acc:.5f}')
    
    cleanup()

In [75]:
def main():     
    # world_size = torch.cuda.device_count()
    world_size = 1
    mp.spawn(main_worker, nprocs=world_size, args=(world_size,), join=True)

In [76]:
if __name__ == '__main__':
    main()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/eunjin/anaconda3/envs/study/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/eunjin/anaconda3/envs/study/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'main_worker' on <module '__main__' (built-in)>


ProcessExitedException: process 0 terminated with exit code 1

In [None]:
import torch

print(torch.cuda.device_count())

8
