# CNN Part 3: Introduction to DDP with PyTorch

Check for GPU availability.

In [None]:
import torch

In [None]:
# Report whether PyTorch can see a usable CUDA device.
print(f'GPUs are available = {torch.cuda.is_available()}')

In [None]:
# Report how many CUDA devices PyTorch can see.
print(f'The number of GPUs available are {torch.cuda.device_count()}')

In [None]:
# torch.cuda.current_device() raises when no CUDA device is usable,
# so only query it when CUDA is actually available.
if torch.cuda.is_available():
    print('You are currently using GPU with local rank = {}'.format(torch.cuda.current_device()))
else:
    print('No GPU is available; running on CPU.')

## Introduction to DDP with PyTorch

### Create Process Group 

In [None]:
import torch.distributed as dist
def init_distributed(local_rank, world_size, master_addr='localhost', master_port='12355'):
    '''
    Join the default process group for multi-GPU training on one node.

    local_rank:  identifier for a particular GPU on one node; also used as the process rank
    world_size:  total number of processes in the group
    master_addr: IP address / host name of the rank 0 process (default 'localhost')
    master_port: free port used to communicate amongst processes (default '12355')
    '''
    # Imported locally so this cell also works when it is run before the
    # notebook's main import cell.
    import os

    # Rendezvous information read by init_process_group; environment values must be strings.
    os.environ['MASTER_ADDR'] = str(master_addr)
    os.environ['MASTER_PORT'] = str(master_port)

    # Bind this process to its GPU before the process group is created.
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl",                   # backend; nccl is typically used for distributed GPU training
                            rank=local_rank,          # rank of the current process
                            world_size=world_size)    # total number of processes

### Create a DataLoader with a DistributedSampler

In [None]:
from torch.utils.data.distributed import DistributedSampler

def load_dataset(train_dataset, batch_size=32):
    '''
    Wrap a dataset in a DataLoader that draws its indices from a DistributedSampler.

    train_dataset: dataset to load training samples from
    batch_size:    number of samples per batch in each process (default 32)

    The sampler is created without num_replicas/rank, so it looks them up from the
    current process group; the group has to be initialised before calling this.
    '''
    train_data = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        #################################################
        shuffle=False,                             # must stay False: ordering is delegated to the sampler
        sampler=DistributedSampler(train_dataset), # passing the distributed sampler
        ################################################
    )
    return train_data

## MNIST Example

In [None]:
import os
import torch 
from torch.utils.data import DataLoader
import numpy as np 
import matplotlib.pyplot as plt
import torchvision
import torch.nn as nn
import torch.nn.functional as F
%matplotlib inline

### Non-Distributed Code

#### Get Data
Load the MNIST dataset and pass it to a dataloader.

In [None]:
def prepare_data(batch_size=32):
    '''
    Download MNIST if necessary and wrap the training split in a DataLoader.

    batch_size: number of images per batch (default 32)

    Returns a DataLoader that yields (image, label) batches in the dataset's
    stored order (shuffle is disabled).
    '''
    # SCRATCH may not be set in every environment; fall back to the current
    # working directory instead of failing with a KeyError.
    data_root = os.path.join(os.environ.get('SCRATCH', os.getcwd()), "data")

    # download MNIST dataset
    trainset = torchvision.datasets.MNIST(
                            root=data_root,                                     # path to where data is stored
                            train=True,                                         # use the training split
                            download=True,                                      # downloads data if not available at root
                            transform=torchvision.transforms.ToTensor()         # converts each image to a tensor (labels are not transformed)
                            )
    # pass dataset to the dataloader
    train_dataloader = DataLoader(trainset,
                                  shuffle=False,
                                  batch_size=batch_size)

    return train_dataloader

trainloader = prepare_data(batch_size=32)

Visualize a few images from the MNIST dataset.

In [None]:
def imshow(img):
    '''Display an image tensor whose first axis is the channel axis.'''
    # move axis 0 to the end before handing the array to matplotlib
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


# take the first batch from the loader (it was built with shuffle=False, so this is not a random sample)
dataiter = iter(trainloader)
images, labels = next(dataiter)

# tile the batch into a single image grid and display it
imshow(torchvision.utils.make_grid(images))

#### Build network

In [None]:
class Net(nn.Module):
    '''Fully connected classifier: flatten -> 784x128 linear -> ReLU -> dropout -> 128x10 linear.'''

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # layers are created in the same order as they are applied
        hidden_layers = [
            nn.Linear(28*28, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.linear_relu_stack = nn.Sequential(*hidden_layers, nn.Linear(128, 10))

    def forward(self, x):
        '''Flatten each sample in the batch and return the output of the linear stack.'''
        return self.linear_relu_stack(self.flatten(x))

#### Train Model 

In [None]:
def train_loop(device, dataloader, model, loss_fn, optimizer):
    '''
    Run one pass over `dataloader`, taking an optimizer step after every batch.

    device:     device each batch is moved to before the forward pass
    dataloader: iterable of (inputs, targets) batches exposing a `.dataset`
    model:      module being trained
    loss_fn:    callable computing the loss from (predictions, targets)
    optimizer:  optimizer holding the model's parameters

    Prints the loss and the number of samples seen every 1000 batches.
    '''
    size = len(dataloader.dataset)
    for step, (inputs, targets) in enumerate(dataloader):
        # move the batch to the target device
        inputs, targets = inputs.to(device), targets.to(device)

        # forward pass
        batch_loss = loss_fn(model(inputs), targets)

        # backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # only report progress on every 1000th batch
        if step % 1000 != 0:
            continue
        loss, current = batch_loss.item(), step * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def main(device):
    '''
    Train a new `Net` on MNIST for three epochs and return it.

    device: device the model and every batch are moved to
    '''
    epochs = 3

    # data first, then the model that will consume it
    train_dataloader = prepare_data(batch_size=4)
    model = Net().to(device)

    # cross-entropy loss optimised with Adam
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # one call to train_loop per epoch
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(device, train_dataloader, model, loss_fn, optimizer)

    print("Done!")
    return model

Train the model by calling `main`.

In [None]:
# Use the first GPU when CUDA is available, otherwise train on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
model = main(device)

### Distributed Code for Multiple GPUs on One Node.

In [None]:
#################################################
# 1. Create a process group (function)
def init_distributed(local_rank, world_size, master_addr='localhost', master_port='12355'):
    '''
    Join the default process group for multi-GPU training on one node.

    local_rank:  identifier for a particular GPU on one node; also used as the process rank
    world_size:  total number of processes in the group
    master_addr: IP address / host name of the rank 0 process (default 'localhost')
    master_port: free port used to communicate amongst processes (default '12355')
    '''
    # Rendezvous information read by init_process_group; environment values must be strings.
    os.environ['MASTER_ADDR'] = str(master_addr)
    os.environ['MASTER_PORT'] = str(master_port)

    # Bind this process to its GPU before the process group is created.
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl",                   # backend; nccl is typically used for distributed GPU training
                            rank=local_rank,          # rank of the current process
                            world_size=world_size)    # total number of processes
#################################################  
    
def prepare_data(local_rank, world_size, batch_size=32):
    '''
    Download MNIST if necessary and build this process's training DataLoader.

    local_rank: rank passed to the DistributedSampler for this process
    world_size: number of replicas passed to the DistributedSampler
    batch_size: number of images per batch in this process (default 32)

    Returns a DataLoader whose indices come from a DistributedSampler.
    '''
    # SCRATCH may not be set in every environment; fall back to the current
    # working directory instead of failing with a KeyError.
    data_root = os.path.join(os.environ.get('SCRATCH', os.getcwd()), "data")

    trainset = torchvision.datasets.MNIST(
                            root=data_root,                                     # path to where data is stored
                            train=True,                                         # use the training split
                            download=True,                                      # downloads data if not available at root
                            transform=torchvision.transforms.ToTensor()         # converts each image to a tensor (labels are not transformed)
                            )

    # pass data to the distributed sampler and dataloader
    train_dataloader = DataLoader(trainset,
                                  ################################################
                                  # 2. Setup Dataloader with Distributed Sampler
                                  shuffle=False,   # must stay False: ordering is delegated to the sampler
                                  sampler=DistributedSampler(trainset, num_replicas=world_size, rank=local_rank),
                                  ################################################
                                  batch_size=batch_size)

    return train_dataloader

# training loop for one epoch
def train_loop(local_rank, dataloader, model, loss_fn, optimizer):
    '''
    Run one pass over this process's share of the data, stepping the optimizer per batch.

    local_rank: GPU index each batch is moved to; rank 0 is the only one that prints
    dataloader: DataLoader yielding (inputs, targets) batches
    model:      module being trained
    loss_fn:    callable computing the loss from (predictions, targets)
    optimizer:  optimizer holding the model's parameters
    '''
    # Progress is reported against the number of samples this process iterates
    # (the sampler's length), not the full dataset, so the counter can reach the total.
    size = len(dataloader.sampler)
    for batch, (X, y) in enumerate(dataloader):
        # transfer data to this process's GPU
        X = X.to(local_rank)
        y = y.to(local_rank)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ################################################
        # 4. Only write/print model information on one GPU
        if local_rank == 0 and batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        ################################################

def main(local_rank, world_size):
    '''
    Train `Net` on MNIST with DistributedDataParallel in one process of the group.

    local_rank: GPU index / rank of this process
    world_size: total number of processes in the group

    Returns the DDP-wrapped model after training. The process group is destroyed
    before returning, including when training raises.
    '''
    # Imported locally because `DDP` is not otherwise brought into scope before
    # this function runs.
    from torch.nn.parallel import DistributedDataParallel as DDP

    ################################################
    # 1. Set up Process Group
    init_distributed(local_rank, world_size)
    ################################################

    try:
        ################################################
        # 2. Setup Dataloader with Distributed Sampler
        train_dataloader = prepare_data(local_rank, world_size)
        ################################################

        ################################################
        # 3. Wrap Model with DDP
        model = DDP(Net().to(local_rank),
            device_ids=[local_rank],                  # list of gpu that model lives on
            output_device=local_rank,                 # where to output model
        )
        ################################################

        # instantiate loss and optimizer
        loss_fn = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train Model
        epochs = 10
        for t in range(epochs):
            # Re-seed the sampler's shuffle for this epoch; without this call the
            # DistributedSampler produces the same ordering every epoch.
            train_dataloader.sampler.set_epoch(t)

            # 4. Only write/print on one GPU
            if local_rank == 0:
                print(f"Epoch {t+1}\n-------------------------------")
            train_loop(local_rank, train_dataloader, model, loss_fn, optimizer)
    finally:
        #################################################
        # 5. Close Process Group (also on failure, so the group is not leaked)
        dist.destroy_process_group()
        #################################################

    if local_rank == 0:
        print("Done!")
    return model

In [None]:
%%javascript
// Stop the classic Notebook front end from collapsing long cell output into a
// scroll box. Front ends that do not define a global `IPython` object would
// otherwise raise a ReferenceError here, so the override is skipped for them.
if (typeof IPython !== 'undefined' && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}

In [1]:
from pathlib import Path
from IPython import display

`display.Code` displays `cnn_part3/mnist_parallel.py` with Python syntax highlighting as the cell output.

In [2]:
# Run this cell to show the script's source with syntax highlighting.
# The Code object is the cell's last expression, so it is rendered as the output.
display.Code("cnn_part3/mnist_parallel.py")

Run `cnn_part3/mnist_parallel.py` below.

In [None]:
# Run the script with python3 through the shell; its output is shown below this cell.
!python3 cnn_part3/mnist_parallel.py

## Additional Exercise

There is a script called `cnn_part3/simple_linear_regression_serial.py` that implements a simple linear regression model with PyTorch. Below, the script is modified to run on multiple GPUs on one node using PyTorch's DDP. 

In [3]:
# Run this cell to show the serial script's source with syntax highlighting.
# The Code object is the cell's last expression, so it is rendered as the output.
display.Code("cnn_part3/simple_linear_regression_serial.py")

In [None]:
# Run the serial script with python3 through the shell; its output is shown below this cell.
!python3 cnn_part3/simple_linear_regression_serial.py

`cnn_part3/simple_linear_regression_parallel.py` is the version of that script modified to work with DDP.

In [4]:
# Run this cell to show the DDP script's source with syntax highlighting.
# The Code object is the cell's last expression, so it is rendered as the output.
display.Code("cnn_part3/simple_linear_regression_parallel.py")

In [None]:
# Run the DDP script with python3 through the shell; its output is shown below this cell.
!python3 cnn_part3/simple_linear_regression_parallel.py