## Sample code for running AMP (automatic mixed precision) with PyTorch in Gradient

This notebook contains sample code only. For a worked tutorial, please see `amp_recipe.ipynb`.

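Sample workflow for working with AMP via NVIDIA Apex. After calling `amp.initialize`, the standard backward pass (first snippet below) is replaced by a backward pass wrapped in `amp.scale_loss` (second snippet below):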

```
from apex import amp
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


loss = criterion(…)
loss.backward()
optimizer.step()


loss = criterion(…)
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()

```

## Capturing function calls

In [None]:
# Keep a handle on the original implementation so the wrapper can delegate to it.
orig_linear = torch.nn.functional.linear


def wrapped_linear(*args, **kwargs):
    """Call the original ``linear`` with every floating-point tensor cast to float16.

    This illustrates how AMP-style function capturing works: the library
    function is replaced by a wrapper that casts its inputs before delegating.

    Args:
        *args: Positional arguments for ``torch.nn.functional.linear``.
        **kwargs: Keyword arguments for ``torch.nn.functional.linear``
            (for example ``bias=...``); these are cast the same way.

    Returns:
        Whatever the original ``torch.nn.functional.linear`` returns for the
        casted arguments.
    """

    def _to_half(value):
        # Only floating-point tensors are cast; integer tensors, None and any
        # other non-tensor value are passed through untouched.
        if torch.is_tensor(value) and torch.is_floating_point(value):
            return value.to(torch.float16)
        return value

    casted_args = [_to_half(arg) for arg in args]
    casted_kwargs = {name: _to_half(value) for name, value in kwargs.items()}
    return orig_linear(*casted_args, **casted_kwargs)


# Monkey-patch: every later call to torch.nn.functional.linear goes through the wrapper.
torch.nn.functional.linear = wrapped_linear

## Autocasting and Gradient Scaling Using PyTorch

In [None]:
# Creates model and optimizer in default precision
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)

# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        # NOTE(review): passing device_type assumes `autocast` is `torch.autocast` — confirm the import.
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)

        # Scales the loss and calls backward() on the scaled loss to create scaled gradients.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # Unscales the gradients of optimizer's assigned params in-place.
        # This must happen after backward() and before scaler.step(), so that
        # clipping below sees the true gradient magnitudes.
        scaler.unscale_(optimizer)

        # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        # The optimizer's gradients are already unscaled, so scaler.step() does not unscale them again.
        # scaler.step() skips optimizer.step() if the gradients contain infs or NaNs.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

## Working with scaled gradients

### Gradient accumulation

In [None]:
scaler = GradScaler()

for epoch in epochs:
    for i, (input, target) in enumerate(data):
        with autocast():
            output = model(input)
            # Normalize the loss by the number of accumulation steps.
            loss = loss_fn(output, target) / iters_to_accumulate

        # Accumulate scaled gradients across iterations.
        scaler.scale(loss).backward()

        # Only update the weights on every iters_to_accumulate-th iteration.
        if (i + 1) % iters_to_accumulate != 0:
            continue

        # scaler.unscale_(optimizer) may be called here if desired.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

### Gradient penalty

In [None]:
for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)

        # Gradients of the loss w.r.t. the parameters; create_graph=True keeps
        # them differentiable so the penalty can be backpropagated.
        grad_params = torch.autograd.grad(
            outputs=loss,
            inputs=model.parameters(),
            create_graph=True,
        )

        # Penalty term: L2 norm over all parameter gradients, added to the loss.
        squared_norm = 0
        for param_grad in grad_params:
            squared_norm += param_grad.pow(2).sum()
        grad_norm = squared_norm.sqrt()
        loss = loss + grad_norm

        loss.backward()

        # Gradients may be clipped here.

        optimizer.step()

In [None]:
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        with autocast():
            output = model(input)
            loss = loss_fn(output, target)

        # Scale the loss for autograd.grad's backward pass; the result is a
        # tuple of scaled gradients.
        scaled_grad_params = torch.autograd.grad(
            outputs=scaler.scale(loss),
            inputs=model.parameters(),
            create_graph=True,
        )

        # The penalty must be computed from unscaled gradients. No optimizer
        # owns scaled_grad_params, so they are unscaled by plain multiplication
        # with the inverse scale instead of scaler.unscale_.
        inv_scale = 1. / scaler.get_scale()
        grad_params = [scaled * inv_scale for scaled in scaled_grad_params]

        # Compute the penalty term and add it to the loss.
        with autocast():
            squared_norm = 0
            for param_grad in grad_params:
                squared_norm += param_grad.pow(2).sum()
            grad_norm = squared_norm.sqrt()
            loss = loss + grad_norm

        # Scale the combined loss for the backward call, which accumulates
        # scaled leaf gradients.
        scaler.scale(loss).backward()

        # scaler.unscale_(optimizer) may be called here if desired.

        # step() and update() proceed as usual.
        scaler.step(optimizer)
        scaler.update()


## Working With Multiple Models, Losses, and Optimizers

In [None]:
scaler = torch.cuda.amp.GradScaler()

for epoch in epochs:
    for input, target in data:
        for opt in (optimizer1, optimizer2):
            opt.zero_grad()

        with autocast():
            output1 = model1(input)
            output2 = model2(input)
            loss1 = loss_fn(2 * output1 + 3 * output2, target)
            loss2 = loss_fn(3 * output1 - 5 * output2, target)

        # retain_graph is unrelated to AMP; it is present in this example
        # because both backward() calls share certain regions of the graph.
        scaler.scale(loss1).backward(retain_graph=True)
        scaler.scale(loss2).backward()

        # To inspect or modify the gradients of the params an optimizer owns,
        # choose which optimizers receive explicit unscaling.
        scaler.unscale_(optimizer1)

        for opt in (optimizer1, optimizer2):
            scaler.step(opt)

        scaler.update()

## Working with multiple GPUs

In [None]:
# Build the model and wrap it in nn.DataParallel.
model = Model_m()
p_model = nn.DataParallel(model)

# Enter autocast once, in the main thread, around the forward pass and the loss.
with autocast():
    # The forward pass of the wrapped model runs under autocast.
    # NOTE(review): this relies on autocast state reaching the threads
    # DataParallel uses — confirm for the PyTorch version in use.
    output = p_model(input)
    # loss_fn is also executed inside the autocast context.
    loss = loss_fn(output)