In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch, time, gc
from typing import TypeVar, List, Tuple
# This notebook is a test of the tutorial; much of the code is taken from
# https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html

In [2]:
# Code taken from the PyTorch documentation

# Timing utilities
# Wall-clock timestamp written by start_timer(); stays None until the first call.
start_time = None

def start_timer():
    """Prepare a clean measurement window and record its start time.

    Runs the garbage collector and, when CUDA is available, empties the
    CUDA caching allocator, resets the peak-memory counters and waits for
    pending GPU work, so that the following end_timer_and_print() call
    reports only what happened after this point. The start timestamp is
    stored in the module-level ``start_time``.
    """
    global start_time
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
        # is its documented replacement.
        torch.cuda.reset_peak_memory_stats()
        # Wait for queued kernels so they are not billed to the timed region.
        torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Print the elapsed time and peak tensor memory since start_timer().

    Args:
        local_msg: Label printed, after a blank line, ahead of the statistics.

    Raises:
        RuntimeError: If the module-level ``start_time`` is still ``None``,
            i.e. start_timer() has not been called yet.
    """
    if start_time is None:
        # Fail up front with a clear message instead of a TypeError from
        # subtracting None after part of the report was already printed.
        raise RuntimeError("end_timer_and_print() called before start_timer()")
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Wait for queued GPU work so it is included in the measured time.
        torch.cuda.synchronize()
    end_time = time.time()
    peak_bytes = torch.cuda.max_memory_allocated() if cuda_available else 0
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(peak_bytes))

In [3]:

def addActivation(modules: List, activation: str):
    """
    author: Hyeon-Seo Yun Aug 2021

    Append the activation layer selected by `activation` to `modules`
    and return the same list. Recognised names are "elu", "relu",
    "tanh" and "leaky"; any other value leaves `modules` unchanged.
    """
    # (name, layer class) pairs, checked in order.
    # Note: "tanh" instantiates nn.Hardtanh, not nn.Tanh.
    known_activations = (
        ("elu", nn.ELU),
        ("relu", nn.ReLU),
        ("tanh", nn.Hardtanh),
        ("leaky", nn.LeakyReLU),
    )
    for name, layer_cls in known_activations:
        if activation == name:
            modules.append(layer_cls())
            break
    return modules

def fillModules(
    modules: List,
    input_dim: int,
    output_dim: int,
    layer_depth: int,
    hidden_nodes: int,
    device: str,
    activation = "relu",
    bias = True,
    put_batchnorm = False):
    """
    author: Hyeon-Seo Yun Aug 2021

    Append a fully connected stack to `modules` and return the list.

    The stack is: one Linear(input_dim -> hidden_nodes), then
    `layer_depth - 2` Linear(hidden_nodes -> hidden_nodes) layers, then
    one Linear(hidden_nodes -> output_dim). Every Linear, including the
    last one, is followed by addActivation(modules, activation). When
    `layer_depth` is 2 or less, only the first and last Linear layers
    are created.

    `modules` is normally an empty list; it is a parameter so that an
    existing list of layers can be extended. With put_batchnorm == True
    a BatchNorm1d(hidden_nodes) is inserted before every Linear layer
    except the first (input) one. Each created layer is moved to
    `device`, and `bias` is forwarded to every Linear layer.
    """
    print("fillModules device: ", device)

    def _dense(n_in, n_out):
        # One Linear layer, already placed on the requested device.
        return nn.Linear(n_in, n_out, bias=bias).to(device)

    # (in_features, out_features) of every Linear layer, in creation order.
    layer_shapes = (
        [(input_dim, hidden_nodes)]
        + [(hidden_nodes, hidden_nodes)] * (layer_depth - 2)
        + [(hidden_nodes, output_dim)]
    )
    for position, (n_in, n_out) in enumerate(layer_shapes):
        # Batch norm goes in front of every Linear layer but the first.
        if put_batchnorm and position > 0:
            modules.append(nn.BatchNorm1d(hidden_nodes).to(device))
        modules.append(_dense(n_in, n_out))
        modules = addActivation(modules, activation)
    return modules

In [4]:
# Code taken from the PyTorch documentation

# Benchmark configuration shared by the cells below.
batch_size = 256 # Try, for example, 128, 256, 512, 513.
in_size = 4000   # number of features in each input batch (second dim of `data` tensors)
out_size = 4000  # number of features in each target batch (second dim of `targets` tensors)
num_layers = 20  # layer count handed to the model builders; the author's note here read "1000"
hidden_num = 1000  # hidden width passed to fillModules as hidden_nodes
num_batches = 8**3  # 512 batches per epoch
epochs = 10  # passes over all batches in each timed run

# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' dtype when enabling mixed precision.
# Both lists are allocated directly on the GPU, one tensor per batch.
data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]

# Mean-squared-error loss between model output and target.
loss_fn = torch.nn.MSELoss().cuda()

In [5]:
# Code taken from the PyTorch documentation

for epoch in range(0):  # zero iterations: this cell only illustrates the pattern
    for inputs, target in zip(data, targets):
        # The forward pass and the loss are computed inside the autocast region.
        with torch.cuda.amp.autocast():
            output = model(inputs)
            # Linear layers autocast to float16, so the output is float16.
            assert output.dtype is torch.float16

            loss = loss_fn(output, target)
            # mse_loss autocasts to float32, so the loss is float32.
            assert loss.dtype is torch.float32

        # Leave autocast before backward(): running the backward pass under
        # autocast is not recommended. Backward ops use the dtype autocast
        # picked for the matching forward ops.
        loss.backward()
        optim.step()
        optim.zero_grad()  # set_to_none=True here can modestly improve performance

In [6]:
# Code taken from the PyTorch documentation

# With GradScaler the backward pass can also be used with mixed precision:
# the loss is scaled before backward() and the optimizer step goes through the scaler.


# Constructs scaler once, at the beginning of the convergence run, using default args.
# If your network fails to converge with default GradScaler args, please file an issue.
# The same GradScaler instance should be used for the entire convergence run.
# If you perform multiple convergence runs in the same script, each run should use
# a dedicated fresh GradScaler instance.  GradScaler instances are lightweight.
scaler = torch.cuda.amp.GradScaler()

for epoch in range(0): # 0 epochs, this section is for illustration only
    for inputs, target in zip(data, targets):
        # Forward pass and loss run under autocast.
        with torch.cuda.amp.autocast():
            output = model(inputs)
            loss = loss_fn(output, target)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        # `optim` is the optimizer name used by every other cell; the previous
        # `opt` is not defined anywhere in the visible notebook and would raise
        # NameError once the epoch count is non-zero.
        scaler.step(optim)

        # Updates the scale for next iteration.
        scaler.update()

        optim.zero_grad() # set_to_none=True here can modestly improve performance

In [7]:
def make_model(in_size, out_size, num_layers, device="cuda"):
    """Build a stack of Linear layers separated by ReLU activations.

    The first ``num_layers - 1`` layers are Linear(in_size -> in_size), each
    followed by a ReLU; the final layer is Linear(in_size -> out_size) with
    no activation after it. With ``num_layers <= 1`` only the final layer
    is created.

    Args:
        in_size: Input width, also used as the width of every hidden layer.
        out_size: Output width of the final layer.
        num_layers: Total number of Linear layers.
        device: Device the assembled model is moved to. Defaults to "cuda",
            which matches the previously hard-coded ``.cuda()`` call.

    Returns:
        A torch.nn.Sequential holding the layers, placed on ``device``.
    """
    layers = []
    for _ in range(num_layers - 1):
        layers += [torch.nn.Linear(in_size, in_size), torch.nn.ReLU()]
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*layers).to(device)

In [16]:
# Benchmark model built with make_model: Linear layers of width in_size.
# The fillModules-based variant is constructed in its own cell further down.
model = make_model(in_size=in_size, out_size=out_size, num_layers=num_layers)
optim = torch.optim.SGD(model.parameters(), lr=0.001)

In [20]:
# Code taken from the PyTorch documentation
use_amp = True

# enabled=use_amp lets the same loop be reused with autocast/scaling switched off.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

start_timer()
for epoch in range(epochs):
    # The batch is called `inputs` so the loop does not rebind the built-in
    # input() in the notebook's global namespace.
    for inputs, target in zip(data, targets):
        # Forward pass and loss run under autocast.
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(inputs)
            loss = loss_fn(output, target)
        # Backward on the scaled loss, step through the scaler, then refresh
        # the scale factor for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        optim.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Mixed precision:")

KeyboardInterrupt: 

In [21]:
# Code taken from the PyTorch documentation
"""
Without torch.cuda.amp, the following simple network executes 
all ops in default precision (torch.float32)
"""



# Baseline run: same model, data and optimizer, without autocast or GradScaler.
start_timer()
for epoch in range(epochs):
    # The batch is called `inputs` so the loop does not rebind the built-in
    # input() in the notebook's global namespace.
    for inputs, target in zip(data, targets):
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        optim.step()
        optim.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")

KeyboardInterrupt: 

In [19]:
# Rebuild the benchmark model with fillModules (default "relu" activation,
# hidden width hidden_num) and give it a fresh SGD optimizer.
modules = fillModules(
    modules=[],
    input_dim=in_size,
    output_dim=out_size,
    layer_depth=num_layers,
    hidden_nodes=hidden_num,
    device="cuda",
)
model = nn.Sequential(*modules)
optim = torch.optim.SGD(model.parameters(), lr=0.001)

fillModules device:  cuda


In [12]:
# Code taken from the PyTorch documentation
use_amp = True

# enabled=use_amp lets the same loop be reused with autocast/scaling switched off.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

start_timer()
for epoch in range(epochs):
    # The batch is called `inputs` so the loop does not rebind the built-in
    # input() in the notebook's global namespace.
    for inputs, target in zip(data, targets):
        # Forward pass and loss run under autocast.
        with torch.cuda.amp.autocast(enabled=use_amp):
            output = model(inputs)
            loss = loss_fn(output, target)
        # Backward on the scaled loss, step through the scaler, then refresh
        # the scale factor for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        optim.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Mixed precision:")


Mixed precision:
Total execution time = 35.269 sec
Max memory used by tensors = 7079164928 bytes


In [13]:
# Code taken from the PyTorch documentation
"""
Without torch.cuda.amp, the following simple network executes 
all ops in default precision (torch.float32)
"""



# Baseline run: same model, data and optimizer, without autocast or GradScaler.
start_timer()
for epoch in range(epochs):
    # The batch is called `inputs` so the loop does not rebind the built-in
    # input() in the notebook's global namespace.
    for inputs, target in zip(data, targets):
        output = model(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        optim.step()
        optim.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")


Default precision:
Total execution time = 23.101 sec
Max memory used by tensors = 4552902656 bytes
