## Taken from:
https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html

In [27]:
import torch, time , gc
# Wall-clock timestamp written by start_timer() and read by
# end_timer_and_print(); None until start_timer() has been called once.
start_time = None

def start_timer():
    """Reset CUDA memory statistics and record the start of a timed run.

    The timestamp is stored in the module-level ``start_time``.
    """
    global start_time
    # Collect unreachable Python objects first so the tensors they hold can
    # be released before the caching allocator is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
    # is the supported call and clears the peak that
    # torch.cuda.max_memory_allocated() reports.
    torch.cuda.reset_peak_memory_stats()
    # Wait for GPU work that is already queued so it is not charged to the
    # run that is about to be timed.
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Print the elapsed time and peak tensor memory since start_timer().

    Args:
        local_msg: Heading printed, after a blank line, above the two
            measurement lines.
    """
    # Wait for queued GPU work so the timestamp covers all of it.
    torch.cuda.synchronize()
    stopped_at = time.time()
    print("\n" + local_msg)
    elapsed_sec = stopped_at - start_time
    print("Total execution time = {:.3f} sec".format(elapsed_sec))
    # Bytes -> Mbytes using a factor of 1e-6.
    peak_mbytes = torch.cuda.max_memory_allocated() * (1e-6)
    print("Max memory used by tensors = {} Mbytes".format(peak_mbytes))


### 2. simple network, dataset, loss and so on (basic setup)

In [29]:
def make_model(in_size, out_size, num_layers, device="cuda"):
    """Build a stack of ``Linear`` layers separated by ``ReLU`` activations.

    Args:
        in_size: Width of the input and of every hidden layer.
        out_size: Width of the final layer's output.
        num_layers: Total number of ``Linear`` layers. The first
            ``num_layers - 1`` map ``in_size -> in_size`` and are each
            followed by a ``ReLU``; the last maps ``in_size -> out_size``.
            Values below 1 still produce the single output layer.
        device: Device the model is moved to. Defaults to ``"cuda"``, which
            matches the previously hard-coded ``.cuda()`` call; pass
            ``"cpu"`` to build the model on a machine without a GPU.

    Returns:
        A ``torch.nn.Sequential`` located on ``device``.
    """
    layers = []
    for _ in range(num_layers - 1):
        layers += [torch.nn.Linear(in_size, in_size), torch.nn.ReLU()]
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*layers).to(device)

In [30]:
# Benchmark configuration shared by every timing cell below.
# Number of rows in each generated input/target tensor.
batch_size = 512 # Try, for example, 128, 256, 513.
# Feature width of each input batch; also passed to make_model().
in_size = 4096
# Feature width of each target batch; also passed to make_model().
out_size = 4096
# Passed to make_model() as its layer count.
num_layers = 3
# Number of (input, target) pairs generated below.
num_batches = 50
# Number of passes over `data` made by each training cell.
epochs = 3

# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' dtype when enabling mixed precision.
data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]

# Mean-squared-error loss used by every training cell below.
loss_fn = torch.nn.MSELoss().cuda()

In [31]:
# Times model construction plus one forward pass over every batch.
# No loss, backward pass or optimizer step is performed in this cell.
start_timer()
net = make_model(in_size, out_size, num_layers)
for input, target in zip(data, targets):
    # `target` is not used here; only the forward pass is exercised.
    output = net(input)
    
end_timer_and_print("Forward Pass : ")


Forward Pass : 
Total execution time = 0.558 sec
Max memory used by tensors = 1501.708288 Mbytes


### 3. Default Precision (no AMP)

In [32]:
# Baseline: plain training loop with no autocast and no GradScaler.
# The model and optimizer are built before start_timer(), so their
# construction is not part of the measured time.
# NOTE: `net` and `opt` are not re-created by the later cells, so those cells
# keep training this same model with this same optimizer.
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

start_timer()
for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Default precision:")


Default precision:
Total execution time = 1.860 sec
Max memory used by tensors = 1703.084032 Mbytes


### 4. Now doing AMP

#### 4.1 Adding Autocast
Autocast automatically runs selected operations in fp16 (e.g. linear layers), while keeping others (e.g. the MSE loss) in fp32.

```
with torch.autocast(device_type='cuda', dtype=torch.float16):
    output = net(input)
    loss = loss_fn(output, target)    
```

In [34]:
# Autocast only (no GradScaler). Reuses `net` and `opt` from the
# default-precision cell above.
start_timer()
for epoch in range(epochs): # runs `epochs` full passes over the data (epochs is set in the setup cell)
    for input, target in zip(data, targets):
        # Runs the forward pass under autocast.
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            output = net(input)
            # output is float16 because linear layers autocast to float16.
            assert output.dtype is torch.float16

            loss = loss_fn(output, target)
            # loss is float32 because mse_loss layers autocast to float32.
            assert loss.dtype is torch.float32

        # Exits autocast before backward().
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        loss.backward()
        opt.step()
        opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Autocast :")


Autocast :
Total execution time = 0.691 sec
Max memory used by tensors = 1405.198848 Mbytes


#### 4.2. Adding GradScaler

Scale the loss by a factor S so that the gradients are scaled by S as well (this prevents small gradient values from underflowing to zero in fp16).

=> MUST CREATE A NEW GradScaler FOR EVERY NEW FOLD (i.e. for every new convergence run)!  

```
scaler = torch.cuda.amp.GradScaler() #an object
##do the autocast we did before 
with torch.autocast(device_type='cuda', dtype=torch.float16):
    output = net(input)
    loss = loss_fn(output, target)

scaler.scale(loss).backward() # call backward() on the scaled loss (the loss itself is fp32 under autocast) to create scaled gradients

scaler.step(opt) # unscales the gradients, then calls opt.step() only if they contain no infs or NaNs (otherwise the step is skipped)

scaler.update() # updates the scale factor for the next iteration
```

In [38]:
# Autocast combined with gradient scaling. Reuses `net` and `opt` from the
# default-precision cell above.
start_timer()
# Constructs scaler once, at the beginning of the convergence run, using default args.
# If your network fails to converge with default GradScaler args, please file an issue.
# The same GradScaler instance should be used for the entire convergence run.
# If you perform multiple convergence runs in the same script, each run should use
# a dedicated fresh GradScaler instance.  GradScaler instances are lightweight.
scaler = torch.cuda.amp.GradScaler() 

for epoch in range(epochs): # runs `epochs` full passes over the data (epochs is set in the setup cell)
    for input, target in zip(data, targets):
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            output = net(input)
            loss = loss_fn(output, target)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(opt)

        # Updates the scale for next iteration.
        scaler.update()

        opt.zero_grad() # set_to_none=True here can modestly improve performance
end_timer_and_print("Autocast and GradScaler :")


Autocast and GradScaler :
Total execution time = 0.795 sec
Max memory used by tensors = 1405.200384 Mbytes


#### 4.3 easily switching b/w AMP and default 

`GradScaler` 과 `autocast`에서 (`enabled = True or False`)

 ```
 scaler = torch.cuda.amp.GradScaler(enabled = enable_AMP) 
 with torch.autocast(device_type='cuda', dtype=torch.float16, enabled = enable_AMP):
 ```

In [45]:
# Same loop as the GradScaler cell, with AMP switched by a single flag that is
# passed as `enabled=` to both GradScaler and autocast.
# Reuses `net` and `opt` from the default-precision cell above.
start_timer()
# Set to True to turn mixed precision on; False disables both autocast and the scaler.
enable_AMP = False
scaler = torch.cuda.amp.GradScaler(enabled = enable_AMP) 

for epoch in range(epochs): # runs `epochs` full passes over the data (epochs is set in the setup cell)
    for input, target in zip(data, targets):
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled = enable_AMP):
            output = net(input)
            loss = loss_fn(output, target)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(opt)

        # Updates the scale for next iteration.
        scaler.update()

        opt.zero_grad() # set_to_none=True here can modestly improve performance
# NOTE: this label is printed unchanged even when enable_AMP is False.
end_timer_and_print("Autocast and GradScaler :")


Autocast and GradScaler :
Total execution time = 1.859 sec
Max memory used by tensors = 1359.069696 Mbytes


나머지 : 생략 