<a href="https://colab.research.google.com/github/fmars/n00bGPT/blob/main/colab/mem_usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate nvidia-ml-py3



In [None]:
import torch
from pynvml import *

In [None]:
def gpu_util(s=''):
    """Print a one-line summary of PyTorch's memory counters for GPU 0, in GiB.

    Args:
        s: Label printed at the start of the line (left-aligned and padded to
            25 characters) to identify where the measurement was taken.

    The printed fields are:
        total     -- ``total_memory`` from the device properties of GPU 0
        reserved  -- value of ``torch.cuda.memory_reserved(0)``
        allocated -- value of ``torch.cuda.memory_allocated(0)``
        free      -- total minus allocated

    Only ``torch.cuda`` counters are read, so NVML is not initialised here.
    """
    gib = 1024**3
    total = torch.cuda.get_device_properties(0).total_memory
    reserved = torch.cuda.memory_reserved(0)
    allocated = torch.cuda.memory_allocated(0)
    # Total minus allocated: memory that is reserved but not currently
    # allocated is therefore counted as free.
    free = total - allocated
    print(f'{s:<25}: GPU memory (Torch) total: {total/gib:.1f}, reserved: {reserved/gib:.1f}, allocated: {allocated/gib:.1f}, free: {free/gib:.1f}')

# Baseline reading with an empty label, taken before the model below is created.
gpu_util()

                         : GPU memory (Torch) total: 14.7, reserved: 0.0, allocated: 0.0, free: 14.7


# Memory usage in forward(), backward(), opt.step()

In [None]:
dev = torch.device('cuda')


class Model(torch.nn.Module):
    """Toy module used to watch GPU memory while a forward pass runs.

    It owns two equally sized parameters, ``a`` and ``b``, placed on ``dev``.
    ``forward`` takes no input: it builds four elementwise results from the
    parameters and calls ``gpu_util`` after each one so the memory counters are
    printed step by step.
    """

    def __init__(self):
        super().__init__()
        # Element count per parameter: 1024**3 // 4. With 4-byte elements that
        # is 1 GiB each (presumably the default float32 dtype; the recorded
        # output shows 2.0 GiB allocated after construction).
        numel = 1024 * 1024 * 1024 // 4
        self.a = torch.nn.Parameter(torch.randn(numel).to(dev))
        self.b = torch.nn.Parameter(torch.randn(numel).to(dev))

    def forward(self):
        """Return ``(a + b) + (a - b)``, printing memory stats after every op."""
        summed = torch.add(self.a, self.b)
        gpu_util('inside foward 1')
        differenced = torch.sub(self.a, self.b)
        gpu_util('inside foward 2')
        # The product is never used afterwards; it is still bound to a local
        # name, so it stays referenced until forward() returns.
        multiplied = torch.mul(self.a, self.b)
        gpu_util('inside foward 3')
        output = torch.add(summed, differenced)
        gpu_util('inside foward4 ')
        return output

1. The model's two parameters take 2GB in total (1GB each).
2. Creating the optimizer does not allocate any GPU memory.
3. forward() allocates memory for every intermediate tensor (4*1GB=4GB).
4. After forward() returns, only the output tensor (1GB) is still allocated; the other intermediate tensors (3GB) are freed, although that memory stays reserved by PyTorch's caching allocator.
5. backward() allocates and computes a gradient for each parameter (2GB).
6. Optimizer step: the SGD optimizer takes no extra memory and updates the weights in place. Adam stores two running-average states (of the gradient and of the squared gradient) for each parameter (2*2GB=4GB).

In [None]:
# Walk through model creation, optimizer creation and five training iterations,
# printing the memory counters via gpu_util() after every step.
# NOTE: the statement order is the experiment itself, so do not reorder it.
gpu_util('before creating model')
m = Model()
gpu_util('after creating model')
# Adam with its default hyperparameters over both model parameters.
opt = torch.optim.Adam(m.parameters())
# Alternative optimizer for comparison (swap with the line above):
# opt = torch.optim.SGD(m.parameters(),lr=1e-3)
gpu_util('after creating opt')
for i in range(5):
  print(f'iteration {i}')
  # `preds` and `loss` are notebook-level names, so the tensors from the
  # previous iteration stay referenced until these names are rebound below.
  preds = m()
  gpu_util('after forward')
  # Reduce the output to a scalar so backward() can be called on it.
  loss = torch.mean(preds)
  gpu_util('after loss')
  loss.backward()
  gpu_util('after loss.backward')
  opt.step()
  gpu_util('after opt.step')
  # NOTE(review): whether zero_grad() releases the gradient tensors or only
  # zeroes them depends on the PyTorch version's default; check the reading
  # printed on the next line.
  opt.zero_grad()
  gpu_util('after opt.zero_grad')

before creating model    : GPU memory (Torch) total: 14.7, reserved: 0.0, allocated: 0.0, free: 14.7
after creating model     : GPU memory (Torch) total: 14.7, reserved: 2.0, allocated: 2.0, free: 12.7
after creating opt       : GPU memory (Torch) total: 14.7, reserved: 2.0, allocated: 2.0, free: 12.7
iteration 0
inside foward 1          : GPU memory (Torch) total: 14.7, reserved: 3.0, allocated: 3.0, free: 11.7
inside foward 2          : GPU memory (Torch) total: 14.7, reserved: 4.0, allocated: 4.0, free: 10.7
inside foward 3          : GPU memory (Torch) total: 14.7, reserved: 5.0, allocated: 5.0, free: 9.7
inside foward4           : GPU memory (Torch) total: 14.7, reserved: 6.0, allocated: 6.0, free: 8.7
after forward            : GPU memory (Torch) total: 14.7, reserved: 6.0, allocated: 3.0, free: 11.7
after loss               : GPU memory (Torch) total: 14.7, reserved: 6.0, allocated: 3.0, free: 11.7
after loss.backward      : GPU memory (Torch) total: 14.7, reserved: 6.0, allocat

In [None]:
# Turn on the CUDA allocator's history recording so a snapshot can be taken later.
# NOTE(review): underscore-prefixed (private) PyTorch API; the accepted
# arguments may differ between PyTorch versions, so confirm for the version in use.
torch.cuda.memory._record_memory_history(enabled=True)

In [None]:
# Ten rounds of small allocations: two 100-element random tensors are created
# on the host, moved to the GPU, and added together on the device.
for i in range(10):
    x = torch.randn(100).to('cuda')
    y = torch.randn(100).to('cuda')
    z = torch.add(x, y)

In [None]:
# Capture the current allocator state via a private PyTorch API. The value
# displayed in the next cell is a dict with a 'segments' list.
s=torch.cuda.memory._snapshot()

In [None]:
# Display the raw snapshot dict (the output is long).
s

{'segments': [{'device': 0,
   'address': 134211956637696,
   'total_size': 1073741824,
   'allocated_size': 0,
   'active_size': 0,
   'requested_size': 0,
   'stream': 0,
   'segment_type': 'large',
   'blocks': [{'size': 1073741824,
     'requested_size': 1073741824,
     'state': 'inactive'}]},
  {'device': 0,
   'address': 134213030379520,
   'total_size': 1073741824,
   'allocated_size': 0,
   'active_size': 0,
   'requested_size': 0,
   'stream': 0,
   'segment_type': 'large',
   'blocks': [{'size': 1073741824,
     'requested_size': 1073741824,
     'state': 'inactive'}]},
  {'device': 0,
   'address': 134214104121344,
   'total_size': 1073741824,
   'allocated_size': 0,
   'active_size': 0,
   'requested_size': 0,
   'stream': 0,
   'segment_type': 'large',
   'blocks': [{'size': 1073741824,
     'requested_size': 1073741824,
     'state': 'inactive'}]},
  {'device': 0,
   'address': 134215177863168,
   'total_size': 1073741824,
   'allocated_size': 1073741824,
   'active_size

# CUDA caching allocator

In [None]:
import torch
import ctypes
import traceback

# Load the CUDA library
# cudart = ctypes.CDLL('libcudart.so')  # Adjust the library name as needed

# Define a custom CUDA allocator
# NOTE(review): `torch.cuda.MemoryAllocator` and
# `torch.cuda.memory._get_memory_allocator` are not names I can find in
# PyTorch's documented CUDA memory API (which exposes `CUDAPluggableAllocator`
# and `change_current_allocator`). Confirm this cell runs on the targeted
# PyTorch version before relying on it.
class CustomCudaAllocator(torch.cuda.MemoryAllocator):
    """Pass-through allocator wrapper.

    Every request is forwarded unchanged to the allocator captured in
    ``__init__``. The "record ... here" comments mark where bookkeeping is
    meant to go; nothing is recorded yet.
    """
    def __init__(self):
        # Keep a handle to the allocator that was active at construction time;
        # the script below also uses it to restore the original allocator.
        # NOTE(review): the base-class __init__ is not called; confirm that is intended.
        self.default_allocator = torch.cuda.memory._get_memory_allocator()

    def allocate(self, size):
        """Forward an allocation request of ``size`` to the wrapped allocator and return its result."""
        # Record allocation information here (shape, stack trace)
        allocation = self.default_allocator.allocate(size)
        return allocation

    def deallocate(self, allocation):
        """Forward the release of ``allocation`` to the wrapped allocator."""
        # Record deallocation information here (shape, stack trace)
        self.default_allocator.deallocate(allocation)

# Replace the default CUDA allocator with the custom allocator
# NOTE(review): `torch.cuda.memory._set_memory_allocator` is a private name I
# cannot confirm exists; verify against the PyTorch version in use.
custom_allocator = CustomCudaAllocator()
torch.cuda.memory._set_memory_allocator(custom_allocator)

# Example usage
# NOTE(review): legacy typed-tensor constructors; presumably these allocate
# uninitialised float32 CUDA tensors of shape (10,) and (5, 5). Confirm.
# `x` and `y` rebind the notebook-level names used by the allocation loop in an
# earlier cell.
x = torch.cuda.FloatTensor(10)
y = torch.cuda.FloatTensor(5, 5)

# Restore the original CUDA allocator
# The handle saved in CustomCudaAllocator.__init__ is passed back to the setter.
torch.cuda.memory._set_memory_allocator(custom_allocator.default_allocator)
