# GPU sanity checks

In [1]:
!nvidia-smi

Thu Feb 27 16:59:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000001:00:00.0 Off |                    0 |
| N/A   31C    P0             46W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0


In [5]:
!hostname

jupyter-01jn1g9v33epps6f5f49rzeeyp


In [12]:
!/tmp/cuda-samples/Samples/0_Introduction/vectorAdd/vectorAdd

[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done


## Checking GPU availability and memory...

In [13]:
import torch
import torch.cuda as cuda

In [14]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

True
1
0
<torch.cuda.device object at 0x7fb34433c950>
NVIDIA A100 80GB PCIe
device: cuda:0


In [15]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t)
print(r)
print(a)
print(f)

84974239744
0
0
0


In [26]:
torch.zeros(1).cuda()

tensor([0.], device='cuda:0')

In [16]:
import re
with open('/proc/meminfo') as f:
    meminfo = f.read()
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
if matched: 
    mem_total_kB = int(matched.groups()[0])
print(mem_total_kB)

226764500


In [17]:
# Function to check the number of GPUs and their memory
def print_gpus():
    gpu_count = cuda.device_count()
    print(f"Number of available GPUs: {gpu_count}")
    for i in range(gpu_count):
        gpu_properties = cuda.get_device_properties(i)
        print(f"GPU {i}: {gpu_properties.name}")
        print(f"  Total Memory: {gpu_properties.total_memory / 1024 ** 3:.2f} GB")

In [18]:
print_gpus()

Number of available GPUs: 1
GPU 0: NVIDIA A100 80GB PCIe
  Total Memory: 79.14 GB


In [36]:
# To check the number of GPUs and their memory

gpu_count = cuda.device_count()
print(f"Number of available GPUs: {gpu_count}")

for i in range(gpu_count):
    gpu_properties = cuda.get_device_properties(i)
    print(f"GPU {i}: {gpu_properties.name}")
    print(f"  Total Memory: {gpu_properties.total_memory / 1024 ** 3:.2f} GB")

Number of available GPUs: 1
GPU 0: NVIDIA A100 80GB PCIe
  Total Memory: 79.14 GB


## Validating expected GPU count and memory...

In [40]:
# Parameters

expected_gpu_count = 1  # Change as needed
expected_memory_gb = 79  # Change as needed (per GPU)
tensor_size_gb = 79  # Change as needed

In [41]:
# To validate the expected number of GPUs and memory

actual_gpu_count = cuda.device_count()

def validate_gpus(expected_gpu_count, expected_memory_gb):
    
    if actual_gpu_count < expected_gpu_count:
        raise ValueError(f"Expected at least {expected_gpu_count} GPUs, but found {actual_gpu_count}")

    for i in range(expected_gpu_count):
        gpu_properties = cuda.get_device_properties(i)
        actual_memory_gb = gpu_properties.total_memory / 1024 ** 3
        if actual_memory_gb < expected_memory_gb:
            raise ValueError(f"Expected GPU {i} to have at least {expected_memory_gb} GB, but found {actual_memory_gb:.2f} GB")
        print(f"GPU {i} has sufficient memory: {actual_memory_gb:.2f} GB")

validate_gpus(expected_gpu_count, expected_memory_gb)

GPU 0 has sufficient memory: 79.14 GB


In [42]:
# Function to test loading a tensor onto the GPU
def test_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Successfully loaded tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")
        
# Function to test loading an oversized tensor onto the GPU
def test_oversized_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Unexpectedly succeeded in loading tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Correctly failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")
    
for i in range(expected_gpu_count):
    print(f"\nTesting tensor load on GPU {i}...")
    test_tensor_load(i, tensor_size_gb)
    print(f"\nTesting oversized tensor load on GPU {i}...")
    test_oversized_tensor_load(i, 2 * tensor_size_gb)


Testing tensor load on GPU 0...
Failed to load tensor of size 79 GB onto GPU 0: CUDA out of memory. Tried to allocate 79.00 GiB. GPU 0 has a total capacity of 79.14 GiB of which 78.65 GiB is free. Process 957717 has 492.00 MiB memory in use. Of the allocated memory 17.50 KiB is allocated by PyTorch, and 1.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Testing oversized tensor load on GPU 0...
Correctly failed to load tensor of size 158 GB onto GPU 0: CUDA out of memory. Tried to allocate 158.00 GiB. GPU 0 has a total capacity of 79.14 GiB of which 78.65 GiB is free. Process 957717 has 492.00 MiB memory in use. Of the allocated memory 17.50 KiB is allocated by PyTorch, and 1.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocat

The validation confirms that the current system configuration meets the expected requirements:
- **GPU Count:** The expected number of GPUs is available.
- **GPU Memory:** Each GPU has sufficient memory to support the planned workload.

This ensures that the system is adequately provisioned for the tasks requiring GPU resources.

## More Validation

In [43]:
X_train = torch.FloatTensor([0., 1., 2.])
print(X_train.is_cuda)
X_train = X_train.to(device)
print(X_train.is_cuda)

False
True


In [44]:
import torch
import math
# device = torch.device("cpu")
# device = torch.device("cuda:0") 
data_type = torch.float
x = torch.linspace(-math.pi, math.pi, 1500, device=device, dtype=data_type)
y = torch.sin(x)
a = torch.randn((), device=device, dtype=data_type)
b = torch.randn((), device=device, dtype=data_type)
c = torch.randn((), device=device, dtype=data_type)
d = torch.randn((), device=device, dtype=data_type)
learning_rate = 1e-6
m = 1
for i in range(1500):
    y_pred = a + b * m + c * m ** 2 + d * m ** 3
    loss = (y_pred - y).pow(2).sum().item()
    if i % 100 == 99:
        print(i, loss)
    grad_a = y_pred.sum()
    grad_b = (y_pred * m).sum()
    grad_c = (y_pred * m** 2).sum()
    grad_d = (y_pred * m ** 3).sum()
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 754.3153076171875
199 754.3112182617188
299 754.3071899414062
399 754.3031005859375
499 754.2990112304688
599 754.2949829101562
699 754.2909545898438
799 754.2869262695312
899 754.2828979492188
999 754.27880859375
1099 754.2747802734375
1199 754.270751953125
1299 754.2667236328125
1399 754.2626953125
1499 754.2586669921875
Result: y = -0.37436068058013916 + -0.8177333474159241 x + 0.864264190196991 x^2 + 0.384154349565506 x^3


In [45]:
from ipywidgets import IntSlider
IntSlider()

IntSlider(value=0)