# GPU sanity checks

In [1]:
# Shell out to nvidia-smi to show the driver/CUDA versions and each GPU's current memory use and utilisation.
!nvidia-smi

Fri Aug 23 11:53:43 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100S-PCIE-32GB          On  |   00000000:00:05.0 Off |                    0 |
| N/A   40C    P0             27W /  250W |       0MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Checking GPU availability and memory...

In [2]:
import torch
import torch.cuda as cuda

In [3]:
# Report how many CUDA devices PyTorch can see, then each device's name and total memory

gpu_count = cuda.device_count()
print(f"Number of available GPUs: {gpu_count}")

bytes_per_gb = 1024 ** 3  # memory is displayed in units of 1024**3 bytes
for device_id in range(gpu_count):
    props = cuda.get_device_properties(device_id)
    print(f"GPU {device_id}: {props.name}")
    total_gb = props.total_memory / bytes_per_gb
    print(f"  Total Memory: {total_gb:.2f} GB")

Number of available GPUs: 1
GPU 0: Tesla V100S-PCIE-32GB
  Total Memory: 31.73 GB


In [4]:
# Parameters

# Minimum number of CUDA devices the validation cell requires; fewer raises ValueError.
expected_gpu_count = 1  # Change as needed
# Minimum total memory each checked GPU must report, in units of 1024**3 bytes.
expected_memory_gb = 31  # Change as needed (per GPU)
# Size of the tensor the load test allocates on each GPU (same units);
# the oversized test requests at least twice this.
tensor_size_gb = 31  # Change as needed

In [5]:
# Validate that the machine exposes at least the configured number of GPUs and
# that each of those GPUs reports at least the configured amount of memory

actual_gpu_count = cuda.device_count()
if expected_gpu_count > actual_gpu_count:
    raise ValueError(f"Expected at least {expected_gpu_count} GPUs, but found {actual_gpu_count}")

# Only the first `expected_gpu_count` devices are inspected; extra devices are ignored.
for device_id in range(expected_gpu_count):
    total_gb = cuda.get_device_properties(device_id).total_memory / 1024 ** 3
    if expected_memory_gb > total_gb:
        raise ValueError(f"Expected GPU {device_id} to have at least {expected_memory_gb} GB, but found {total_gb:.2f} GB")
    print(f"GPU {device_id} has sufficient memory: {total_gb:.2f} GB")

GPU 0 has sufficient memory: 31.73 GB


In [6]:
# Function to test loading a tensor onto the GPU
def test_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Successfully loaded tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")
        
# Function to test loading an oversized tensor onto the GPU
def test_oversized_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Unexpectedly succeeded in loading tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Correctly failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")
    
# Run both allocation checks on every GPU that the configuration expects.
for i in range(expected_gpu_count):
    print(f"\nTesting tensor load on GPU {i}...")
    test_tensor_load(i, tensor_size_gb)

    # The "oversized" request must exceed the device's total memory; twice
    # `tensor_size_gb` alone does not guarantee that when `tensor_size_gb` is
    # set well below the card's capacity. Rounding the reported total down and
    # adding one always lands strictly above it.
    total_memory_gb = cuda.get_device_properties(i).total_memory / 1024 ** 3
    oversized_gb = max(2 * tensor_size_gb, int(total_memory_gb) + 1)

    print(f"\nTesting oversized tensor load on GPU {i}...")
    test_oversized_tensor_load(i, oversized_gb)


Testing tensor load on GPU 0...
Successfully loaded tensor of size 31 GB onto GPU 0

Testing oversized tensor load on GPU 0...
Correctly failed to load tensor of size 62 GB onto GPU 0: CUDA out of memory. Tried to allocate 62.00 GiB. GPU 


The validation confirms that the current system configuration meets the expected requirements:
- **GPU Count:** The expected number of GPUs is available.
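- **GPU Memory:** Each GPU reports at least the expected amount of memory, and the tensor load test confirms that it can actually be allocated, so it is sufficient to support the planned workload.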

This ensures that the system is adequately provisioned for the tasks requiring GPU resources.