# GPU sanity checks

In [1]:
# Run the `nvidia-smi` command-line tool in a shell (IPython `!` escape) and show its output.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Ask the `nvcc` found on PATH for its version string (shell command via IPython `!`).
!nvcc --version

In [None]:
# Print the name of the host this kernel is running on (shell command via IPython `!`).
!hostname

In [None]:
# Run the vectorAdd executable from a `.cuda-samples` directory relative to the
# notebook's working directory.
# NOTE(review): presumably the samples must be cloned and built beforehand — confirm.
!./.cuda-samples/Samples/0_Introduction/vectorAdd/vectorAdd

# Checks

In [6]:
# Import necessary libraries
import torch
import torch.cuda as cuda

In [None]:
import torch

# Report what PyTorch can see and pick the device used by later cells.
# is_available() and device_count() work on a host without CUDA, but the
# per-device queries raise there. They are guarded so the cell always reaches
# the `device` assignment instead of stopping before the CPU fallback.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    print(torch.cuda.current_device())
    # NOTE: torch.cuda.device(0) is a context-manager object, so this prints its repr.
    print(torch.cuda.device(0))
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; skipping per-device queries")
device = torch.device("cuda:0" if cuda_available else "cpu")
print("device:", device)

In [None]:
# Smoke test: create a one-element zero tensor and copy it to the current CUDA device.
# The resulting tensor is the cell's last expression, so the notebook displays it.
torch.zeros(1).cuda()

In [None]:
import torch

# Byte counts for GPU 0 as seen by PyTorch, printed with labels.
# The dict stays empty on a host without CUDA, where get_device_properties(0)
# would raise. Descriptive names replace the former single-letter globals
# (t, r, a, f), two of which are rebound to unrelated objects by later cells.
gpu0_memory_bytes = {}
if torch.cuda.is_available():
    reserved_bytes = torch.cuda.memory_reserved(0)
    allocated_bytes = torch.cuda.memory_allocated(0)
    gpu0_memory_bytes = {
        "total": torch.cuda.get_device_properties(0).total_memory,
        "reserved": reserved_bytes,
        "allocated": allocated_bytes,
        # memory held by PyTorch's allocator but not backing any live tensor
        "free inside reserved": reserved_bytes - allocated_bytes,
    }
    for label, n_bytes in gpu0_memory_bytes.items():
        print(f"{label}: {n_bytes}")
else:
    print("CUDA is not available; skipping the GPU memory report")

In [None]:
import re

# Total physical RAM of the host in kB, parsed from the Linux-only /proc/meminfo.
# Defaults to None so the final print cannot hit an unbound name when the file
# is missing or contains no MemTotal line.
mem_total_kB = None
try:
    with open('/proc/meminfo') as meminfo_file:
        meminfo = meminfo_file.read()
except OSError as err:
    print(f"Could not read /proc/meminfo: {err}")
else:
    # MULTILINE lets ^ anchor at any line start, not only at the start of the file.
    matched = re.search(r'^MemTotal:\s+(\d+)', meminfo, flags=re.MULTILINE)
    if matched:
        mem_total_kB = int(matched.group(1))
print(mem_total_kB)

In [None]:
# Instantiate a default IntSlider as the cell's last expression so the notebook displays it.
# NOTE(review): presumably a smoke test that ipywidgets rendering works in this Jupyter setup — confirm.
from ipywidgets import IntSlider
IntSlider()

In [7]:
# Function to check the number of GPUs and their memory
def check_gpus():
    """Print how many CUDA devices PyTorch sees, then each device's name and total memory.

    Total memory is shown in units of 1024 ** 3 bytes with two decimals.
    Nothing is returned; the report goes to stdout only.
    """
    n_devices = cuda.device_count()
    print(f"Number of available GPUs: {n_devices}")

    bytes_per_gb = 1024 ** 3
    for index in range(n_devices):
        props = cuda.get_device_properties(index)
        total_gb = props.total_memory / bytes_per_gb
        print(f"GPU {index}: {props.name}")
        print(f"  Total Memory: {total_gb:.2f} GB")

# Function to validate the expected number of GPUs and memory
def validate_gpus(expected_gpu_count, expected_memory_gb):
    """Raise ValueError unless enough GPUs with enough memory are present.

    First requires at least ``expected_gpu_count`` visible CUDA devices. Then
    walks the first ``expected_gpu_count`` devices in index order and requires
    each to report at least ``expected_memory_gb`` of total memory (in units of
    1024 ** 3 bytes), printing one confirmation line per device that passes.

    Raises:
        ValueError: too few devices, or a checked device has too little memory.
    """
    available = cuda.device_count()
    if available < expected_gpu_count:
        raise ValueError(f"Expected at least {expected_gpu_count} GPUs, but found {available}")

    bytes_per_gb = 1024 ** 3
    for index in range(expected_gpu_count):
        memory_gb = cuda.get_device_properties(index).total_memory / bytes_per_gb
        if memory_gb < expected_memory_gb:
            raise ValueError(
                f"Expected GPU {index} to have at least {expected_memory_gb} GB, "
                f"but found {memory_gb:.2f} GB"
            )
        print(f"GPU {index} has sufficient memory: {memory_gb:.2f} GB")

# Function to test loading a tensor onto the GPU
def test_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Successfully loaded tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")

# Function to test loading an oversized tensor onto the GPU
def test_oversized_tensor_load(gpu_index, size_gb):
    tensor_size = int(size_gb * 1024 ** 3 / 4)  # size in floats (4 bytes per float)
    device = torch.device(f'cuda:{gpu_index}')
    try:
        tensor = torch.rand(tensor_size, device=device)
        print(f"Unexpectedly succeeded in loading tensor of size {size_gb} GB onto GPU {gpu_index}")
    except RuntimeError as e:
        print(f"Correctly failed to load tensor of size {size_gb} GB onto GPU {gpu_index}: {e}")

## Checking GPU availability and memory...

In [9]:
# Print the device count, then the name and total memory of every GPU PyTorch can see.
check_gpus()

Number of available GPUs: 2
GPU 0: Tesla V100S-PCIE-32GB
  Total Memory: 31.73 GB
GPU 1: Tesla V100S-PCIE-32GB
  Total Memory: 31.73 GB


## Validating expected GPU count and memory...

In [8]:
# Parameters
# NOTE: the helper functions convert with 1024 ** 3, so every "GB" value below is in GiB.
expected_gpu_count = 2  # Change as needed; minimum number of GPUs validate_gpus() requires
expected_memory_gb = 31  # Change as needed (per GPU); minimum total memory each checked GPU must report
tensor_size_gb = 31  # Change as needed; size of the test tensor (the oversized test uses twice this)

In [10]:
# Gate: validate_gpus raises ValueError when the machine has too few GPUs or too
# little memory per GPU, in which case the allocation tests below never run.
validate_gpus(expected_gpu_count, expected_memory_gb)

# Per GPU: one allocation of the configured size, then one of twice that size.
oversized_gb = 2 * tensor_size_gb
for gpu_index in range(expected_gpu_count):
    print(f"\nTesting tensor load on GPU {gpu_index}...")
    test_tensor_load(gpu_index, tensor_size_gb)

    print(f"\nTesting oversized tensor load on GPU {gpu_index}...")
    test_oversized_tensor_load(gpu_index, oversized_gb)

GPU 0 has sufficient memory: 31.73 GB
GPU 1 has sufficient memory: 31.73 GB

Testing tensor load on GPU 0...
Successfully loaded tensor of size 31 GB onto GPU 0

Testing oversized tensor load on GPU 0...
Correctly failed to load tensor of size 62 GB onto GPU 0: CUDA out of memory. Tried to allocate 62.00 GiB. GPU 0 has a total capacity of 31.73 GiB of which 31.43 GiB is free. Process 837903 has 308.00 MiB memory in use. Of the allocated memory 0 bytes is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Testing tensor load on GPU 1...
Successfully loaded tensor of size 31 GB onto GPU 1

Testing oversized tensor load on GPU 1...
Correctly failed to load tensor of size 62 GB onto GPU 1: CUDA out of memory. Tried to allocate 

The validation confirms that the current system configuration meets the expected requirements:

- **GPU Count:** The expected number of GPUs (2) is available.
- **GPU Memory:** Each GPU reports at least the expected 31 GB of total memory (31.73 GB) and accepted a 31 GB tensor allocation, which is sufficient for the planned workload.

This ensures that the system is adequately provisioned for the tasks requiring GPU resources.

## More Validation

In [None]:
# Create the sample on the CPU with torch.tensor (explicit dtype and device)
# rather than the legacy torch.FloatTensor constructor, then move it to the
# device selected earlier in the notebook and show where it lives each time.
X_train = torch.tensor([0., 1., 2.], dtype=torch.float32, device="cpu")
print(X_train.is_cuda)  # False: the tensor was just created on the CPU
X_train = X_train.to(device)
print(X_train.is_cuda)  # True only when `device` is a CUDA device

In [None]:
import torch
import math

# Fit y = a + b x + c x^2 + d x^3 to sin(x) on [-pi, pi] with hand-written
# gradient descent on the summed squared error.
#
# The device is chosen here so the cell also runs on its own; replace the line
# with torch.device("cpu") to force a CPU run.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data_type = torch.float
n_points = 1500
n_steps = 1500
learning_rate = 1e-6

torch.manual_seed(0)  # reproducible initial coefficients across Restart & Run All

x = torch.linspace(-math.pi, math.pi, n_points, device=device, dtype=data_type)
y = torch.sin(x)

# Randomly initialised polynomial coefficients (0-dim tensors)
a = torch.randn((), device=device, dtype=data_type)
b = torch.randn((), device=device, dtype=data_type)
c = torch.randn((), device=device, dtype=data_type)
d = torch.randn((), device=device, dtype=data_type)

initial_loss = None  # loss before the first update, kept for a before/after comparison
for step in range(n_steps):
    # Forward pass: evaluate the cubic at every sample point (not at a constant)
    y_pred = a + b * x + c * x ** 2 + d * x ** 3
    loss = (y_pred - y).pow(2).sum().item()
    if initial_loss is None:
        initial_loss = loss
    if step % 100 == 99:
        print(step, loss)

    # Backward pass: d(loss)/d(y_pred) = 2 * (y_pred - y), then the chain rule
    # gives each coefficient's gradient as that residual weighted by x ** k.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Gradient-descent update of the coefficients
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')