# PYTORCH - CUDA CHECKLIST NOTEBOOK

In [None]:
import torch
import sys, os
import subprocess
import platform

In [None]:
def run_command(command):
    """Run a shell command and return its output as a string.

    The command is executed through the system shell (``shell=True``), so it
    must only be called with trusted, hard-coded command strings.

    Args:
        command: Command line to execute, passed to the shell as-is.

    Returns:
        The stripped stdout when the command exits with status 0.
        On a non-zero exit status, ``"Error/Not Found (Code <rc>): <message>"``
        where the message is the stripped stderr or, if stderr is empty, the
        stripped stdout.
        If running the command raised an exception,
        ``"Execution Failed: <exception>"``.
    """
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            shell=True,
        )
    except Exception as e:
        # Best-effort diagnostics helper: a failing probe must not abort the
        # notebook, so every failure is reported as text instead of raised.
        return f"Execution Failed: {e}"

    if result.returncode != 0:
        # Some tools write their failure message to stdout and leave stderr
        # empty; fall back to stdout so the reported error is not blank.
        details = result.stderr.strip() or result.stdout.strip()
        return f"Error/Not Found (Code {result.returncode}): {details}"
    return result.stdout.strip()

## SYSTEM & VERSIONS

In [None]:
# Report the host OS, the Python interpreter and the PyTorch build basics.
os_name = platform.system()
os_release = platform.release()
print(f"Diagnostics initialized on: {os_name} {os_release}")

# Only the first whitespace-separated token of sys.version is shown.
python_version = sys.version.split()[0]
print(f"Python Version:    {python_version}")

print(f"PyTorch Version:   {torch.__version__}")

# Backend availability as reported by torch.backends.
has_openmp = torch.backends.openmp.is_available()
print(f"OpenMP Enabled:  {has_openmp}")
has_mkl = torch.backends.mkl.is_available()
print(f"MKL Enabled:     {has_mkl}")

### System CUDA

In [None]:
# --- CUDA toolkit compiler (nvcc) ---
nvcc_output = run_command("nvcc --version")
if "release" in nvcc_output:
    # The membership test above guarantees at least one line contains
    # "release", so next() always finds a match here.
    version_line = next(
        line for line in nvcc_output.splitlines() if "release" in line
    )
    print(f"System NVCC:     {version_line.strip()}")

    # Render each line of the nvcc listing as a "  - item" bullet.
    arch_list = run_command("nvcc --list-gpu-arch").replace("\n", "\n  - ")
    print(f"Supported compute capabilities:\n  - {arch_list}")
    code_list = run_command("nvcc --list-gpu-code").replace("\n", "\n  - ")
    print(f"Supported GPU architectures:\n  - {code_list}")
else:
    print("System NVCC:     Not found in PATH (This is common if you only use Conda/Pip CUDA)")

# --- GPU driver ---
driver_ver = run_command("nvidia-smi --query-gpu=driver_version --format=csv,noheader")
print(f"GPU Driver:      {driver_ver}")

# --- Device visibility ---
# An unset variable and an empty one mean opposite things: unset exposes every
# GPU, while an empty value hides all of them. Test for None explicitly so the
# empty case is not reported as "Not Set".
visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
if visible_devices is None:
    visible_desc = "Not Set (All GPUs visible)"
elif not visible_devices.strip():
    visible_desc = "Set but empty (No GPUs visible)"
else:
    visible_desc = visible_devices
print(f"CUDA_VISIBLE_DEVICES: {visible_desc}")



### CUDA Bundled with PyTorch

In [None]:
# Summarise the PyTorch build and whether it can use CUDA.
print(f"PyTorch Version: {torch.__version__}")
print(f"Debug Build:     {torch.version.debug}")

# Stored at notebook level so other cells can branch on it.
cuda_available = torch.cuda.is_available()
print(f"CUDA Available:   {cuda_available}")

if not cuda_available:
    print("\n[!] CUDA is not available. Please check your NVIDIA drivers and PyTorch installation.")
else:
    # CUDA version PyTorch was built against, and the architectures it reports.
    bundled_cuda = torch.version.cuda
    print(f"PyTorch CUDA Version:      {bundled_cuda}")
    built_archs = torch.cuda.get_arch_list()
    print(f"Architecture List: {built_archs}")

In [None]:
# Report whether scaled_dot_product_attention (SDPA) exists in this PyTorch
# build and which of its backends are currently enabled.
sdpa_available = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
if sdpa_available:
    print(f"SDPA Available:     {sdpa_available}")
    # The *_sdp_enabled() flags say whether each backend is enabled in this
    # process, not whether the installed GPU can actually run it.
    # (Note: exact API availability depends on specific PyTorch nightly/version)
    try:
        from torch.backends import cuda as cuda_backend
        # Query everything before printing so a missing function does not
        # leave a half-printed report.
        flash_enabled = cuda_backend.flash_sdp_enabled()
        mem_efficient_enabled = cuda_backend.mem_efficient_sdp_enabled()
        math_enabled = cuda_backend.math_sdp_enabled()
    except (ImportError, AttributeError):
        # A query function that does not exist in this PyTorch version raises
        # AttributeError rather than ImportError, so both are handled.
        print("Could not query specific SDPA backends.")
    else:
        print(f"Flash Attention:    {flash_enabled}")
        print(f"Mem-Efficient Attn: {mem_efficient_enabled}")
        print(f"Math (Fallback):    {math_enabled}")
else:
    print("SDPA Available:     No (Upgrade PyTorch)")

In [None]:
# Report which torch.distributed communication backends this build supports.
distributed_available = torch.distributed.is_available()
print(f"Distributed Available: {distributed_available}")

if distributed_available:
    # 1. NCCL Check (NVIDIA Collective Communications Library)
    # Note: NCCL is generally NOT available on Windows (it's Linux-only)
    # Ask torch.distributed directly; hasattr() keeps very old builds that
    # lack the query function from raising.
    nccl_available = (
        hasattr(torch.distributed, "is_nccl_available")
        and torch.distributed.is_nccl_available()
    )
    print(f"NCCL Available:      {nccl_available}")
    if nccl_available:
        try:
            # Import the submodule explicitly rather than relying on it having
            # been loaded as a side effect of `import torch`.
            import torch.cuda.nccl
            print(f"NCCL Version:        {torch.cuda.nccl.version()}")
        except Exception as e:
            # Best-effort: the version is informational only, so report the
            # failure instead of silently swallowing it.
            print(f"NCCL Version:        Unknown ({e})")

    # 2. Gloo Check (Facebook's collective library - standard for Windows)
    # Call the function: checking only that it exists would report True even
    # on builds where Gloo is not available.
    gloo_available = (
        hasattr(torch.distributed, "is_gloo_available")
        and torch.distributed.is_gloo_available()
    )
    print(f"Gloo Available:      {gloo_available}")

    # 3. MPI Check (Message Passing Interface)
    mpi_available = torch.distributed.is_mpi_available()
    print(f"MPI Available:       {mpi_available}")

## DETECTED DEVICES

### Detected by System

In [None]:
# Show whatever the helper returns for `nvidia-smi`: the tool's output, or the
# helper's error string when the command fails.
smi_report = run_command("nvidia-smi")
print(smi_report)

### Detected by PyTorch

In [None]:
# List every GPU PyTorch can see, with its basic properties and precision support.
if cuda_available:
    device_count = torch.cuda.device_count()
    print(f"Detected GPUs:   {device_count}")

    for i in range(device_count):
        props = torch.cuda.get_device_properties(i)
        print(f"\n[GPU {i}: {props.name}]")
        print(f"  Compute Capability:  {props.major}.{props.minor}")
        # total_memory is in bytes; convert to GiB for display.
        print(f"  Total Memory:        {props.total_memory / (1024**3):.2f} GB")
        print(f"  Multiprocessors:     {props.multi_processor_count}")

        # Precision Support Checks
        try:
            # TF32 (TensorFloat-32) - Ampere+ only
            # NOTE: these two flags are process-wide PyTorch settings, so they
            # read the same for every GPU in this loop.
            print(f"  TF32 Allowed (Matmul): {torch.backends.cuda.matmul.allow_tf32}")
            print(f"  TF32 Allowed (CuDNN):  {torch.backends.cudnn.allow_tf32}")

            # BF16 (BFloat16) - Ampere+ only
            # is_bf16_supported() takes no device argument and inspects the
            # current device, so make GPU i current for the query; otherwise
            # every GPU would report the default device's answer.
            with torch.cuda.device(i):
                bf16_supported = torch.cuda.is_bf16_supported()
            print(f"  BF16 Supported:        {bf16_supported}")

            # FP16 (Half Precision)
            # Generally supported on all modern GPUs, but good to check context
            hw_fp16 = "YES (Native)" if props.major >= 6 else "Partial (Storage Only/Slow)"
            if props.major >= 7:
                hw_fp16 += " + Tensor Cores"
            print(f"  FP16 Support (HW):   {hw_fp16}")
        except AttributeError:
            print("  (Newer precision checks skipped - PyTorch version too old)")
else:
    print("No CUDA devices found.")

## FUNCTIONAL TEST

In [None]:
# End-to-end smoke test: CPU tensor -> GPU -> matmul -> back to CPU.
if cuda_available:
    try:
        # Create a tensor on CPU
        x = torch.rand(1000, 1000)
        print("1. Tensor created on CPU.")

        # Move to GPU
        device = torch.device("cuda")
        x_gpu = x.to(device)
        print(f"2. Tensor successfully moved to: {x_gpu.device}")

        # Perform Operation (Matrix Multiplication)
        y_gpu = torch.matmul(x_gpu, x_gpu)
        # CUDA kernels are launched asynchronously; wait for completion so a
        # failing kernel is reported at this step rather than after the
        # "successful" message has already been printed.
        torch.cuda.synchronize()
        print("3. Matrix multiplication on GPU successful.")

        # Move back to CPU
        y_cpu = y_gpu.cpu()
        print("4. Result moved back to CPU.")
        print("[OK] PyTorch CUDA functionality is working.")

    except Exception as e:
        # Diagnostic cell: report any failure as text instead of a traceback.
        print(f"\n[!] ERROR during functional test:\n{e}")
else:
    # Say so explicitly instead of leaving the cell output empty.
    print("Functional test skipped: CUDA is not available.")

## MEMORY DIAGNOSTICS

In [None]:
if cuda_available:
    # Memory PyTorch reports as allocated / reserved on device 0, converted
    # from bytes by dividing by 1024**3.
    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated(0) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(0) / bytes_per_gb
    for label, amount in (
        ("Current Memory Allocated:", allocated),
        ("Current Memory Reserved: ", reserved),
    ):
        print(f"{label} {amount:.4f} GB")

In [None]:
# Print PyTorch's abbreviated memory summary for device 0.
# Guarded like the other CUDA cells: without a usable CUDA device the summary
# call can fail, so report that instead. The check is done here directly so
# this cell does not depend on any other cell having run.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=0, abbreviated=True))
else:
    print("No CUDA devices found.")