# Torch / ROCm GPU sanity check!

This notebook prints PyTorch and ROCm info and runs a tiny GPU matmul if a GPU is visible.


In [None]:
import os, sys, platform
# Report which interpreter and host this kernel is running on.
for label, value in (
    ('Python:', sys.version),
    ('Platform:', platform.platform()),
    ('Executable:', sys.executable),
):
    print(label, value)


In [None]:
import torch
# Summarise the installed torch build and the GPU backend versions it reports.
print('torch.__version__:', torch.__version__)
for backend in ('hip', 'cuda'):
    # getattr with a None default tolerates builds where the attribute is absent.
    print(f'torch.version.{backend}:', getattr(torch.version, backend, None))

gpu_visible = torch.cuda.is_available()
print('torch.cuda.is_available():', gpu_visible)
gpu_count = torch.cuda.device_count()
print('torch.cuda.device_count():', gpu_count)

if not gpu_visible:
    print('\nNo GPU visible to torch')
else:
    # List every visible device with a few of its properties.
    for idx in range(gpu_count):
        print(f'Device {idx}:', torch.cuda.get_device_name(idx))
        info = torch.cuda.get_device_properties(idx)
        # Fields are read defensively because some may not exist on all builds.
        mem_bytes = getattr(info, 'total_memory', 0)
        print('  total_memory (GiB):', mem_bytes / 1024**3)
        for field in ('multi_processor_count', 'gcnArchName'):
            print(f'  {field}:', getattr(info, field, None))


In [None]:
import time
import torch

def run_matmul(device, size=2048, warmup=5, iters=20, dtype=torch.float16):
    """Time a small square matmul on ``device`` and print a one-line summary.

    Two random ``size`` x ``size`` matrices are created on ``device``,
    multiplied ``warmup`` times untimed, then ``iters`` times under the
    clock. The printed line reports the device, the elapsed seconds for the
    timed loop, and the mean of the last product.

    Args:
        device: A ``torch.device``, or anything ``torch.device()`` accepts
            (for example the string ``'cuda:0'``).
        size: Side length of the square operand matrices.
        warmup: Number of untimed matmuls run before timing starts.
        iters: Number of timed matmuls; must be at least 1.
        dtype: Element type of the operands. NOTE(review): half-precision
            matmul may be slow or unsupported on some CPU builds; pass
            ``torch.float32`` there if needed -- confirm for your build.

    Raises:
        ValueError: If ``iters`` is less than 1 (there would be no result
            to report).
    """
    if iters < 1:
        raise ValueError('iters must be at least 1')

    device = torch.device(device)
    on_gpu = device.type == 'cuda'

    a = torch.randn((size, size), device=device, dtype=dtype)
    b = torch.randn((size, size), device=device, dtype=dtype)

    # Warmup iterations run before the clock starts.
    for _ in range(warmup):
        a @ b
    if on_gpu:
        # GPU work is queued asynchronously; wait on *this* device (not just
        # the current one) so pending warmup work is not billed to the timing.
        torch.cuda.synchronize(device)

    # perf_counter is monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    for _ in range(iters):
        c = a @ b
    if on_gpu:
        # Make sure the timed kernels have actually finished before stopping.
        torch.cuda.synchronize(device)
    t1 = time.perf_counter()

    print(f'Matmul OK on {device} | elapsed: {t1 - t0:.3f}s | c.mean(): {c.mean().item():.6f}')

# Only exercise the matmul when torch can actually see a GPU.
if not torch.cuda.is_available():
    print('Skipping matmul: no torch-visible GPU.')
else:
    run_matmul(torch.device('cuda:0'))


In [None]:
import subprocess, shlex

def _print_truncated(text, limit):
    """Print at most ``limit`` characters of ``text``, flagging any cut-off."""
    print(text[:limit])
    if len(text) > limit:
        print('... (truncated)')


def try_cmd(cmd, max_chars=4000, timeout=None):
    """Run ``cmd`` without a shell and print its combined stdout/stderr.

    This is a best-effort diagnostic helper: every failure mode is reported
    with a printed message instead of an exception, so one missing or broken
    tool does not abort the rest of the cell.

    Args:
        cmd: Command line as a single string; it is split with
            ``shlex.split`` and executed without a shell.
        max_chars: Maximum number of output characters to print.
        timeout: Optional limit in seconds passed to ``subprocess``;
            ``None`` waits indefinitely.
    """
    print('\n$ ' + cmd)

    try:
        argv = shlex.split(cmd)
    except ValueError as exc:
        # Raised by shlex for malformed input such as an unclosed quote.
        print('Could not parse command:', exc)
        return
    if not argv:
        # An empty argv would otherwise fail inside subprocess.
        print('Empty command; nothing to run.')
        return

    try:
        out = subprocess.check_output(
            argv,
            stderr=subprocess.STDOUT,  # interleave stderr with stdout
            text=True,
            errors='replace',  # undecodable bytes must not abort the report
            timeout=timeout,
        )
    except FileNotFoundError:
        print('Command not found.')
    except subprocess.CalledProcessError as e:
        print('Command failed with code', e.returncode)
        _print_truncated(e.output or '', max_chars)
    except subprocess.TimeoutExpired:
        print(f'Command timed out after {timeout} s.')
    except OSError as e:
        # E.g. the file exists but is not executable.
        print('Command could not be run:', e)
    else:
        _print_truncated(out, max_chars)

# Query the ROCm command-line tools, in this order.
for tool in ('rocminfo', 'rocm-smi'):
    try_cmd(tool)
