In [1]:
import torch
import torch.nn as nn
import time

# Number of timed repetitions averaged for each benchmark.
N_ITERS = 10


def _time_matmul(x, y, iters=N_ITERS, sync=None):
    """Return ``(seconds_per_call, last_product)`` for ``torch.matmul(x, y)``.

    One untimed warm-up call runs first so one-off start-up costs are not
    charged to the measurement; the same procedure is applied to every
    device so the comparison stays fair.

    Args:
        x, y: Tensors on the same device, shaped so ``torch.matmul`` accepts them.
        iters: Number of timed calls to average over (must be >= 1).
        sync: Optional zero-argument callable invoked before each clock read,
            e.g. ``torch.cuda.synchronize``. CUDA kernels are launched
            asynchronously, so without it only launch overhead is measured.

    Raises:
        ValueError: If ``iters`` is smaller than 1.
    """
    if iters < 1:
        raise ValueError("iters must be >= 1")

    product = torch.matmul(x, y)  # warm-up, not timed
    if sync is not None:
        sync()

    # perf_counter is monotonic and high-resolution; time.time() can jump or
    # tick coarsely, which could yield a zero interval and a division by zero
    # in the speedup ratio below.
    t0 = time.perf_counter()
    for _i in range(iters):
        product = torch.matmul(x, y)
    if sync is not None:
        sync()
    return (time.perf_counter() - t0) / iters, product


print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    # Report on the device the benchmark will actually run on instead of
    # assuming it is index 0.
    dev_idx = torch.cuda.current_device()
    print("CUDA version:", torch.version.cuda)
    print("Device count:", torch.cuda.device_count())
    print("Current device:", dev_idx)
    print("Device name:", torch.cuda.get_device_name(dev_idx))
    print("Device memory:", f"{torch.cuda.get_device_properties(dev_idx).total_memory / 1e9:.1f} GB")

    # Test basic tensor operations on GPU
    print("\n--- GPU Performance Test ---")
    device = torch.device('cuda')

    # Create test tensors
    size = 1000
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    # Time matrix multiplication on the GPU (synchronize so queued kernels
    # have finished before the clock is read).
    gpu_time, c = _time_matmul(a, b, N_ITERS, sync=torch.cuda.synchronize)

    print(f"GPU matrix multiply ({size}x{size}): {gpu_time*1000:.2f} ms")

    # Compare with CPU on identical data, using the same warm-up + timing
    # procedure as the GPU run.
    a_cpu = a.cpu()
    b_cpu = b.cpu()
    cpu_time, c_cpu = _time_matmul(a_cpu, b_cpu, N_ITERS)

    print(f"CPU matrix multiply ({size}x{size}): {cpu_time*1000:.2f} ms")
    print(f"GPU speedup: {cpu_time/gpu_time:.1f}x")

else:
    print("CUDA not available - using CPU only")
    device = torch.device('cpu')
    print("Device:", device)

PyTorch version: 1.10.0
CUDA available: True
CUDA version: 10.2
Device count: 1
Current device: 0
Device name: NVIDIA Tegra X1
Device memory: 4.2 GB

--- GPU Performance Test ---
GPU matrix multiply (1000x1000): 28.22 ms
CPU matrix multiply (1000x1000): 74.26 ms
GPU speedup: 2.6x
