In [1]:
import torch
import csv
import numpy as np
import matplotlib.pyplot as plt

Ensure CUDA is available

In [2]:
# Fail fast when no CUDA device is usable: the rest of the notebook allocates
# its tensors on `device` and times the work with CUDA events.
assert torch.cuda.is_available(), (
    "CUDA must be available to run this code on GPU."
)

# Handle for the default CUDA device, used for every tensor allocation below.
device = torch.device("cuda")

User-configurable parameters

In [3]:
# Element type of the benchmarked matrices. The cells below branch on this
# value: torch.complex64 / torch.complex128 take the complex path, while
# torch.float32 / torch.float64 take the real path.
# Uncomment the next line (and comment out the one after it) to benchmark
# complex double-precision input instead of real double-precision input.
#dtype = torch.complex128
dtype = torch.float64

In [4]:
# Shape of every matrix in a batch: m rows by n columns (both fixed at 4).
m, n = 4, 4

# Batch sizes to benchmark: the powers of two from 2**5 = 32 up to 2**16 = 65536.
batch_sizes = [2 ** exponent for exponent in range(5, 17)]

Warm-up execution

In [5]:
# Warm-up: run one untimed singular-value computation on a single m-by-n matrix
# of the configured dtype, so that CUDA context initialization is not charged
# to the first timed measurement.
if dtype == torch.complex64 or dtype == torch.complex128:
    # torch.randn is used here on real dtypes only: draw the real and imaginary
    # parts separately (in matching precision) and fuse them into one complex tensor.
    float_dtype = torch.float64 if dtype == torch.complex128 else torch.float32
    real = torch.randn((1, m, n), dtype=float_dtype, device=device)
    imag = torch.randn((1, m, n), dtype=float_dtype, device=device)
    warm_up = torch.complex(real, imag)
else:
    # Real dtypes can be sampled directly on the GPU.
    warm_up = torch.randn((1, m, n), dtype=dtype, device=device)

# Singular values only; the result is discarded, the call exists for its side effects.
S_warm_up = torch.linalg.svdvals(warm_up)

# Block until the GPU has finished so the warm-up work is complete before timing starts.
torch.cuda.synchronize()

Iterate over the batch sizes

In [6]:
# Benchmark torch.linalg.svdvals on the GPU for every batch size and compare
# the result against NumPy's CPU singular values.
#
# Results, one entry per element of `batch_sizes` (same order):
#   elapsed_times  - GPU time of the svdvals call, in seconds
#   avg_sv_errors  - mean over the batch of ||S_gpu - S_ref|| / ||S_ref||
elapsed_times = []
avg_sv_errors = []

# Loop-invariant dtype decisions, hoisted out of the loop.
is_complex = dtype in (torch.complex64, torch.complex128)
# NumPy precision of the sampled components: single precision for
# float32 / complex64, double precision otherwise.
np_float = np.float32 if dtype in (torch.float32, torch.complex64) else np.float64

for B in batch_sizes:
    # ── 1) Create a random batch on CPU (NumPy) with specified dtype ──
    if is_complex:
        # Generate complex data (real + imaginary parts)
        real_part = np.random.randn(B, m, n).astype(np_float)
        imag_part = np.random.randn(B, m, n).astype(np_float)
        batch_cpu = real_part + 1j * imag_part
    else:
        # Generate real data
        batch_cpu = np.random.randn(B, m, n).astype(np_float)

    # 2) Move to GPU (outside the timed region, so the transfer is not measured)
    X = torch.from_numpy(batch_cpu).to(device=device, dtype=dtype)

    # 3) Time only-the-values SVD with CUDA events. The synchronize before
    #    start.record() drains pending GPU work (e.g. the upload above); the one
    #    after end.record() ensures both events have completed before reading them.
    start = torch.cuda.Event(enable_timing=True)
    end   = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()

    S_gpu = torch.linalg.svdvals(X)

    end.record()
    torch.cuda.synchronize()
    # elapsed_time() reports milliseconds; store seconds.
    elapsed_times.append(start.elapsed_time(end) * 1e-3)

    # 4) Bring back to CPU
    S_gpu_cpu = S_gpu.cpu().numpy()  # shape (B, k)

    # 5) Compute reference singular values & error.
    #    np.linalg.svd accepts stacked matrices, so the whole batch is handled
    #    in one call instead of B separate Python-level calls. Both libraries
    #    return the singular values in descending order, so rows line up
    #    without any re-sorting.
    S_ref = np.linalg.svd(batch_cpu, compute_uv=False)  # shape (B, k)
    # Per-matrix relative error of the current batch (one entry per matrix).
    errors = np.linalg.norm(S_gpu_cpu - S_ref, axis=1) / np.linalg.norm(S_ref, axis=1)

    avg_sv_errors.append(np.mean(errors))
    print(f"B={B:4d} → time={elapsed_times[-1]:.6f}s, avg rel SV-error={avg_sv_errors[-1]:.2e}")

B=  32 → time=0.031204s, avg rel SV-error=8.37e-16
B=  64 → time=0.002630s, avg rel SV-error=7.64e-16
B= 128 → time=0.005414s, avg rel SV-error=7.87e-16
B= 256 → time=0.008200s, avg rel SV-error=7.99e-16
B= 512 → time=0.030365s, avg rel SV-error=7.45e-16
B=1024 → time=0.026955s, avg rel SV-error=7.50e-16
B=2048 → time=0.053343s, avg rel SV-error=7.50e-16
B=4096 → time=0.101542s, avg rel SV-error=7.64e-16
B=8192 → time=0.181040s, avg rel SV-error=7.54e-16
B=16384 → time=0.188657s, avg rel SV-error=7.66e-16
B=32768 → time=0.306101s, avg rel SV-error=7.63e-16
B=65536 → time=0.612079s, avg rel SV-error=7.60e-16


Saving the results

In [7]:
# Save data to CSV: one row per benchmarked batch size, holding the GPU time
# in seconds and the batch-averaged relative singular-value error.
filename = "resultsPyTorch.csv"
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["elapsed_time", "error"])  # Header row
    # Pair each timing with the matching per-batch average error.
    # `avg_sv_errors` has one entry per batch size, like `elapsed_times`;
    # `errors` must not be used here because it only holds the per-matrix
    # errors of the last batch processed by the benchmark loop.
    writer.writerows(zip(elapsed_times, avg_sv_errors))

print(f"Data saved to {filename}")

Data saved to resultsPyTorch.csv
