Matrix-multiplication benchmarks using NumPy (CPU), CuPy and PyTorch (both GPU), and scikit-learn, for comparison against the CUDA Fortran matrix multiplication (which started at 350 GFLOPS and reached 3,000 GFLOPS). NumPy peaks at 266 GFLOPS and PyTorch peaks at 107 GFLOPS on an RTX 4060!

In [5]:
import numpy as np
import time

def calculate_gflops(M, N, K, time_seconds):
    """Return the throughput, in GFLOPS, of an (M x K) @ (K x N) product.

    The operation count is 2 * M * N * K (one multiply and one add per
    inner-product term), the same formula as the CUDA version, divided by
    1e9 and by the elapsed time in seconds.
    """
    total_flops = 2.0 * float(M) * float(N) * float(K)
    return (total_flops / 1e9) / time_seconds

def benchmark_matmul(M=5120, N=5120, K=5120, num_runs=10, num_warmup=5):
    """Time ``np.matmul`` on random float64 matrices and report GFLOPS.

    Args:
        M, N, K: problem size; the product is (M x K) @ (K x N).
        num_runs: number of timed multiplications.
        num_warmup: number of untimed multiplications performed first.

    Returns:
        dict with 'min_gflops', 'max_gflops', 'avg_gflops', 'std_gflops'
        and 'times_ms' (NumPy array of per-run times in milliseconds).
    """
    # float64 operands so the comparison with the double-precision CUDA
    # version is like for like.
    lhs = np.random.random((M, K)).astype(np.float64)
    rhs = np.random.random((K, N)).astype(np.float64)

    print("Performing warmup runs...")
    for _ in range(num_warmup):
        product = np.matmul(lhs, rhs)

    print("Performing timing runs...")
    durations = []
    rates = []
    for run_number in range(1, num_runs + 1):
        # Only the multiplication itself sits between the two clock reads.
        tic = time.perf_counter()
        product = np.matmul(lhs, rhs)
        toc = time.perf_counter()

        elapsed = toc - tic
        rate = calculate_gflops(M, N, K, elapsed)
        durations.append(elapsed)
        rates.append(rate)

        print(f"Run {run_number:2d} - Time: {elapsed*1000:8.2f} ms, Performance: {rate:8.2f} GFLOPS")

    # Summary statistics over the timed runs.
    summary = {
        'min_gflops': min(rates),
        'max_gflops': max(rates),
        'avg_gflops': sum(rates) / len(rates),
        'std_gflops': np.std(rates),
    }

    # Same report layout as the CUDA version.
    print("\nPerformance Results ({:d} runs):".format(num_runs))
    print(f"  Minimum: {summary['min_gflops']:10.2f} GFLOPS")
    print(f"  Maximum: {summary['max_gflops']:10.2f} GFLOPS")
    print(f"  Average: {summary['avg_gflops']:10.2f} GFLOPS")
    print(f"  Std Dev: {summary['std_gflops']:10.2f} GFLOPS")

    mean_ms = np.mean(durations) * 1000
    print(f"\nAverage execution time: {mean_ms:.2f} ms")

    summary['times_ms'] = np.array(durations) * 1000
    return summary

# Run the benchmark
if __name__ == "__main__":
    # Report the NumPy build so the numbers can be tied to a BLAS backend.
    print("NumPy Configuration:")
    print(f"  Version: {np.__version__}")
    # show_config() prints its report itself and returns None, so it is
    # called as a statement; interpolating it into an f-string would print
    # the report and then a stray "BLAS Info: None" line.
    print("  BLAS Info:")
    np.show_config()
    print()
    
    # Run benchmark
    results = benchmark_matmul(
        M=5120, 
        N=5120, 
        K=5120, 
        num_runs=10, 
        num_warmup=5
    )

NumPy Configuration:
  Version: 2.0.2
Build Dependencies:
  blas:
    detection method: pkgconfig
    found: true
    include directory: /opt/_internal/cpython-3.12.2/lib/python3.12/site-packages/scipy_openblas64/include
    lib directory: /opt/_internal/cpython-3.12.2/lib/python3.12/site-packages/scipy_openblas64/lib
    name: scipy-openblas
    openblas configuration: OpenBLAS 0.3.27  USE64BITINT DYNAMIC_ARCH NO_AFFINITY
      Zen MAX_THREADS=64
    pc file directory: /project/.openblas
    version: 0.3.27
  lapack:
    detection method: pkgconfig
    found: true
    include directory: /opt/_internal/cpython-3.12.2/lib/python3.12/site-packages/scipy_openblas64/include
    lib directory: /opt/_internal/cpython-3.12.2/lib/python3.12/site-packages/scipy_openblas64/lib
    name: scipy-openblas
    openblas configuration: OpenBLAS 0.3.27  USE64BITINT DYNAMIC_ARCH NO_AFFINITY
      Zen MAX_THREADS=64
    pc file directory: /project/.openblas
    version: 0.3.27
Compilers:
  c:
    commands

In [6]:
import cupy as cp
import numpy as np
import time

def calculate_gflops(M, N, K, time_seconds):
    """Convert a matmul timing into GFLOPS (1e9 floating point ops per second).

    Counts 2 * M * N * K operations for an (M x K) @ (K x N) product and
    divides by the elapsed time given in seconds.
    """
    operation_count = 2.0 * float(M) * float(N) * float(K)
    giga_operations = operation_count / 1e9
    return giga_operations / time_seconds

def benchmark_matmul_gpu(M=5120, N=5120, K=5120, num_runs=10, num_warmup=5):
    """Benchmark float64 matrix multiplication on the GPU using CuPy.

    Random operands are generated with NumPy on the host, copied to the GPU,
    multiplied ``num_warmup`` times untimed and then ``num_runs`` times with
    each multiplication bracketed by a pair of CUDA events.

    Args:
        M, N, K: problem size; the product is (M x K) @ (K x N).
        num_runs: number of timed multiplications (must be >= 1).
        num_warmup: number of untimed multiplications performed first.

    Returns:
        dict with 'min_gflops', 'max_gflops', 'avg_gflops', 'std_gflops'
        and 'times_ms' (list of per-run times in milliseconds).

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Keep the original "NxN" wording for the square case; spell out both
    # operand shapes otherwise.
    if M == N == K:
        size_text = f"{M}x{M}"
    else:
        size_text = f"({M}x{K}) @ ({K}x{N})"

    print("CuPy GPU Configuration:")
    print(f"  Device: {cp.cuda.runtime.getDeviceProperties(0)['name'].decode()}")
    print(f"  Memory: {cp.cuda.runtime.memGetInfo()[1] / 1e9:.1f} GB")
    print(f"  Matrix Size: {size_text}")
    print()

    # Initialize matrices on CPU then transfer to GPU
    print("Initializing matrices...")
    A_cpu = np.random.random((M, K)).astype(np.float64)
    B_cpu = np.random.random((K, N)).astype(np.float64)
    A = cp.array(A_cpu)
    B = cp.array(B_cpu)

    # Warmup runs; wait for each one to finish so no warmup work is still
    # in flight when timing starts.
    print("Performing warmup runs...")
    for _ in range(num_warmup):
        C = cp.matmul(A, B)
        cp.cuda.Stream.null.synchronize()

    # Performance measurement runs
    print("\nPerforming timing runs...")
    times = []
    gflops_values = []

    for run in range(num_runs):
        # A pair of CUDA events brackets the multiplication.
        start_event = cp.cuda.Event()
        end_event = cp.cuda.Event()

        start_event.record()
        C = cp.matmul(A, B)
        end_event.record()
        # Block until the end event has completed before reading the time.
        end_event.synchronize()

        elapsed_time_ms = cp.cuda.get_elapsed_time(start_event, end_event)
        gflops = calculate_gflops(M, N, K, elapsed_time_ms / 1000.0)

        times.append(elapsed_time_ms)
        gflops_values.append(gflops)

        print(f"Run {run + 1:2d} - Time: {elapsed_time_ms:8.2f} ms, Performance: {gflops:8.2f} GFLOPS")

    # Calculate statistics
    min_gflops = np.min(gflops_values)
    max_gflops = np.max(gflops_values)
    avg_gflops = np.mean(gflops_values)
    std_gflops = np.std(gflops_values)

    # Print final results
    print("\nPerformance Results ({:d} runs):".format(num_runs))
    print(f"  Minimum: {min_gflops:10.2f} GFLOPS")
    print(f"  Maximum: {max_gflops:10.2f} GFLOPS")
    print(f"  Average: {avg_gflops:10.2f} GFLOPS")
    print(f"  Std Dev: {std_gflops:10.2f} GFLOPS")

    # Print average time
    avg_time_ms = np.mean(times)
    print(f"\nAverage execution time: {avg_time_ms:.2f} ms")

    # Drop the device arrays and hand the pooled blocks back to the driver.
    del A, B, C
    cp.get_default_memory_pool().free_all_blocks()

    return {
        'min_gflops': min_gflops,
        'max_gflops': max_gflops,
        'avg_gflops': avg_gflops,
        'std_gflops': std_gflops,
        'times_ms': times
    }

# Run the CuPy benchmark when this code is executed directly; the statistics
# dict returned by benchmark_matmul_gpu() is kept in `results`.
if __name__ == "__main__":
    results = benchmark_matmul_gpu()

CuPy GPU Configuration:
  Device: NVIDIA GeForce RTX 4060
  Memory: 8.2 GB
  Matrix Size: 5120x5120

Initializing matrices...
Performing warmup runs...

Performing timing runs...
Run  1 - Time:  1304.91 ms, Performance:   205.71 GFLOPS
Run  2 - Time:  1311.88 ms, Performance:   204.62 GFLOPS
Run  3 - Time:  1297.96 ms, Performance:   206.81 GFLOPS
Run  4 - Time:  1305.07 ms, Performance:   205.69 GFLOPS
Run  5 - Time:  1290.61 ms, Performance:   207.99 GFLOPS
Run  6 - Time:  1282.92 ms, Performance:   209.24 GFLOPS
Run  7 - Time:  1277.91 ms, Performance:   210.06 GFLOPS
Run  8 - Time:  1281.27 ms, Performance:   209.51 GFLOPS
Run  9 - Time:  1230.59 ms, Performance:   218.13 GFLOPS
Run 10 - Time:  1258.79 ms, Performance:   213.25 GFLOPS

Performance Results (10 runs):
  Minimum:     204.62 GFLOPS
  Maximum:     218.13 GFLOPS
  Average:     209.10 GFLOPS
  Std Dev:       3.87 GFLOPS

Average execution time: 1284.19 ms


In [7]:
import torch
import numpy as np
import time

def calculate_gflops(M, N, K, time_seconds):
    """Return GFLOPS for an (M x K) @ (K x N) product that took `time_seconds`.

    Uses 2 * M * N * K as the floating point operation count.
    """
    m, n, k = float(M), float(N), float(K)
    billions_of_ops = (2.0 * m * n * k) / 1e9
    return billions_of_ops / time_seconds

def benchmark_matmul_gpu(M=5120, N=5120, K=5120, num_runs=10, num_warmup=5):
    """Benchmark float64 matrix multiplication with PyTorch.

    Runs on the CUDA device when one is available and falls back to the CPU
    otherwise. Operands are created directly on the chosen device, multiplied
    ``num_warmup`` times untimed and then ``num_runs`` times under
    ``time.perf_counter``.

    Args:
        M, N, K: problem size; the product is (M x K) @ (K x N).
        num_runs: number of timed multiplications (must be >= 1).
        num_warmup: number of untimed multiplications performed first.

    Returns:
        dict with 'min_gflops', 'max_gflops', 'avg_gflops', 'std_gflops'
        and 'times_ms' (list of per-run times in milliseconds).

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    use_cuda = torch.cuda.is_available()
    if not use_cuda:
        print("CUDA is not available. Running on CPU instead.")
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")
        print("CUDA is available. Running on GPU.")

    def wait_for_device():
        # CUDA work is queued asynchronously, so the host must wait for it
        # before reading the clock. On the CPU fallback there is nothing to
        # wait for, and calling torch.cuda.synchronize() without a usable
        # CUDA device raises, so the call is skipped.
        if use_cuda:
            torch.cuda.synchronize()

    # Keep the original "NxN" wording for the square case; spell out both
    # operand shapes otherwise.
    if M == N == K:
        size_text = f"{M}x{M}"
    else:
        size_text = f"({M}x{K}) @ ({K}x{N})"

    print("\nPyTorch Configuration:")
    print(f"  PyTorch Version: {torch.__version__}")
    if use_cuda:
        print(f"  Device: {torch.cuda.get_device_name(0)}")
        print(f"  CUDA Version: {torch.version.cuda}")
    print(f"  Matrix Size: {size_text}")
    print()

    # Operands are allocated directly on the target device (no host copy).
    print("Initializing matrices...")
    A = torch.randn(M, K, dtype=torch.float64, device=device)
    B = torch.randn(K, N, dtype=torch.float64, device=device)

    # Warmup runs
    print("Performing warmup runs...")
    for _ in range(num_warmup):
        with torch.no_grad():
            C = torch.matmul(A, B)
        wait_for_device()

    # Performance measurement runs
    print("\nPerforming timing runs...")
    times = []
    gflops_values = []

    for run in range(num_runs):
        # Drain any pending work so it is not billed to this run.
        wait_for_device()
        start_time = time.perf_counter()

        with torch.no_grad():
            C = torch.matmul(A, B)

        wait_for_device()
        end_time = time.perf_counter()

        elapsed_time_s = end_time - start_time
        elapsed_time_ms = elapsed_time_s * 1000
        gflops = calculate_gflops(M, N, K, elapsed_time_s)

        times.append(elapsed_time_ms)
        gflops_values.append(gflops)

        print(f"Run {run + 1:2d} - Time: {elapsed_time_ms:8.2f} ms, Performance: {gflops:8.2f} GFLOPS")

    # Calculate statistics
    min_gflops = np.min(gflops_values)
    max_gflops = np.max(gflops_values)
    avg_gflops = np.mean(gflops_values)
    std_gflops = np.std(gflops_values)

    # Print final results
    print("\nPerformance Results ({:d} runs):".format(num_runs))
    print(f"  Minimum: {min_gflops:10.2f} GFLOPS")
    print(f"  Maximum: {max_gflops:10.2f} GFLOPS")
    print(f"  Average: {avg_gflops:10.2f} GFLOPS")
    print(f"  Std Dev: {std_gflops:10.2f} GFLOPS")

    # Print average time
    avg_time_ms = np.mean(times)
    print(f"\nAverage execution time: {avg_time_ms:.2f} ms")

    # Memory cleanup
    del A, B, C
    if use_cuda:
        torch.cuda.empty_cache()

    return {
        'min_gflops': min_gflops,
        'max_gflops': max_gflops,
        'avg_gflops': avg_gflops,
        'std_gflops': std_gflops,
        'times_ms': times
    }

# Run the PyTorch benchmark when this code is executed directly; the
# statistics dict returned by benchmark_matmul_gpu() is kept in `results`.
if __name__ == "__main__":
    results = benchmark_matmul_gpu()

CUDA is available. Running on GPU.

PyTorch Configuration:
  PyTorch Version: 2.5.1+cu124
  Device: NVIDIA GeForce RTX 4060
  CUDA Version: 12.4
  Matrix Size: 5120x5120

Initializing matrices...
Performing warmup runs...

Performing timing runs...
Run  1 - Time:  1295.15 ms, Performance:   207.26 GFLOPS
Run  2 - Time:  1312.47 ms, Performance:   204.53 GFLOPS
Run  3 - Time:  1309.67 ms, Performance:   204.96 GFLOPS
Run  4 - Time:  1306.22 ms, Performance:   205.50 GFLOPS
Run  5 - Time:  1297.24 ms, Performance:   206.93 GFLOPS
Run  6 - Time:  1294.98 ms, Performance:   207.29 GFLOPS
Run  7 - Time:  1297.62 ms, Performance:   206.87 GFLOPS
Run  8 - Time:  1289.60 ms, Performance:   208.15 GFLOPS
Run  9 - Time:  1291.38 ms, Performance:   207.87 GFLOPS
Run 10 - Time:  1260.45 ms, Performance:   212.97 GFLOPS

Performance Results (10 runs):
  Minimum:     204.53 GFLOPS
  Maximum:     212.97 GFLOPS
  Average:     207.23 GFLOPS
  Std Dev:       2.23 GFLOPS

Average execution time: 1295.48 ms

In [8]:
from sklearn.utils.extmath import safe_sparse_dot
import numpy as np
import time

def calculate_gflops_sklearn(M, N, K, time_seconds):
    """Return GFLOPS (billions of floating point operations per second).

    An (M x K) @ (K x N) product is counted as 2 * M * N * K operations
    (multiply-add), the same formula as the other versions.
    """
    multiply_add_ops = 2.0 * float(M) * float(N) * float(K)
    return (multiply_add_ops / 1e9) / time_seconds

def benchmark_matmul_sklearn(M=5120, N=5120, K=5120, num_runs=10, num_warmup=5):
    """Time scikit-learn's ``safe_sparse_dot`` on random float64 matrices.

    Args:
        M, N, K: problem size; the product is (M x K) @ (K x N).
        num_runs: number of timed multiplications.
        num_warmup: number of untimed multiplications performed first.

    Returns:
        dict with 'min_gflops', 'max_gflops', 'avg_gflops', 'std_gflops'
        and 'times_ms' (NumPy array of per-run times in milliseconds).
    """
    # Random dense operands, generated the same way as in the other versions.
    left = np.random.random((M, K)).astype(np.float64)
    right = np.random.random((K, N)).astype(np.float64)

    print("Performing warmup runs...")
    for _ in range(num_warmup):
        result = safe_sparse_dot(left, right)

    print("Performing timing runs...")
    seconds_per_run = []
    gflops_per_run = []
    for run_number in range(1, num_runs + 1):
        # Only the product itself sits between the two clock reads.
        began = time.perf_counter()
        result = safe_sparse_dot(left, right)
        finished = time.perf_counter()

        seconds = finished - began
        throughput = calculate_gflops_sklearn(M, N, K, seconds)
        seconds_per_run.append(seconds)
        gflops_per_run.append(throughput)

        print(f"Run {run_number:2d} - Time: {seconds*1000:8.2f} ms, Performance: {throughput:8.2f} GFLOPS")

    # Summary statistics over the timed runs.
    stats = {
        'min_gflops': min(gflops_per_run),
        'max_gflops': max(gflops_per_run),
        'avg_gflops': sum(gflops_per_run) / len(gflops_per_run),
        'std_gflops': np.std(gflops_per_run),
    }

    # Same report layout as the other versions.
    print("\nPerformance Results ({:d} runs):".format(num_runs))
    print(f"  Minimum: {stats['min_gflops']:10.2f} GFLOPS")
    print(f"  Maximum: {stats['max_gflops']:10.2f} GFLOPS")
    print(f"  Average: {stats['avg_gflops']:10.2f} GFLOPS")
    print(f"  Std Dev: {stats['std_gflops']:10.2f} GFLOPS")

    mean_ms = np.mean(seconds_per_run) * 1000
    print(f"\nAverage execution time: {mean_ms:.2f} ms")

    stats['times_ms'] = np.array(seconds_per_run) * 1000
    return stats

# Print scikit-learn configuration
# (the top-level package is imported here only to report its version; the
# benchmark itself uses safe_sparse_dot imported above)
import sklearn
print("scikit-learn Configuration:")
print(f"  Version: {sklearn.__version__}")
print()

# Run benchmark with the default arguments (5120 x 5120 x 5120, 10 timed
# runs, 5 warmup runs) and keep the returned statistics dict.
results_sklearn = benchmark_matmul_sklearn()


scikit-learn Configuration:
  Version: 1.5.2

Performing warmup runs...
Performing timing runs...
Run  1 - Time:  1134.61 ms, Performance:   236.59 GFLOPS
Run  2 - Time:  1195.37 ms, Performance:   224.56 GFLOPS
Run  3 - Time:  1251.86 ms, Performance:   214.43 GFLOPS
Run  4 - Time:  1079.56 ms, Performance:   248.65 GFLOPS
Run  5 - Time:  1134.50 ms, Performance:   236.61 GFLOPS
Run  6 - Time:  1094.22 ms, Performance:   245.32 GFLOPS
Run  7 - Time:  1174.98 ms, Performance:   228.46 GFLOPS
Run  8 - Time:  1225.04 ms, Performance:   219.12 GFLOPS
Run  9 - Time:  1082.35 ms, Performance:   248.01 GFLOPS
Run 10 - Time:  1100.02 ms, Performance:   244.03 GFLOPS

Performance Results (10 runs):
  Minimum:     214.43 GFLOPS
  Maximum:     248.65 GFLOPS
  Average:     234.58 GFLOPS
  Std Dev:      11.72 GFLOPS

Average execution time: 1147.25 ms
