In [2]:
# test of the cupy kernel to compare with the numpy kernel (as anticipated performance boost in matrix_dot did not materialise for rest of kernels)
# test all kernels:
# 1) matrix dot (matrix_dot)
# 2) tensor matrix multiply (matmul)
# 3) vector matrix dot (vector_matmul)
# 4) batched matrix multiply (batched_matrix_multiply)
# 5) batched vector matrix dot (batched_vector_matrix_dot)
# Jan 22, 2025: all kernels now work, all tests now work (with batched vector matrix 3D memory kludged for now by slicing into 2D)
from fortran_tensor_ops_cupy import FortranTensorOps

# Initialize tensor operations (cupy version all GPU)
tensor_cp = FortranTensorOps()

In [3]:
# matrix_dot fixed!
# now matrix dot is crashing
import time  
import cupy as cp  
import numpy as np
from fortran_tensor_ops_cupy import FortranTensorOps

# Initialize tensor operations (cupy version all GPU)
tensor_cp = FortranTensorOps()

# Set random seed for reproducibility
np.random.seed(42)

# Print CuPy configuration
print("CuPy configuration:")
print(cp.show_config())

# Create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(100, 100)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # Scale to prevent overflow

# Use the same input data for all three methods
a_cp = cp.array(a_np)  # Copy to GPU for CuPy
a_cuda = a_np.copy()   # Copy for CUDA Fortran

print("\nInitial matrix first few elements:")
print("NumPy:", a_np[0:2, 0:2])
print("CuPy:", cp.asnumpy(a_cp)[0:2, 0:2])
print("CUDA input:", a_cuda[0:2, 0:2])

# First multiplication test
print("\nAfter first multiplication:")
result_np = np.dot(a_np, a_np)
print("NumPy:", result_np[0:2, 0:2])

result_cp = cp.dot(a_cp, a_cp)
print("CuPy:", cp.asnumpy(result_cp)[0:2, 0:2])

result_cuda = tensor_cp.matrix_dot1(a_cuda, 1)
print("CUDA Fortran:", result_cuda[0:2, 0:2])

print("\nRunning full tests...")

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = tensor_cp.matrix_dot1(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nFinal results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

# Compare results at specific iterations
def run_specific_iterations(n_iters):
    result_np = a_np.copy()
    result_cp = a_cp.copy()
    result_cu = tensor_cp.matrix_dot1(a_cuda, n_iters)
    
    for i in range(n_iters):
        result_np = np.dot(result_np, a_np)
    
    for i in range(n_iters):
        result_cp = cp.dot(result_cp, a_cp)
    
    return result_np, cp.asnumpy(result_cp), result_cu

# Check accuracy at different iterations
for iters in [1, 10, 100]:
    print(f"\nResults after {iters} iterations:")
    res_np, res_cp, res_cu = run_specific_iterations(iters)
    print(f"NumPy[0,0]: {res_np[0,0]:.6e}")
    print(f"CuPy[0,0]:  {res_cp[0,0]:.6e}")
    print(f"CUDA[0,0]:  {res_cu[0,0]:.6e}")
    print(f"Ratios:")
    print(f"CUDA/NumPy: {res_cu[0,0]/res_np[0,0]:.6f}")
    print(f"CUDA/CuPy:  {res_cu[0,0]/res_cp[0,0]:.6f}")


CuPy configuration:
OS                           : Linux-6.12.9-200.fc41.x86_64-x86_64-with-glibc2.40
Python Version               : 3.12.7
CuPy Version                 : 13.3.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.0.2
SciPy Version                : 1.14.1
Cython Build Version         : 0.29.36
Cython Runtime Version       : None
CUDA Root                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda
nvcc PATH                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda/bin/nvcc
CUDA Build Version           : 12060
CUDA Driver Version          : 12070
CUDA Runtime Version         : 12060 (linked to CuPy) / 12060 (locally installed)
CUDA Extra Include Dirs      : []
cuBLAS Version               : (available)
cuFFT Version                : 11300
cuRAND Version               : 10307
cuSOLVER Version             : (11, 7, 1)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 6)
Thrust Version 

In [4]:
# matrix_dot
import time  
import cupy as cp  
import numpy as np
from numpy.ctypeslib import ndpointer
import ctypes
from fortran_tensor_ops_cupy import FortranTensorOps

# Initialize tensor operations (cupy version all GPU)
tensor_cp = FortranTensorOps()

# create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(1000, 1000)
# Modified scaling
a_np = np.random.rand(1000, 1000)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # More gradual scaling

a_cp = cp.array(a_np)  # same scaled data for CuPy
a_cuda = a_np.copy()   # same scaled data for CUDA Fortran

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = tensor_cp.matrix_dot(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nChecking results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

print("\nChecking for infinities or NaNs:")
print(f"NumPy contains inf: {np.isinf(result_numpy).any()}")
print(f"NumPy contains NaN: {np.isnan(result_numpy).any()}")
print(f"CuPy contains inf: {np.isinf(result_cupy).any()}")
print(f"CuPy contains NaN: {np.isnan(result_cupy).any()}")
print(f"CUDA contains inf: {np.isinf(result_cuda).any()}")
print(f"CUDA contains NaN: {np.isnan(result_cuda).any()}")

if not (np.isinf(result_numpy).any() or np.isinf(result_cupy).any() or np.isinf(result_cuda).any()):
    print("\nAccuracy comparison (maximum absolute differences):")
    print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
    print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")
    print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Time taken by NumPy: 10.753 seconds
Time taken by CuPy: 20.265 seconds
 CUDA resources finalized successfully
 CUDA resources initialized successfully
Time taken by CUDA Fortran: 0.437 seconds

Checking results (first few elements):
NumPy result:
[[2.33706779e-66 2.42868700e-66]
 [2.33979801e-66 2.43152426e-66]]

CuPy result:
[[2.33706779e-66 2.42868700e-66]
 [2.33979801e-66 2.43152426e-66]]

CUDA Fortran result:
[[1.03530304e-07 1.03530304e-07]
 [1.03614148e-07 1.03614148e-07]]

Checking for infinities or NaNs:
NumPy contains inf: False
NumPy contains NaN: False
CuPy contains inf: False
CuPy contains NaN: False
CUDA contains inf: False
CUDA contains NaN: False

Accuracy comparison (maximum absolute differences):
NumPy vs CuPy:         7.38e-81
NumPy vs CUDA Fortran: 1.09e-07
CuPy vs CUDA Fortran:  1.09e-07

Speedup ratios:
CUDA Fortran vs NumPy: 24.6x faster
CUDA Fortran vs CuPy:  46.4x faster
CuPy vs NumPy:         0.5x faster


In [5]:
# matmul (seems to require device reset)
import time
import numpy as np
import cupy as cp
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

# Parameters
m, k, n = 1000, 1000, 1000
iterations = 10

# Create data
a_np = np.random.rand(m, k).astype(np.float64)
b_np = np.random.rand(k, n).astype(np.float64)
a_cp = cp.array(a_np)
b_cp = cp.array(b_np)

print(f"Type of a_cp after creation: {type(a_cp)}")
print(f"Type of b_cp after creation: {type(b_cp)}")

# NumPy test
start = time.time()
for i in range(iterations):
    result_numpy = np.dot(a_np, b_np)
numpy_time = time.time() - start
print(f"NumPy time: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
for i in range(iterations):
    result_cupy = cp.dot(a_cp, b_cp)
cp.cuda.Stream.null.synchronize()
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"CuPy time: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
for i in range(iterations):
    result_cuda = tensor_cp.matmul(a_cp, b_cp)  # Using CuPy arrays
result_cuda = cp.asnumpy(result_cuda)  # Convert to NumPy for comparison
cuda_time = time.time() - start
print(f"CUDA Fortran time: {cuda_time:.3f} seconds")

print("\nResults comparison (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("CuPy result:")
print(result_cupy[0:2, 0:2])
print("CUDA Fortran result:")
print(result_cuda[0:2, 0:2])

print("\nAccuracy comparison (maximum absolute differences):")
print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")
print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Type of a_cp after creation: <class 'cupy.ndarray'>
Type of b_cp after creation: <class 'cupy.ndarray'>
NumPy time: 0.134 seconds
CuPy time: 0.209 seconds
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
CUDA Fortran time: 0.463 seconds

Results com

In [6]:
# matmul
import numpy as np
import cupy as cp
import time
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

def test_matmul(m=1000, k=2000, n=1000, num_runs=10):
    # Setup matrix sizes and runs
    print(f"Matrix multiplication test: {m}x{k} @ {k}x{n}")
    
    # Allocate and initialize large matrices
    a_np = np.random.rand(m, k).astype(np.float64)
    b_np = np.random.rand(k, n).astype(np.float64)
    
    # Convert to CuPy for CuPy test
    a_cp = cp.array(a_np)
    b_cp = cp.array(b_np)
    
    # Timing and performance tracking
    numpy_times = []
    cupy_times = []
    cuda_times = []
    
    # Multiple runs for more reliable timing
    for run in range(num_runs):
        # NumPy test
        start = time.time()
        result_numpy = a_np @ b_np
        numpy_times.append(time.time() - start)
        
        # CuPy test
        start = time.time()
        result_cupy = a_cp @ b_cp
        cupy_times.append(time.time() - start)
        
        # CUDA test
        start = time.time()
        result_cuda = tensor_cp.matmul(a_np, b_np)
        cuda_times.append(time.time() - start)
    
    # Convert results to CuPy for consistent comparison
    result_numpy_cp = cp.array(result_numpy)
    result_cupy_cp = result_cupy
    
    # Convert CUDA result to CuPy
    if hasattr(result_cuda, 'to_numpy'):
        result_cuda_cp = cp.array(result_cuda.to_numpy())
    else:
        result_cuda_cp = cp.array(result_cuda)
    
    # Detailed results and statistics
    print("\nResults Comparison:")
    print("First 2x2 elements:")
    print("NumPy:", result_numpy[:2,:2])
    print("CuPy: ", cp.asnumpy(result_cupy_cp)[:2,:2])
    print("CUDA: ", cp.asnumpy(result_cuda_cp)[:2,:2])
    
    print("\nAbsolute Differences:")
    print("NumPy vs CuPy max diff:", cp.max(cp.abs(result_numpy_cp - result_cupy_cp)))
    print("NumPy vs CUDA max diff:", cp.max(cp.abs(result_numpy_cp - result_cuda_cp)))
    print("CuPy vs CUDA max diff:", cp.max(cp.abs(result_cupy_cp - result_cuda_cp)))
    
    print("\nPerformance:")
    print(f"NumPy: {np.mean(numpy_times):.5f} ± {np.std(numpy_times):.5f} seconds")
    print(f"CuPy:  {np.mean(cupy_times):.5f} ± {np.std(cupy_times):.5f} seconds")
    print(f"CUDA:  {np.mean(cuda_times):.5f} ± {np.std(cuda_times):.5f} seconds")
    
    print("\nSpeedups:")
    print(f"CUDA vs NumPy: {np.mean(numpy_times)/np.mean(cuda_times):.1f}x")
    print(f"CuPy vs NumPy: {np.mean(numpy_times)/np.mean(cupy_times):.1f}x")
    print(f"CUDA vs CuPy:  {np.mean(cupy_times)/np.mean(cuda_times):.1f}x")

# Run the test
if __name__ == "__main__":
    test_matmul()

Matrix multiplication test: 1000x2000 @ 2000x1000
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully

Results Comparison:
First 2x2 elements:
NumPy: [[511.87579864 505.82449041]
 [508.69766396 505.98684364]]
CuPy:  [[511.87579864 505.82449041]
 [508.6

In [7]:
# matmul 
import numpy as np
import cupy as cp
import time
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

def ensure_cuda_ready():
    """
    Ensure CUDA device is properly initialized.
    """
    import cupy as cp
    
    try:
        # Perform a small test computation to ensure device is fully initialized
        test_array = cp.zeros((10, 10), dtype=cp.float64)
        test_array2 = cp.zeros((10, 10), dtype=cp.float64)
        
        # Do a small matrix multiplication
        _ = test_array @ test_array2
        
        print("CUDA device is ready and initialized")
    except Exception as e:
        print("Error initializing CUDA:", e)
        raise


def test_matmul(m=100, k=200, n=100, num_runs=10):
    print(f"Matrix multiplication test: {m}x{k} @ {k}x{n}")
    
    ensure_cuda_ready()
    # Allocate and initialize large matrices with controlled values
    np.random.seed(42)  # Ensure reproducibility
    a_np = np.random.rand(m, k).astype(np.float64) * 10
    b_np = np.random.rand(k, n).astype(np.float64) * 10
    
    # Convert to CuPy for all computations
    a_cp = cp.array(a_np)
    b_cp = cp.array(b_np)
    
    # Timing and performance tracking
    numpy_times = []
    cupy_times = []
    cuda_times = []
    
    # Compute reference results
    result_numpy = a_np @ b_np
    result_cupy = a_cp @ b_cp
    
    # CUDA test with detailed error checking
    try:
        result_cuda = tensor_cp.matmul(a_np, b_np)
        
        # Convert CUDA result to CuPy
        if hasattr(result_cuda, 'to_numpy'):
            result_cuda_cp = cp.array(result_cuda.to_numpy())
        else:
            result_cuda_cp = cp.array(result_cuda)
        
        # Performance measurement
        for _ in range(num_runs):
            # NumPy timing
            start = time.time()
            _ = a_np @ b_np
            numpy_times.append(time.time() - start)
            
            # CuPy timing
            start = time.time()
            _ = a_cp @ b_cp
            cp.cuda.stream.get_current_stream().synchronize()
            cupy_times.append(time.time() - start)
            
            # CUDA timing
            start = time.time()
            _ = tensor_cp.matmul(a_np, b_np)
            cuda_times.append(time.time() - start)
        
        print("\nDetailed Diagnostic Information:")
        print("Input A:")
        print("  Shape:", a_cp.shape)
        print("  Mean:", cp.mean(a_cp))
        print("  Min:", cp.min(a_cp))
        print("  Max:", cp.max(a_cp))
        
        print("\nInput B:")
        print("  Shape:", b_cp.shape)
        print("  Mean:", cp.mean(b_cp))
        print("  Min:", cp.min(b_cp))
        print("  Max:", cp.max(b_cp))
        
        print("\nResults Comparison:")
        print("First 5x5 elements:")
        print("NumPy:\n", result_numpy[:5,:5])
        print("\nCuPy:\n", cp.asnumpy(result_cupy)[:5,:5])
        print("\nCUDA:\n", cp.asnumpy(result_cuda_cp)[:5,:5])
        
        print("\nAbsolute Differences:")
        diff_numpy_cupy = cp.abs(cp.array(result_numpy) - result_cupy)
        diff_numpy_cuda = cp.abs(cp.array(result_numpy) - result_cuda_cp)
        diff_cupy_cuda = cp.abs(result_cupy - result_cuda_cp)
        
        print("NumPy vs CuPy max diff:", cp.max(diff_numpy_cupy))
        print("NumPy vs CUDA max diff:", cp.max(diff_numpy_cuda))
        print("CuPy vs CUDA max diff:", cp.max(diff_cupy_cuda))
        
        print("\nRelative Differences:")
        # Avoid division by zero
        rel_diff_numpy_cuda = cp.abs(cp.array(result_numpy) - result_cuda_cp) / (cp.abs(cp.array(result_numpy)) + 1e-10)
        print("Max relative difference:", cp.max(rel_diff_numpy_cuda))
        print("Mean relative difference:", cp.mean(rel_diff_numpy_cuda))
        
        print("\nCUDA Result Statistics:")
        print("  Mean:", cp.mean(result_cuda_cp))
        print("  Std:", cp.std(result_cuda_cp))
        print("  Min:", cp.min(result_cuda_cp))
        print("  Max:", cp.max(result_cuda_cp))
        
        # Performance summary
        print("\nPerformance Summary:")
        print(f"NumPy: {np.mean(numpy_times):.5f} ± {np.std(numpy_times):.5f} seconds")
        print(f"CuPy:  {np.mean(cupy_times):.5f} ± {np.std(cupy_times):.5f} seconds")
        print(f"CUDA:  {np.mean(cuda_times):.5f} ± {np.std(cuda_times):.5f} seconds")
        
        print("\nSpeedups:")
        print(f"CuPy speedup vs NumPy: {np.mean(numpy_times)/np.mean(cupy_times):.1f}x")
        print(f"CUDA speedup vs NumPy: {np.mean(numpy_times)/np.mean(cuda_times):.1f}x")
        print(f"CUDA vs CuPy: {np.mean(cupy_times)/np.mean(cuda_times):.1f}x")
        
    except Exception as e:
        print("Error in CUDA computation:", e)
        import traceback
        traceback.print_exc()

# Run the test
if __name__ == "__main__":
    test_matmul()

Matrix multiplication test: 100x200 @ 200x100
CUDA device is ready and initialized
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully

Detailed Diagnostic Information:
Inp

In [8]:
# vector_matmul test
import time
import numpy as np
import cupy as cp
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

# Parameters
vec_len, mat_cols = 1000, 1000
iterations = 10

# Create data
vec_np = np.random.rand(vec_len).astype(np.float64)
mat_np = np.random.rand(vec_len, mat_cols).astype(np.float64)
vec_cp = cp.array(vec_np)
mat_cp = cp.array(mat_np)

print(f"Vector shape: {vec_cp.shape}")
print(f"Matrix shape: {mat_cp.shape}")

# NumPy test
start = time.time()
for i in range(iterations):
    result_numpy = np.dot(vec_np, mat_np)
numpy_time = time.time() - start
print(f"NumPy time: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
for i in range(iterations):
    result_cupy = cp.dot(vec_cp, mat_cp)
cp.cuda.Stream.null.synchronize()
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"CuPy time: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
for i in range(iterations):
    result_cuda = tensor_cp.vector_matmul(vec_cp, mat_cp)
result_cuda = cp.asnumpy(result_cuda)
cuda_time = time.time() - start
print(f"CUDA Fortran time: {cuda_time:.3f} seconds")

print("\nResults comparison (first few elements):")
print("NumPy result:")
print(result_numpy[:5])
print("CuPy result:")
print(result_cupy[:5])
print("CUDA Fortran result:")
print(result_cuda[:5])

print("\nAccuracy comparison (maximum absolute differences):")
print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")
print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Vector shape: (1000,)
Matrix shape: (1000, 1000)
NumPy time: 0.002 seconds
CuPy time: 0.003 seconds
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources fina

In [9]:
# batched_matrix_multiply test
import time
import numpy as np
import cupy as cp
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

# Parameters
m, k, n = 100, 100, 100  # Smaller matrices due to batching
batch_size = 32
iterations = 10

# Create data
a_np = np.random.rand(m, k, batch_size).astype(np.float64)
b_np = np.random.rand(k, n).astype(np.float64)
a_cp = cp.array(a_np)
b_cp = cp.array(b_np)

print(f"Batch matrix A shape: {a_cp.shape}")
print(f"Matrix B shape: {b_cp.shape}")

# NumPy test - using loop for batched multiplication
start = time.time()
result_numpy = np.zeros((m, n, batch_size))
for i in range(iterations):
    for j in range(batch_size):
        result_numpy[:,:,j] = np.dot(a_np[:,:,j], b_np)
numpy_time = time.time() - start
print(f"NumPy time: {numpy_time:.3f} seconds")

# CuPy test - using loop for batched multiplication
start = time.time()
result_cupy = cp.zeros((m, n, batch_size))
for i in range(iterations):
    for j in range(batch_size):
        result_cupy[:,:,j] = cp.dot(a_cp[:,:,j], b_cp)
cp.cuda.Stream.null.synchronize()
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"CuPy time: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
for i in range(iterations):
    result_cuda = tensor_cp.batched_matrix_multiply(a_cp, b_cp)
result_cuda = cp.asnumpy(result_cuda)
cuda_time = time.time() - start
print(f"CUDA Fortran time: {cuda_time:.3f} seconds")

print("\nResults comparison (first few elements of first batch):")
print("NumPy result:")
print(result_numpy[0:2, 0:2, 0])
print("CuPy result:")
print(result_cupy[0:2, 0:2, 0])
print("CUDA Fortran result:")
print(result_cuda[0:2, 0:2, 0])

print("\nAccuracy comparison (maximum absolute differences):")
print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")
print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Batch matrix A shape: (100, 100, 32)
Matrix B shape: (100, 100)
NumPy time: 0.072 seconds
CuPy time: 0.018 seconds
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA

In [10]:
# batched_vector_matmul test
import time
import numpy as np
import cupy as cp
from fortran_tensor_ops_cupy import FortranTensorOps

tensor_cp = FortranTensorOps()

# Parameters
vec_len, mat_cols = 1000, 1000
batch_size = 64
iterations = 1000

# Create data
vectors_np = np.random.rand(vec_len, batch_size).astype(np.float64)
matrix_np = np.random.rand(vec_len, mat_cols).astype(np.float64)
vectors_cp = cp.array(vectors_np)
matrix_cp = cp.array(matrix_np)

print(f"Batched vectors shape: {vectors_cp.shape}")
print(f"Matrix shape: {matrix_cp.shape}")

# NumPy test - using loop for batched multiplication
start = time.time()
result_numpy = np.zeros((batch_size, mat_cols))
for i in range(iterations):
    for j in range(batch_size):
        result_numpy[j,:] = np.dot(vectors_np[:,j], matrix_np)
numpy_time = time.time() - start
print(f"NumPy time: {numpy_time:.3f} seconds")

# CuPy test - using loop for batched multiplication
start = time.time()
result_cupy = cp.zeros((batch_size, mat_cols))
for i in range(iterations):
    for j in range(batch_size):
        result_cupy[j,:] = cp.dot(vectors_cp[:,j], matrix_cp)
cp.cuda.Stream.null.synchronize()
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"CuPy time: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
for i in range(iterations):
    result_cuda = tensor_cp.batched_vector_matmul(vectors_cp, matrix_cp)
result_cuda = cp.asnumpy(result_cuda)
cuda_time = time.time() - start
print(f"CUDA Fortran time: {cuda_time:.3f} seconds")

print("\nResults comparison (first few elements of first batch):")
print("NumPy result:")
print(result_numpy[0,:5])
print("CuPy result:")
print(result_cupy[0,:5])
print("CUDA Fortran result:")
print(result_cuda[0,:5])

print("\nAccuracy comparison (maximum absolute differences):")
print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")
print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Batched vectors shape: (1000, 64)
Matrix shape: (1000, 1000)


NumPy time: 7.868 seconds
CuPy time: 3.232 seconds
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized successfully
 CUDA resources finalized successfully
 CUDA resources initialized su

In [11]:
# Jan 22, 2025: 1/5 tests pass; matrix_dot fixed by using numpy; now 2/5 tests pass; oops now batched vector matmul test crashes
# used cuda-dba ./test_matmul3.cuf to debug crashes; found cleanup routines were trying to kill non-existent streams
# once fixed the program ran without crashes and matmul, batched vector matrix now work without crashing!
# only vector_matmul and batched_matrix_multiply are crashing