In [1]:
# original test from cupy
import time  
import cupy as cp  
import numpy as np

# create a random numpy array of size 1000x1000  
a = np.random.rand(1000, 1000)

# create a random cupy array of size 1000x1000  
b = cp.random.rand(1000, 1000)

# start timer  
start = time.time()

# perform some random operations on numpy array  
for i in range(1000):  
    a = np.dot(a, a)

# print time taken  
print("Time taken by numpy: ", time.time() - start)

# start timer  
start = time.time()

# perform some random operations on cupy array  
for i in range(1000):  
    b = cp.dot(b, b)

# print time taken  
print("Time taken by cupy: ", time.time() - start)

Time taken by numpy:  11.670009851455688
Time taken by cupy:  9.9381263256073


In [2]:
print(b.device)

<CUDA Device 0>


CUDA Fortran kernel using cuBLAS tested for same accuracy as cupy

In [3]:
# initialise fortran library
import numpy as np
from numpy.ctypeslib import ndpointer
import ctypes
import time

# Load the shared library
lib = ctypes.CDLL('./libcudamatmul_double.so')

# Define argument types
lib.py_matrix_dot.argtypes = [
    ndpointer(dtype=np.float64, flags='C_CONTIGUOUS'),
    ndpointer(dtype=np.float64, flags='C_CONTIGUOUS'),
    ctypes.c_int,
    ctypes.c_int
]

def cuda_dot_double(a, iterations=1000):
    """Fast matrix power using CUDA Fortran with binary exponentiation"""
    if not isinstance(a, np.ndarray):
        raise TypeError("Input must be a numpy array")
    
    if a.dtype != np.float64:
        a = np.asarray(a, dtype=np.float64)
    
    if a.shape[0] != a.shape[1]:
        raise ValueError("Matrix must be square")
    
    if not a.flags['C_CONTIGUOUS']:
        a = np.ascontiguousarray(a)
    
    n = a.shape[0]
    result = np.empty_like(a)
    
    lib.py_matrix_dot(a, result, n, iterations)
    return result

In [4]:
import time  
import cupy as cp  
import numpy as np

# create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(1000, 1000)
# Modified scaling
a_np = np.random.rand(1000, 1000)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # More gradual scaling

a_cp = cp.array(a_np)  # same scaled data for CuPy
a_cuda = a_np.copy()   # same scaled data for CUDA Fortran

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = cuda_dot_double(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nChecking results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

print("\nChecking for infinities or NaNs:")
print(f"NumPy contains inf: {np.isinf(result_numpy).any()}")
print(f"NumPy contains NaN: {np.isnan(result_numpy).any()}")
print(f"CuPy contains inf: {np.isinf(result_cupy).any()}")
print(f"CuPy contains NaN: {np.isnan(result_cupy).any()}")
print(f"CUDA contains inf: {np.isinf(result_cuda).any()}")
print(f"CUDA contains NaN: {np.isnan(result_cuda).any()}")

if not (np.isinf(result_numpy).any() or np.isinf(result_cupy).any() or np.isinf(result_cuda).any()):
    print("\nAccuracy comparison (maximum absolute differences):")
    print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
    print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")
    print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Time taken by NumPy: 12.796 seconds
Time taken by CuPy: 20.359 seconds
Time taken by CUDA Fortran: 20.328 seconds

Checking results (first few elements):
NumPy result:
[[3.08320013e-66 3.09295878e-66]
 [3.08024097e-66 3.08999025e-66]]

CuPy result:
[[3.08320013e-66 3.09295878e-66]
 [3.08024097e-66 3.08999025e-66]]

CUDA Fortran result:
[[3.08320013e-66 3.09295878e-66]
 [3.08024097e-66 3.08999025e-66]]

Checking for infinities or NaNs:
NumPy contains inf: False
NumPy contains NaN: False
CuPy contains inf: False
CuPy contains NaN: False
CUDA contains inf: False
CUDA contains NaN: False

Accuracy comparison (maximum absolute differences):
NumPy vs CuPy:         1.05e-80
NumPy vs CUDA Fortran: 1.21e-80
CuPy vs CUDA Fortran:  1.37e-80

Speedup ratios:
CUDA Fortran vs NumPy: 0.6x faster
CUDA Fortran vs CuPy:  1.0x faster
CuPy vs NumPy:         0.6x faster


In [6]:
import time  
import cupy as cp  
import numpy as np
#from numba import cuda

# Set random seed for reproducibility
np.random.seed(42)

# Print CuPy configuration
print("CuPy configuration:")
print(cp.show_config())

# Create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(1000, 1000)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # Scale to prevent overflow

# Use the same input data for all three methods
a_cp = cp.array(a_np)  # Copy to GPU for CuPy
a_cuda = a_np.copy()   # Copy for CUDA Fortran

print("\nInitial matrix first few elements:")
print("NumPy:", a_np[0:2, 0:2])
print("CuPy:", cp.asnumpy(a_cp)[0:2, 0:2])
print("CUDA input:", a_cuda[0:2, 0:2])

# First multiplication test
print("\nAfter first multiplication:")
result_np = np.dot(a_np, a_np)
print("NumPy:", result_np[0:2, 0:2])

result_cp = cp.dot(a_cp, a_cp)
print("CuPy:", cp.asnumpy(result_cp)[0:2, 0:2])

result_cuda = cuda_dot_double(a_cuda, 1)
print("CUDA Fortran:", result_cuda[0:2, 0:2])

print("\nRunning full tests...")

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = cuda_dot_double(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nFinal results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

# Compare results at specific iterations
def run_specific_iterations(n_iters):
    result_np = a_np.copy()
    result_cp = a_cp.copy()
    result_cu = cuda_dot_double(a_cuda, n_iters)
    
    for i in range(n_iters):
        result_np = np.dot(result_np, a_np)
    
    for i in range(n_iters):
        result_cp = cp.dot(result_cp, a_cp)
    
    return result_np, cp.asnumpy(result_cp), result_cu

# Check accuracy at different iterations
for iters in [1, 10, 100]:
    print(f"\nResults after {iters} iterations:")
    res_np, res_cp, res_cu = run_specific_iterations(iters)
    print(f"NumPy[0,0]: {res_np[0,0]:.6e}")
    print(f"CuPy[0,0]:  {res_cp[0,0]:.6e}")
    print(f"CUDA[0,0]:  {res_cu[0,0]:.6e}")
    print(f"Ratios:")
    print(f"CUDA/NumPy: {res_cu[0,0]/res_np[0,0]:.6f}")
    print(f"CUDA/CuPy:  {res_cu[0,0]/res_cp[0,0]:.6f}")


CuPy configuration:
OS                           : Linux-6.12.9-200.fc41.x86_64-x86_64-with-glibc2.40
Python Version               : 3.12.7
CuPy Version                 : 13.3.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.2.1
SciPy Version                : 1.14.1
Cython Build Version         : 0.29.36
Cython Runtime Version       : None
CUDA Root                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda
nvcc PATH                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda/bin/nvcc
CUDA Build Version           : 12060
CUDA Driver Version          : 12070
CUDA Runtime Version         : 12060 (linked to CuPy) / 12060 (locally installed)
CUDA Extra Include Dirs      : []
cuBLAS Version               : (available)
cuFFT Version                : 11300
cuRAND Version               : 10307
cuSOLVER Version             : (11, 7, 1)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 6)
Thrust Version 

In [7]:
# initialise fortran library for tensor core version
import numpy as np
from numpy.ctypeslib import ndpointer
import ctypes
import time

# Load the shared library
lib = ctypes.CDLL('./libcudamatmul_tensor.so')

# Define argument types
lib.py_matrix_dot.argtypes = [
    ndpointer(dtype=np.float64, flags='C_CONTIGUOUS'),
    ndpointer(dtype=np.float64, flags='C_CONTIGUOUS'),
    ctypes.c_int,
    ctypes.c_int
]

def cuda_dot_tensor(a, iterations=1000):
    """Fast matrix power using CUDA Fortran with binary exponentiation"""
    if not isinstance(a, np.ndarray):
        raise TypeError("Input must be a numpy array")
    
    if a.dtype != np.float64:
        a = np.asarray(a, dtype=np.float64)
    
    if a.shape[0] != a.shape[1]:
        raise ValueError("Matrix must be square")
    
    if not a.flags['C_CONTIGUOUS']:
        a = np.ascontiguousarray(a)
    
    n = a.shape[0]
    result = np.empty_like(a)
    
    lib.py_matrix_dot(a, result, n, iterations)
    return result

In [8]:
import time  
import cupy as cp  
import numpy as np

# create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(1000, 1000)
# Modified scaling
a_np = np.random.rand(1000, 1000)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # More gradual scaling

a_cp = cp.array(a_np)  # same scaled data for CuPy
a_cuda = a_np.copy()   # same scaled data for CUDA Fortran

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = cuda_dot_tensor(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nChecking results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

print("\nChecking for infinities or NaNs:")
print(f"NumPy contains inf: {np.isinf(result_numpy).any()}")
print(f"NumPy contains NaN: {np.isnan(result_numpy).any()}")
print(f"CuPy contains inf: {np.isinf(result_cupy).any()}")
print(f"CuPy contains NaN: {np.isnan(result_cupy).any()}")
print(f"CUDA contains inf: {np.isinf(result_cuda).any()}")
print(f"CUDA contains NaN: {np.isnan(result_cuda).any()}")

if not (np.isinf(result_numpy).any() or np.isinf(result_cupy).any() or np.isinf(result_cuda).any()):
    print("\nAccuracy comparison (maximum absolute differences):")
    print(f"NumPy vs CuPy:         {np.max(np.abs(result_numpy - result_cupy)):.2e}")
    print(f"NumPy vs CUDA Fortran: {np.max(np.abs(result_numpy - result_cuda)):.2e}")
    print(f"CuPy vs CUDA Fortran:  {np.max(np.abs(result_cupy - result_cuda)):.2e}")

print("\nSpeedup ratios:")
print(f"CUDA Fortran vs NumPy: {numpy_time/cuda_time:.1f}x faster")
print(f"CUDA Fortran vs CuPy:  {cupy_time/cuda_time:.1f}x faster")
print(f"CuPy vs NumPy:         {numpy_time/cupy_time:.1f}x faster")

Time taken by NumPy: 8.676 seconds
Time taken by CuPy: 20.319 seconds
Time taken by CUDA Fortran: 0.509 seconds

Checking results (first few elements):
NumPy result:
[[2.60460916e-66 2.62017611e-66]
 [2.59865694e-66 2.61418831e-66]]

CuPy result:
[[2.60460916e-66 2.62017611e-66]
 [2.59865694e-66 2.61418831e-66]]

CUDA Fortran result:
[[1.02058017e-07 1.02058017e-07]
 [1.01911084e-07 1.01911084e-07]]

Checking for infinities or NaNs:
NumPy contains inf: False
NumPy contains NaN: False
CuPy contains inf: False
CuPy contains NaN: False
CUDA contains inf: False
CUDA contains NaN: False

Accuracy comparison (maximum absolute differences):
NumPy vs CuPy:         9.49e-81
NumPy vs CUDA Fortran: 1.10e-07
CuPy vs CUDA Fortran:  1.10e-07

Speedup ratios:
CUDA Fortran vs NumPy: 17.1x faster
CUDA Fortran vs CuPy:  39.9x faster
CuPy vs NumPy:         0.4x faster


In [10]:
import time  
import cupy as cp  
import numpy as np
#from numba import cuda

# Set random seed for reproducibility
np.random.seed(42)

# Print CuPy configuration
print("CuPy configuration:")
print(cp.show_config())

# Create a random numpy array of size 1000x1000 and scale it
a_np = np.random.rand(1000, 1000)
scale_factor = np.linalg.norm(a_np)
a_np = a_np / (scale_factor * np.power(1.1, 1/1000))  # Scale to prevent overflow

# Use the same input data for all three methods
a_cp = cp.array(a_np)  # Copy to GPU for CuPy
a_cuda = a_np.copy()   # Copy for CUDA Fortran

print("\nInitial matrix first few elements:")
print("NumPy:", a_np[0:2, 0:2])
print("CuPy:", cp.asnumpy(a_cp)[0:2, 0:2])
print("CUDA input:", a_cuda[0:2, 0:2])

# First multiplication test
print("\nAfter first multiplication:")
result_np = np.dot(a_np, a_np)
print("NumPy:", result_np[0:2, 0:2])

result_cp = cp.dot(a_cp, a_cp)
print("CuPy:", cp.asnumpy(result_cp)[0:2, 0:2])

result_cuda = cuda_dot_tensor(a_cuda, 1)
print("CUDA Fortran:", result_cuda[0:2, 0:2])

print("\nRunning full tests...")

# NumPy test
start = time.time()
result_numpy = a_np.copy()
for i in range(1000):  
    result_numpy = np.dot(result_numpy, a_np)
numpy_time = time.time() - start
print(f"Time taken by NumPy: {numpy_time:.3f} seconds")

# CuPy test
start = time.time()
result_cupy = a_cp.copy()
for i in range(1000):  
    result_cupy = cp.dot(result_cupy, a_cp)
result_cupy = cp.asnumpy(result_cupy)
cupy_time = time.time() - start
print(f"Time taken by CuPy: {cupy_time:.3f} seconds")

# CUDA Fortran test
start = time.time()
result_cuda = cuda_dot_tensor(a_cuda, 1000)
cuda_time = time.time() - start
print(f"Time taken by CUDA Fortran: {cuda_time:.3f} seconds")

print("\nFinal results (first few elements):")
print("NumPy result:")
print(result_numpy[0:2, 0:2])
print("\nCuPy result:")
print(result_cupy[0:2, 0:2])
print("\nCUDA Fortran result:")
print(result_cuda[0:2, 0:2])

# Compare results at specific iterations
def run_specific_iterations(n_iters):
    result_np = a_np.copy()
    result_cp = a_cp.copy()
    result_cu = cuda_dot_tensor(a_cuda, n_iters)
    
    for i in range(n_iters):
        result_np = np.dot(result_np, a_np)
    
    for i in range(n_iters):
        result_cp = cp.dot(result_cp, a_cp)
    
    return result_np, cp.asnumpy(result_cp), result_cu

# Check accuracy at different iterations
for iters in [1, 10, 100]:
    print(f"\nResults after {iters} iterations:")
    res_np, res_cp, res_cu = run_specific_iterations(iters)
    print(f"NumPy[0,0]: {res_np[0,0]:.6e}")
    print(f"CuPy[0,0]:  {res_cp[0,0]:.6e}")
    print(f"CUDA[0,0]:  {res_cu[0,0]:.6e}")
    print(f"Ratios:")
    print(f"CUDA/NumPy: {res_cu[0,0]/res_np[0,0]:.6f}")
    print(f"CUDA/CuPy:  {res_cu[0,0]/res_cp[0,0]:.6f}")


CuPy configuration:
OS                           : Linux-6.12.9-200.fc41.x86_64-x86_64-with-glibc2.40
Python Version               : 3.12.7
CuPy Version                 : 13.3.0
CuPy Platform                : NVIDIA CUDA
NumPy Version                : 2.2.1
SciPy Version                : 1.14.1
Cython Build Version         : 0.29.36
Cython Runtime Version       : None
CUDA Root                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda
nvcc PATH                    : /var/home/fraser/nvidia/hpc_sdk/Linux_x86_64/24.11/cuda/bin/nvcc
CUDA Build Version           : 12060
CUDA Driver Version          : 12070
CUDA Runtime Version         : 12060 (linked to CuPy) / 12060 (locally installed)
CUDA Extra Include Dirs      : []
cuBLAS Version               : (available)
cuFFT Version                : 11300
cuRAND Version               : 10307
cuSOLVER Version             : (11, 7, 1)
cuSPARSE Version             : (available)
NVRTC Version                : (12, 6)
Thrust Version 

The first multiplication is extremely accurate (within ~0.002% difference), and the results diverge more over iterations. This makes sense because:

1. First iteration: very close match
   - NumPy/CuPy: 0.000725
   - CUDA: 0.00072499
   - Ratio: 0.999983

2. After 10 iterations: still very close
   - NumPy/CuPy: 1.985871e-04
   - CUDA: 1.985755e-04
   - Ratio: 0.999941

3. After 100 iterations: divergence grows
   - NumPy/CuPy: 4.811153e-10
   - CUDA: 1.011735e-07
   - Ratio: 210.289597

This means that the CUDA tensor core dot() replacement is 
Accuracy comparison (maximum absolute differences):
NumPy vs CuPy:         9.49e-81
NumPy vs CUDA Fortran: 1.10e-07
CuPy vs CUDA Fortran:  1.10e-07

Speedup ratios:
CUDA Fortran vs NumPy: 69.0x faster
CUDA Fortran vs CuPy:  39.4x faster
CuPy vs NumPy:         1.8x faster

39x faster than cupy with negligible accuracy loss could be very useful in most situations. Also, the 210x ratio is with super tiny numbers of 4.8e-10 vs 1.01e-7. Not likely to affect normal matrix dot scenarios.