# Test: timing arccos with GT4Py

In [1]:
import os
import gt4py.next as gtx
from gt4py.next.ffront.fbuiltins import arccos
import numpy as np

In [2]:
# Locate the reference data relative to the working directory: drop the last
# two components of the cwd and descend into build/data/.
gt4py_prog_path = os.getcwd()
ref_data_path = "/".join([*gt4py_prog_path.split("/")[:-2], "build", "data", "ref_data.npz"])

In [3]:
# Open the reference archive; the "ref" and "x" arrays are read from it below.
file = np.load(ref_data_path)

In [4]:
# One gt4py dimension per axis name.
I, J, K = (gtx.Dimension(axis_name) for axis_name in ("I", "J", "K"))

# float64 field types: 1-D over I, and 3-D over (I, J, K).
IField = gtx.Field[gtx.Dims[I], gtx.float64]
IJKField = gtx.Field[gtx.Dims[I, J, K], gtx.float64]

In [5]:
# Pull the reference result and the input samples out of the archive.
ref_np = file["ref"]
x_np = file["x"]
display(ref_np, x_np)

# The kernels are declared over the single dimension I, so the input must be
# one-dimensional; the element-wise comparisons against the reference further
# down need both arrays to have the same shape.
assert x_np.ndim == 1, f"expected a 1-D input array, got shape {x_np.shape}"
assert ref_np.shape == x_np.shape, (
    f"reference shape {ref_np.shape} does not match input shape {x_np.shape}"
)

# Domain covering every element of the input along I.
domain_all = gtx.domain({
    I: (0, x_np.shape[0]),
})

array([0.83780304, 1.27465707, 1.17722935, ..., 2.16029659, 1.66456644,
       1.27790004], shape=(100000000,))

array([ 0.66909717,  0.29182971,  0.38348511, ..., -0.5559457 ,
       -0.09363275,  0.28872637], shape=(100000000,))

In [6]:
# Field operator over IField: applies arccos a single time to x.
@gtx.field_operator
def arccos_once(x: IField) -> IField:
    return arccos(x)

# Field operator over IField: applies arccos to the result of arccos(x).
# NOTE(review): arccos maps [-1, 1] onto [0, pi], so the inner result can exceed
# 1 and fall outside the domain of the outer arccos (presumably yielding NaN
# there) — confirm this is acceptable for a timing-only kernel.
@gtx.field_operator
def arccos_twice(x: IField) -> IField:
    return arccos(arccos(x))

# Field operator over IField: three nested applications of arccos to x.
# NOTE(review): intermediate results of arccos lie in [0, pi] and may be outside
# [-1, 1], the domain of the next arccos (presumably NaN from then on) — confirm
# this is intended for a timing-only kernel.
@gtx.field_operator
def arccos_three_times(x: IField) -> IField:
    return arccos(arccos(arccos(x)))

# Field operator over IField: four nested applications of arccos to x.
# NOTE(review): intermediate results of arccos lie in [0, pi] and may be outside
# [-1, 1], the domain of the next arccos (presumably NaN from then on) — confirm
# this is intended for a timing-only kernel.
@gtx.field_operator
def arccos_four_times(x: IField) -> IField:
    return arccos(arccos(arccos(arccos(x))))

In [47]:
# dir(gtx)


In [58]:
# Scratch check of the dtype NumPy uses for random floats (the field types in
# this notebook are declared with gtx.float64).
np.random.rand(8).dtype

dtype('float64')

In [51]:
# Backend toggle: exactly one of the three assignments is meant to be active.
# backend = None
# backend = gtx.gtfn_cpu
backend = gtx.gtfn_gpu
# Bind the single-arccos field operator to the selected backend.
gtx_arccos = arccos_once.with_backend(backend)

# Wrap the NumPy arrays as gt4py fields over domain_all, allocated via the backend.
# NOTE(review): `ref` is not referenced by any later cell shown here (the
# comparisons use ref_np) — confirm whether this allocation is still needed.
ref = gtx.as_field(data=ref_np, domain=domain_all, allocator=backend)
x = gtx.as_field(data=x_np, domain=domain_all, allocator=backend)

# Output buffer over the same domain, with x's dtype.
out_field = gtx.empty(domain=domain_all, dtype=x.dtype, allocator=backend)

In [52]:
# Time the single-arccos program on the backend selected above, writing into out_field.
%timeit gtx_arccos(x=x, out=out_field, domain=domain_all)

1.91 ms ± 10.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [53]:
# Element-wise exact equality between the reference and the computed result
# (out_field is converted to a NumPy array for the comparison).
ref_np == out_field.asnumpy()

array([ True,  True,  True, ...,  True,  True,  True], shape=(100000000,))

In [54]:
# Reduce the exact comparison to one flag: True only if every element is equal.
# The recorded run gives False, i.e. at least one element differs from the reference.
np.all(ref_np == out_field.asnumpy())

np.False_

In [57]:
# Tolerance-based comparison using np.isclose's default tolerances; the recorded
# run gives True, so the differences found by the exact check are within tolerance.
np.isclose(ref_np, out_field.asnumpy()).all()

np.True_

In [9]:
# backend = None
backend = gtx.gtfn_cpu
# backend = gtx.gtfn_gpu
# NOTE: the toggle above is not read by the rest of this cell; both backends are
# set up explicitly below so they can be timed side by side.

# The same single-arccos field operator, bound once per backend.
gtx_arccos_cpu = arccos_once.with_backend(gtx.gtfn_cpu)
gtx_arccos_gpu = arccos_once.with_backend(gtx.gtfn_gpu)

# Input fields built from the same NumPy data, one per backend allocator.
x_cpu = gtx.as_field(data=x_np, domain=domain_all, allocator=gtx.gtfn_cpu)
x_gpu = gtx.as_field(data=x_np, domain=domain_all, allocator=gtx.gtfn_gpu)

# Output buffers take their dtype from the matching input of this cell, so the
# cell no longer depends on `x` having been created by a different cell.
out_field_cpu = gtx.empty(domain=domain_all, dtype=x_cpu.dtype, allocator=gtx.gtfn_cpu)
out_field_gpu = gtx.empty(domain=domain_all, dtype=x_gpu.dtype, allocator=gtx.gtfn_gpu)

In [18]:
# Time the single-arccos program bound to the gtfn_cpu backend.
%timeit gtx_arccos_cpu(x=x_cpu, out=out_field_cpu, domain=domain_all)

14.2 ms ± 661 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
# Time the single-arccos program bound to the gtfn_gpu backend.
%timeit gtx_arccos_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)

1.98 ms ± 5.28 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [31]:
import timeit
# Total wall-clock time of n_reps back-to-back GPU calls, reported per call.
n_reps = 100
execution_time = timeit.Timer(
    "gtx_arccos_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)",
    globals=globals(),
).timeit(number=n_reps)
print(f"Average time per run: {execution_time / n_reps:.6f} seconds")

Average time per run: 0.001981 seconds


In [32]:
import timeit
number = 100  # calls per timing batch (inner loop)
repeats = 10  # number of timing batches (outer loop)
times = timeit.Timer(
    "gtx_arccos_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)",
    globals=globals(),
).repeat(repeat=repeats, number=number)
# Every entry of `times` is the total for one batch of `number` calls, hence the
# division by `number` when reporting.
mean_time = np.mean(times)
std_time = np.std(times)
print(f"Average time per run: {mean_time / number:.6f} seconds")

Average time per run: 0.001949 seconds


In [39]:
import timeit
import statistics
import cupy as cp  # Required for GPU sync

def gt4py_timeit(func, *args, number=10, repeat=5, warm_up=1, synchronize=True, **kwargs):
    """
    Time a GT4Py function with optional GPU synchronization.

    Runs ``warm_up + repeat`` timing batches of ``number`` back-to-back calls
    each. The leading ``warm_up`` batches are printed but excluded from the
    statistics. A one-line summary is printed before returning.

    Args:
        func: Function to time.
        *args, **kwargs: Arguments to pass to the function. Keyword arguments
            named ``number``, ``repeat``, ``warm_up`` or ``synchronize`` are
            consumed by this helper and never reach ``func``.
        number: How many executions per timing batch (must be >= 1).
        repeat: How many timing batches enter the statistics (must be >= 1).
        warm_up: How many extra leading batches to run and discard (must be >= 0).
        synchronize: If True, synchronize GPU device 0 (through CuPy) before
            and after each timing batch.

    Returns:
        mean_time_per_exec: Mean execution time (seconds).
        std_time_per_exec: Standard deviation (seconds).
        all_times: List of per-execution times (seconds), one entry per
            measured batch.

    Raises:
        ValueError: If ``number`` < 1, ``repeat`` < 1 or ``warm_up`` < 0.
    """
    # Reject values that would otherwise divide by zero, average an empty list,
    # or slice the wrong batches off as "warm-up".
    if number < 1:
        raise ValueError(f"number must be >= 1, got {number}")
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")
    if warm_up < 0:
        raise ValueError(f"warm_up must be >= 0, got {warm_up}")

    def _sync():
        # `cp` is only touched when synchronization was requested.
        if synchronize:
            cp.cuda.Device(0).synchronize()

    batch_times = []
    for _ in range(warm_up + repeat):
        _sync()
        start = timeit.default_timer()

        for _ in range(number):
            func(*args, **kwargs)

        _sync()
        end = timeit.default_timer()
        batch_times.append((end - start) / number)  # Per-execution time

    # Show the warm-up batches, then keep only the measured ones.
    for warm_up_time in batch_times[:warm_up]:
        print("    ", warm_up_time)
    times = batch_times[warm_up:]
    mean_time = np.mean(times)
    std_time = np.std(times)

    # Not every callable carries __name__ (e.g. functools.partial objects).
    func_name = getattr(func, "__name__", repr(func))
    print(f"{func_name} mean time per execution: {mean_time:.6e} s ± {std_time:.6e} s over {repeat} repeats with {warm_up} warmups")

    return mean_time, std_time, times


In [41]:
# Time the GPU program with the helper: `number` calls per batch, `repeats`
# measured batches, one discarded warm-up batch, GPU synchronized around each batch.
gt4py_timeit(gtx_arccos_gpu, number=number, repeat=repeats, warm_up=1, synchronize=True, x=x_gpu, out=out_field_gpu, domain=domain_all)


     0.00201899686944671
arccos_once mean time per execution: 1.956682e-03 s ± 4.218620e-06 s over 10 repeats with 1 warmups


(np.float64(0.0019566818167222663),
 np.float64(4.218620336337162e-06),
 [0.0019582394894678144,
  0.0019630511791910977,
  0.001954908079933375,
  0.0019531244598329067,
  0.0019576862291432917,
  0.00195556181948632,
  0.001962093450129032,
  0.001957561440067366,
  0.0019572142499964685,
  0.0019473777699749917])

In [13]:
import time
import cupy as cp

In [14]:
# time manually:
tic = time.perf_counter()
gtx_arccos_gpu(x=x_gpu, out=out_field_gpu, domain=domain_all)
cp.cuda.runtime.deviceSynchronize()
toc = time.perf_counter()

In [15]:
# Elapsed seconds between the two perf_counter readings of the manual timing.
toc-tic

0.005055260029621422

In [1]:
import os
# Show the current working directory.
os.getcwd()

'/users/class182/project-overlapping/src/gt4py'

In [2]:
# List the entries of the current working directory.
os.listdir()

['.gt4py_cache',
 '.ipynb_checkpoints',
 'TEMP_arccos.ipynb',
 '__pycache__',
 'arccos_gt4py.py',
 'run_arccos_gt4py.py',
 'run_arccos_gt4py.sh']

In [6]:
# List the build-side gt4py directory, addressed relative to the working directory.
os.listdir("../../build/gt4py/")

['.gt4py_cache',
 '.ipynb_checkpoints',
 'TEMP_arccos.ipynb',
 '__pycache__',
 'arccos_gt4py.py',
 'run_arccos_gt4py.py',
 'run_arccos_gt4py.sh']

In [7]:
# Check that the run script exists as a regular file in the build-side directory.
os.path.isfile("../../build/gt4py/run_arccos_gt4py.py")

True

In [28]:
import pkgutil
import gt4py

# Print every direct submodule of gt4py.next, tagging the ones that are packages.
for loader, module_name, is_pkg in pkgutil.iter_modules(gt4py.next.__path__):
    print(module_name, "(package)" if is_pkg else "")


allocators 
backend 
common 
config 
constructors 
embedded (package)
errors (package)
ffront (package)
field_utils 
iterator (package)
metrics 
otf (package)
program_processors (package)
type_system (package)
utils 
