In [1]:
from py_virtual_gpu import VirtualGPU, syncthreads
from py_virtual_gpu.services import get_gpu_manager
from py_virtual_gpu.api.server import start_background_api
from py_virtual_gpu.kernel import kernel
from py_virtual_gpu.thread import get_current_thread

In [2]:
@kernel
def reduce_sum_kernel(threadIdx, blockIdx, blockDim, gridDim, in_ptr, out_ptr):
    ctx = get_current_thread()
    shared_mem = ctx.shared_mem
    tx = threadIdx[0]
    shared_mem.write(tx * 4, in_ptr[tx])
    syncthreads()

    stride = blockDim[0] // 2
    while stride > 0:
        if tx < stride:
            a = int.from_bytes(shared_mem.read(tx * 4, 4), "little", signed=True)
            b = int.from_bytes(shared_mem.read((tx + stride) * 4, 4), "little", signed=True)
            shared_mem.write(tx * 4, (a + b).to_bytes(4, "little", signed=True))
        syncthreads()
        stride //= 2

    if tx == 0:
        out_ptr[0] = shared_mem.read(0, 4)

In [3]:
def host_reduce(vals):
    total = 0
    for v in vals:
        total += v
    return total

In [4]:
from py_virtual_gpu.api.server import start_background_api, start_background_dashboard
api_thread, ui_proc, stop = start_background_dashboard(port=8001)

INFO:     Started server process [22356]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8001 (Press CTRL+C to quit)


In [5]:
values = [1, 2, 3, 4, 5, 6, 7, 8]
gpu = get_gpu_manager().get_gpu(0)
gpu

<py_virtual_gpu.virtualgpu.VirtualGPU at 0x235912312d0>

In [11]:
from py_virtual_gpu import VirtualGPU
from py_virtual_gpu.kernel import kernel
import platform

print('Platform:', platform.system())

gpu = VirtualGPU(num_sms=1, global_mem_size=1024)
VirtualGPU.set_current(gpu)

@kernel(grid_dim=(1, 1, 1), block_dim=(2, 1, 1))
def test_kernel(threadIdx, blockIdx, blockDim, gridDim):
    print(f'Thread {threadIdx[0]} executed successfully!')

print('Running kernel test...')
test_kernel()
gpu.synchronize()
print('Test completed successfully!')

Platform: Windows
Running kernel test...
Thread 0 executed successfully!
Thread 1 executed successfully!
Test completed successfully!


In [6]:
VirtualGPU.set_current(gpu)
in_bytes = b"".join(v.to_bytes(4, "little", signed=True) for v in values)

in_ptr = gpu.malloc(len(in_bytes))
out_ptr = gpu.malloc(4)

gpu.memcpy_host_to_device(in_bytes, in_ptr)

In [7]:
reduce_sum_kernel(
    in_ptr,
    out_ptr,
    grid_dim=(1, 1, 1),
    block_dim=(len(values), 1, 1),
)

In [8]:
gpu.synchronize()

out = gpu.memcpy_device_to_host(out_ptr, 4)
result = int.from_bytes(out, "little", signed=True)
expected = host_reduce(values)

print("Kernel result:", result)
print("Host result:", expected)

Exception in thread Exception in thread Thread-7 (run):
Traceback (most recent call last):
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
Thread-6 (run):
Traceback (most recent call last):
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-8 (run):
Traceback (most recent call last):
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-9 (run):
Traceback (most recent call last):
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 975, in run
Exception in thread Thread-10 (run):
Traceback (most recent call last):
  File "C:\Users\diego.rodrigues\AppData\Local\anaconda3\Lib\threading.py", line 1038, in _bootstrap_inner
 

Kernel result: 0
Host result: 36
