In [1]:
import os, ray
import cupy
from numba import cuda
print(cupy.show_config())

CuPy Version          : 7.5.0
CUDA Root             : /usr/local/cuda-11.0
CUDA Build Version    : 10020
CUDA Driver Version   : 11000
CUDA Runtime Version  : 10020
cuBLAS Version        : 10202
cuFFT Version         : 10102
cuRAND Version        : 10102
cuSOLVER Version      : (10, 3, 0)
cuSPARSE Version      : 10301
NVRTC Version         : (10, 2)
cuDNN Build Version   : 7605
cuDNN Version         : 7605
NCCL Build Version    : 2406
NCCL Runtime Version  : 2604
CUB Version           : None
cuTENSOR Version      : None
None


In [2]:
import torch
print("Cuda available: ", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name())

Cuda available:  True
Device name: Quadro P620


In [3]:
ray.shutdown()
ray.init()

@ray.remote(num_gpus=1)
def use_gpu():
    print("ray.get_gpu_ids(): {}".format(ray.get_gpu_ids()[0]))
    print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))

use_gpu.remote()

2020-06-22 17:08:24,340	INFO resource_spec.py:212 -- Starting Ray with 157.32 GiB memory available for workers and up to 71.43 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-22 17:08:24,769	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


ObjectID(45b95b1c8bd3a9c4ffffffff010000c801000000)

In [4]:
import numpy as np
import numba

@numba.stencil
def _smooth(x):
    return (x[-1, -1] + x[-1, 0] + x[-1, 1] +
            x[ 0, -1] + x[ 0, 0] + x[ 0, 1] +
            x[ 1, -1] + x[ 1, 0] + x[ 1, 1]) // 9

@numba.njit
def smooth_cpu(x):
    return _smooth(x)

[2m[36m(pid=24442)[0m ray.get_gpu_ids(): 0
[2m[36m(pid=24442)[0m CUDA_VISIBLE_DEVICES: 0


In [5]:
from numba import cuda

@cuda.jit
def smooth_gpu(x, out):
    i, j = cuda.grid(2)
    n, m = x.shape
    if 1 <= i < n - 1 and 1 <= j < m - 1:
        out[i, j] = (x[i - 1, j - 1] + x[i - 1, j] + x[i - 1, j + 1] +
                     x[i    , j - 1] + x[i    , j] + x[i    , j + 1] +
                     x[i + 1, j - 1] + x[i + 1, j] + x[i + 1, j + 1]) // 9

In [6]:
x_cpu = np.ones((10000, 10000), dtype='int8')

%timeit smooth_cpu(x_cpu)

416 ms ± 2.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
import cupy,math

x_gpu = cupy.ones((10000, 10000), dtype='int8')
out_gpu = cupy.zeros((10000, 10000), dtype='int8')

# I copied the four lines below from the Numba docs
threadsperblock = (16, 16)
blockspergrid_x = math.ceil(x_gpu.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(x_gpu.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

%timeit smooth_gpu[blockspergrid, threadsperblock](x_gpu, out_gpu)

In [None]:
!set CUDA_VISIBLE_DEVICES=512

In [None]:
use_gpu.remote()