In [1]:
from pynvml import *

In [2]:
# Initialise the NVML library before issuing any of the device queries below.
nvmlInit()

In [3]:
# Device count reported by NVML; used as the loop bound when each GPU is polled.
ngpus = nvmlDeviceGetCount()

In [4]:
# Print one utilisation line per GPU index in [0, ngpus).
for i in range(ngpus):
    # Look up the NVML handle for device i; the query below takes this handle.
    handle = nvmlDeviceGetHandleByIndex(i)
    # Only the `.gpu` field of the utilisation record is used; it is formatted
    # with %d below, i.e. printed as an integer percentage.
    gpu_util = nvmlDeviceGetUtilizationRates(handle).gpu
    print('GPU %d Utilization = %d%%' % (i, gpu_util))

GPU 0 Utilization = 0%


In [5]:
# Shell escape: print the version banner of the CUDA compiler driver (nvcc) on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Aug_15_21:14:11_PDT_2021
Cuda compilation tools, release 11.4, V11.4.120
Build cuda_11.4.r11.4/compiler.30300941_0


In [6]:
import numba  # We added these two lines for a 500x speedup

@numba.jit    # We added these two lines for a 500x speedup
def sum(x):
    """Add up the entries of ``x`` along its first axis with an explicit loop.

    Note that this definition shadows Python's built-in ``sum`` in the
    notebook namespace.

    Parameters
    ----------
    x : object exposing ``shape`` and integer indexing
        ``x[k]`` is read for every ``k`` in ``range(x.shape[0])``.

    Returns
    -------
    The accumulated total, seeded with the integer ``0``.
    """
    length = x.shape[0]
    accumulator = 0
    for idx in range(length):
        accumulator += x[idx]
    return accumulator

In [7]:
@numba.jit
def _smooth(x):
    """Apply a 3x3 box filter to the interior of a 2-D array.

    Every interior element of the result is the sum of the 3x3 neighbourhood
    centred on it in ``x``, floor-divided by 9.  The outermost rows and
    columns have no complete neighbourhood; they are passed through from
    ``x`` unchanged so that the whole result is well defined.

    Parameters
    ----------
    x : 2-D NumPy array
        Input grid.  It is only read, never modified.

    Returns
    -------
    A new array with the same shape and dtype as ``x``.
    """
    # Start from a copy rather than np.empty_like: the loops below never touch
    # the border cells, and empty_like would leave them as uninitialised memory.
    out = x.copy()
    for i in range(1, x.shape[0] - 1):
        for j in range(1, x.shape[1] - 1):
            out[i, j] = (x[i-1, j-1] + x[i-1, j+0] + x[i-1, j+1] +
                         x[i+0, j-1] + x[i+0, j+0] + x[i+0, j+1] +
                         x[i+1, j-1] + x[i+1, j+0] + x[i+1, j+1]) // 9
    return out

In [9]:
from numba import jit
import numpy as np
import time

# 10x10 integer matrix holding 0..99 row by row; passed to go_fast in the timing runs below.
x = np.arange(100).reshape(10, 10)

@jit(nopython=True)
def go_fast(a): # Function is compiled and runs in machine code
    """Offset every element of ``a`` by the summed ``tanh`` of its diagonal.

    Parameters
    ----------
    a : 2-D NumPy array
        ``a[k, k]`` is read for each ``k`` in ``range(a.shape[0])``.

    Returns
    -------
    ``a`` plus the scalar ``sum(tanh(a[k, k]))``, as a new array.
    """
    n_rows = a.shape[0]
    diag_tanh_sum = 0.0
    for k in range(n_rows):
        diag_tanh_sum += np.tanh(a[k, k])
    return a + diag_tanh_sum

# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
# time.perf_counter() is used rather than time.time(): it is monotonic and uses
# the highest-resolution clock available, which matters for the very short
# second measurement.
start = time.perf_counter()
go_fast(x)
end = time.perf_counter()
print("Elapsed (with compilation) = %s" % (end - start))

# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
start = time.perf_counter()
go_fast(x)
end = time.perf_counter()
print("Elapsed (after compilation) = %s" % (end - start))

Elapsed (with compilation) = 0.14077186584472656
Elapsed (after compilation) = 4.6253204345703125e-05


In [11]:
from numba import cuda
import numpy as np

@cuda.jit
def max_example(result, values):
    """Find the maximum value in values and store in result[0]

    Each thread handles at most one element: it computes its global 1-D index
    from its block and thread coordinates and atomically folds ``values[i]``
    into ``result[0]``.

    Parameters
    ----------
    result : 1-D array
        ``result[0]`` is the accumulator.  The caller must initialise it to a
        value no greater than the true maximum, because the kernel only ever
        raises it.
    values : 1-D array
        Data to reduce.
    """
    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bdim = cuda.blockDim.x
    i = (bid * bdim) + tid
    # Guard against launches with more threads than elements; without it the
    # surplus threads would read past the end of ``values``.
    if i < values.shape[0]:
        cuda.atomic.max(result, 0, values[i])


arr = np.random.rand(16384)
# Seed the accumulator with -inf, the identity for max, so the result is
# correct for any input and not only for non-negative data (a 0 seed would
# win against an all-negative array).
result = np.full(1, -np.inf, dtype=np.float64)

# 256 blocks x 64 threads = 16384 threads, one per element of arr.
max_example[256,64](result, arr)
print(result[0]) # Found using cuda.atomic.max
print(arr.max())  # NumPy reduction of arr for comparison (should be equal!)

0.9999755800048137
0.9999755800048137




In [12]:
from numba import jit
import random

@jit(nopython=True)
def monte_carlo_pi(nsamples):
    """Estimate pi by sampling random points in the unit square.

    Draws ``nsamples`` pairs from ``random.random()`` and counts how many
    satisfy ``x**2 + y**2 < 1.0``; the hit fraction multiplied by 4 is the
    estimate.

    Parameters
    ----------
    nsamples : int
        Number of points to draw.  It is also the divisor of the result, so
        it must be non-zero.

    Returns
    -------
    float
        ``4.0 * hits / nsamples``.
    """
    hits = 0
    for _ in range(nsamples):
        px = random.random()
        py = random.random()
        radius_sq = px ** 2 + py ** 2
        if radius_sq < 1.0:
            hits += 1
    return 4.0 * hits / nsamples

In [15]:
# Run the estimator with 10**10 (ten billion) samples; the loop count makes this a long-running cell.
monte_carlo_pi(10000000000)

3.141604182