# Multidimensional arrays

### Check installation

In [5]:
!lscpu

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  4
  On-line CPU(s) list:   0-3
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
    CPU family:          6
    Model:               85
    Thread(s) per core:  2
    Core(s) per socket:  2
    Socket(s):           1
    Stepping:            7
    BogoMIPS:            4999.99
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mc
                         a cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscal
                         l nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopo
                         logy nonstop_tsc cpuid aperfmperf tsc_known_freq pni pc
                         lmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe 
                         popcnt tsc_deadlin

In [2]:
!nvidia-smi

Fri Sep 16 18:49:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   34C    P8    14W /  70W |      2MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Latency numbers every programmer should know (Jeff Dean):

**L1 cache reference 0.5 ns**

**L2 cache reference 7 ns**

**Main memory reference 100 ns**

![CPUCUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/gpu-devotes-more-transistors-to-data-processing.png)

In [6]:
import numpy as np
# show_config() writes the build configuration to stdout itself and returns
# None, so wrapping it in print() would only append a stray "None" line.
np.show_config()

blas_info:
    libraries = ['cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/opt/anaconda3/envs/minimal/lib']
    include_dirs = ['/opt/anaconda3/envs/minimal/include']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    define_macros = [('NO_ATLAS_INFO', 1), ('HAVE_CBLAS', None)]
    libraries = ['cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/opt/anaconda3/envs/minimal/lib']
    include_dirs = ['/opt/anaconda3/envs/minimal/include']
    language = c
lapack_info:
    libraries = ['lapack', 'blas', 'lapack', 'blas']
    library_dirs = ['/opt/anaconda3/envs/minimal/lib']
    language = f77
lapack_opt_info:
    libraries = ['lapack', 'blas', 'lapack', 'blas', 'cblas', 'blas', 'cblas', 'blas']
    library_dirs = ['/opt/anaconda3/envs/minimal/lib']
    language = c
    define_macros = [('NO_ATLAS_INFO', 1), ('HAVE_CBLAS', None)]
    include_dirs = ['/opt/anaconda3/envs/minimal/include']
Supported SIMD extensions in this NumPy install:
    baseline 

In [7]:
!conda list | grep blas 

libblas                   3.9.0           16_linux64_openblas    conda-forge
libcblas                  3.9.0           16_linux64_openblas    conda-forge
liblapack                 3.9.0           16_linux64_openblas    conda-forge
libopenblas               0.3.21          pthreads_h78a6416_3    conda-forge


In [8]:
import torch
# Report how this PyTorch build was configured, followed by its parallelism info.
print(torch.__config__.show(), torch.__config__.parallel_info(), sep="\n")

PyTorch built with:
  - GCC 10.4
  - C++ Version: 201402
  - Intel(R) oneAPI Math Kernel Library Version 2022.1-Product Build 20220311 for Intel(R) 64 architecture applications
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 11.7
  - Built with CUDA Runtime 11.2
  - NVCC architecture flags: -gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_86,code=compute_86
  - CuDNN 8.4.1  (built against CUDA 11.6)
  - Magma 2.5.4
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.2, CUDNN_VERSION=8.4.1, CXX_COMPILER=/home/conda/feedstock_root/build_artifacts/pytorch-recipe_1660083882787/_build_env/bin/x86_64-conda-linu

In [9]:
# One pair of random N x N float64 operands, mirrored for every backend that
# is timed with them: NumPy, PyTorch on the CPU and PyTorch on the GPU.
N = 500
a, b = (np.random.normal(size=(N, N)) for _ in range(2))
a_ten, b_ten = torch.tensor(a), torch.tensor(b)
a_tenc, b_tenc = a_ten.cuda(), b_ten.cuda()

In [13]:
a_ten

tensor([[-0.0775,  0.3024, -0.4119,  ..., -0.5312, -0.6876,  2.7937],
        [-0.9282, -0.3022, -0.0286,  ..., -0.6468, -0.3552, -0.5702],
        [ 1.7997,  0.0215,  0.2466,  ...,  0.0973, -0.7282,  0.0596],
        ...,
        [-1.1021,  0.7491,  1.8667,  ..., -0.0968, -2.5408, -0.2693],
        [ 0.8480, -1.5878,  1.6776,  ..., -1.0743,  3.3374,  1.0190],
        [-1.2507, -0.8470,  0.3551,  ...,  0.4148,  0.0299, -1.5046]],
       dtype=torch.float64)

In [10]:
%timeit np.dot(a,b)

2.7 ms ± 8.84 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%timeit torch.matmul(a_ten, b_ten)

2.23 ms ± 14 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%timeit torch.matmul(a_tenc, b_tenc)

22.8 µs ± 14.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Single-precision (float32) operands for NumPy, PyTorch on the CPU and
# PyTorch on the GPU.
a32, b32 = (np.random.normal(size=(N, N)).astype(np.float32) for _ in range(2))
a_ten32, b_ten32 = torch.tensor(a32), torch.tensor(b32)
a_tenc32, b_tenc32 = a_ten32.cuda(), b_ten32.cuda()

In [17]:
a_ten32

tensor([[ 1.3954,  0.3139, -1.2094,  ..., -2.5670, -0.7269,  0.1231],
        [-0.4257, -0.0673, -0.9478,  ...,  0.1603,  0.5322,  1.9775],
        [-0.2056, -0.0631,  0.6073,  ..., -0.7403,  1.1499, -0.1753],
        ...,
        [ 0.0296,  0.0241, -0.5349,  ...,  1.0955,  0.8051, -1.1943],
        [ 0.6457, -1.2042,  0.0721,  ..., -0.6507,  0.4757,  1.0971],
        [ 2.5511,  0.4281,  0.9820,  ..., -0.3287, -0.7725,  1.4404]])

In [16]:
%timeit np.dot(a32,b32)

1.37 ms ± 10 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%timeit torch.matmul(a_ten32, b_ten32)

1.01 ms ± 4.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
%timeit torch.matmul(a_tenc32, b_tenc32)

59.2 µs ± 136 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [21]:
a_tenc32

tensor([[ 1.3954,  0.3139, -1.2094,  ..., -2.5670, -0.7269,  0.1231],
        [-0.4257, -0.0673, -0.9478,  ...,  0.1603,  0.5322,  1.9775],
        [-0.2056, -0.0631,  0.6073,  ..., -0.7403,  1.1499, -0.1753],
        ...,
        [ 0.0296,  0.0241, -0.5349,  ...,  1.0955,  0.8051, -1.1943],
        [ 0.6457, -1.2042,  0.0721,  ..., -0.6507,  0.4757,  1.0971],
        [ 2.5511,  0.4281,  0.9820,  ..., -0.3287, -0.7725,  1.4404]],
       device='cuda:0')

In [22]:
def matmul(a, b):
    """Multiply two matrices with an explicit triple loop (pure-Python baseline).

    Parameters
    ----------
    a : array of shape (n, k)
    b : array of shape (k, m)

    Returns
    -------
    numpy.ndarray
        The (n, m) product of ``a`` and ``b``, always float64.

    Raises
    ------
    ValueError
        If the number of columns of ``a`` differs from the number of rows of ``b``.
    """
    n = a.shape[0]
    k = a.shape[1]
    m = b.shape[1]
    # Without this check a `b` with more than k rows would silently produce a
    # wrong result, because only its first k rows would ever be read.
    if b.shape[0] != k:
        raise ValueError(
            f"cannot multiply: a has {k} columns but b has {b.shape[0]} rows"
        )
    c = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            for s in range(k):
                c[i, j] += a[i, s] * b[s, j]

    return c

# Switch to 100 x 100 float64 operands for the hand-written implementations.
N = 100
a, b = (np.random.normal(size=(N, N)) for _ in range(2))

In [23]:
%timeit np.dot(a,b)

44.5 µs ± 39.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
%timeit matmul(a,b)

488 ms ± 664 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


## JIT - Just In Time - Compilation

In [25]:
import numba as nb
@nb.njit
def numba_matmul(a, b):
    """Triple-loop matrix product, JIT-compiled by Numba in nopython mode.

    Parameters
    ----------
    a : array of shape (n, k)
    b : array of shape (k, m)

    Returns
    -------
    numpy.ndarray
        The (n, m) product of ``a`` and ``b``, always float64.

    Raises
    ------
    ValueError
        If the number of columns of ``a`` differs from the number of rows of ``b``.
    """
    n = a.shape[0]
    k = a.shape[1]
    m = b.shape[1]
    # Numba does not bounds-check array indexing by default, so a shape
    # mismatch has to be rejected explicitly before the loops run.
    if b.shape[0] != k:
        raise ValueError("inner dimensions of a and b do not match")
    c = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            for s in range(k):
                c[i, j] += a[i, s] * b[s, j]
    return c

In [26]:
result = numba_matmul(a,b)

In [27]:
%timeit numba_matmul(a,b)

1.51 ms ± 1.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Multithreading and Parallel Computing

![multithreading](https://randu.org/tutorials/threads/images/process.png)

In [28]:
@nb.njit(parallel=True)
def numba_matmul_parallel(a, b):
    """Triple-loop matrix product, JIT-compiled by Numba with a parallel outer loop.

    The loop over the rows of the result is an ``nb.prange``, so Numba may
    distribute the rows across threads; every iteration writes only its own
    row of the output.

    Parameters
    ----------
    a : array of shape (n, k)
    b : array of shape (k, m)

    Returns
    -------
    numpy.ndarray
        The (n, m) product of ``a`` and ``b``, always float64.

    Raises
    ------
    ValueError
        If the number of columns of ``a`` differs from the number of rows of ``b``.
    """
    n = a.shape[0]
    k = a.shape[1]
    m = b.shape[1]
    # Numba does not bounds-check array indexing by default, so a shape
    # mismatch has to be rejected explicitly before the loops run.
    if b.shape[0] != k:
        raise ValueError("inner dimensions of a and b do not match")
    c = np.zeros((n, m))
    for i in nb.prange(n):
        for j in range(m):
            for s in range(k):
                c[i, j] += a[i, s] * b[s, j]
    return c

In [29]:
result = numba_matmul_parallel(a,b)

In [30]:
%timeit numba_matmul_parallel(a,b)

500 µs ± 6.05 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## GPU Computing

![sm](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/automatic-scalability.png)

![blocks](https://docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png)

![CUDA](https://developer-blogs.nvidia.com/wp-content/uploads/2017/01/cuda_indexing.png)

In [33]:
from numba import cuda
@cuda.jit
def numba_matmul_kernel(a, b, c, m, k):
    """CUDA kernel: every thread block adds one row of the product ``a @ b`` to ``c``.

    The block index selects the row. The threads of a block split the ``m``
    columns of that row between them: thread ``t`` handles columns
    ``t, t + blockDim.x, t + 2 * blockDim.x, ...``.

    The kernel does not check the row index, so it must be launched with
    exactly one block per row of ``a``.

    Parameters
    ----------
    a : device array, read as ``a[row, s]`` for ``s`` in ``range(k)``
    b : device array, read as ``b[s, col]``
    c : device array, updated as ``c[row, col]``; the products are added to
        its current contents, so pass zeros to obtain the plain product
    m : number of columns of ``b`` and ``c``
    k : length of the shared inner dimension
    """
    row = cuda.blockIdx.x  # row of the output handled by this block
    first_col = cuda.threadIdx.x
    stride = cuda.blockDim.x

    for col in range(first_col, m, stride):
        # Accumulate in a local variable and store the element once, instead
        # of reading and writing c[row, col] on every inner iteration.
        acc = c[row, col]
        for s in range(k):
            acc += a[row, s] * b[s, col]
        c[row, col] = acc

def numba_matmul_cuda(a, b):
    """Compute the matrix product ``a @ b`` on the GPU with ``numba_matmul_kernel``.

    Parameters
    ----------
    a : 2-D array of shape (n, k) that can be passed to a Numba CUDA kernel
    b : 2-D array of shape (k, m) that can be passed to a Numba CUDA kernel

    Returns
    -------
    torch.Tensor
        The (n, m) product, stored on the current CUDA device in torch's
        default dtype.

    Raises
    ------
    ValueError
        If the number of columns of ``a`` differs from the number of rows of ``b``.
    """
    n = a.shape[0]
    k = a.shape[1]
    m = b.shape[1]
    # The kernel indexes `b` without any bounds check, so a shape mismatch
    # must be rejected on the host.
    if b.shape[0] != k:
        raise ValueError("inner dimensions of a and b do not match")

    # Allocate the zero-initialised output directly on the GPU instead of
    # creating it on the host and copying it over.
    c = torch.zeros((n, m), device="cuda")
    if n == 0 or m == 0:
        # Nothing to compute, and a grid with zero blocks cannot be launched.
        return c

    warp_size = 32
    max_block_size = 1024  # CUDA upper limit on threads per block

    grid_size = n  # one block per output row
    # Multiple of the warp size, at least one warp, and capped so that wide
    # matrices (m >= 1056) do not request more threads than a block may hold.
    block_size = min(max_block_size, warp_size * max(1, m // warp_size))

    numba_matmul_kernel[grid_size, block_size](a, b, c, m, k)
    return c

In [34]:
# torch.Tensor(ndarray) silently casts the float64 data to float32 and builds
# the tensor on the host first; spell out the dtype and create the operands
# directly on the GPU instead.
ac = torch.tensor(a, dtype=torch.float32, device="cuda")
bc = torch.tensor(b, dtype=torch.float32, device="cuda")

In [35]:
result = numba_matmul_cuda(ac, bc)

In [34]:
result

tensor([[  4.0396,  -6.0901, -28.9926,  ...,  -0.1462, -17.9570,  -3.8915],
        [ -2.6122,  11.3718, -20.6277,  ...,   2.0551,  -7.9371,  -0.5512],
        [ 10.4962,  -2.2127,  -7.3381,  ...,   1.0584,  -9.9571,   6.1283],
        ...,
        [  4.1356,  -9.5141,   2.8716,  ...,  13.6263, -21.8132,  20.0127],
        [-13.2677,   2.2546,  -4.0188,  ..., -11.1008,  -2.8803, -16.5581],
        [ -3.1486,   9.6964,   7.0799,  ...,  -8.2270,   3.9389,  -4.2925]],
       device='cuda:0')

In [35]:
%timeit numba_matmul_cuda(a, b)

790 µs ± 983 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Buffer Protocol

```cpp
struct buffer_info {
    void *ptr; /* Pointer to buffer */
    size_t itemsize; /* Size of one scalar */
    size_t ndim; /* Number of dimensions */
    size_t *shape; /* Buffer dimensions */
    size_t *strides; /* Strides (in bytes) for each index */
};
```

In [36]:
a = np.random.normal(size=(4, 5)).astype(np.float32)
# The NumPy counterparts of the buffer_info fields: item size in bytes,
# number of dimensions, shape, and strides in bytes.
print(a.itemsize, a.ndim, a.shape, a.strides, sep="\n")
a

4
2
(4, 5)
(20, 4)


In [37]:
a_tor = torch.from_numpy(a)
# The same description on the PyTorch side. Note that stride() is reported
# in elements, (5, 1), whereas NumPy reports strides in bytes, (20, 4).
print(a_tor.dtype, a_tor.dim(), a_tor.size(), a_tor.stride(), sep="\n")
a_tor

torch.float32
2
torch.Size([4, 5])
(5, 1)


tensor([[-0.5554, -0.7204,  0.4757, -0.5168, -0.7033],
        [ 1.0857,  0.6206, -0.0197, -1.3250,  0.4471],
        [-0.1547,  0.3575,  1.2935, -0.0075,  0.2461],
        [-1.9445,  0.5729,  0.5526, -2.2325, -1.2715]])

In [38]:
a_num = a_tor.numpy()
a_num

In [39]:
a_tor[0,0] *= 10
print(a_tor[0,0])

tensor(-5.5544)


In [41]:
a

In [40]:
assert a[0,0] == a_tor[0,0].item()
assert a[0,0] == a_num[0,0]

## AOT - Ahead Of Time - Compilation

In [41]:
from torch.utils.cpp_extension import load

In [43]:
import os

# load() expects a user-supplied build directory to exist already, so create
# it first; otherwise the cell fails on a fresh checkout.
os.makedirs('./build', exist_ok=True)

# Compile cpp_intro.cc at run time and import the result as a Python module.
cpp_intro = load(name='cpp_intro',
                 build_directory='./build',
                 sources=['cpp_intro.cc'],
                 # one compiler flag per list element
                 extra_cflags=['-Wall', '-Wextra', '-Wpedantic', '-O3'],
                 verbose=False)

In [44]:
N = 3  # number of angles to draw
PI = 2. * torch.acos(torch.tensor(0.))  # acos(0) = pi/2, so this is pi as a 0-d tensor
# torch.rand samples from [0, 1), so the angles lie in [-0.025*pi, 0.025*pi).
thetas = 0.05 * PI * (torch.rand(N) - 0.5) # example of angles in radians
# Implemented in cpp_intro.cc; judging by the displayed result it returns one
# 2x2 rotation matrix per angle, i.e. a tensor of shape (N, 2, 2).
rots = cpp_intro.get_rotations(thetas)
rots

tensor([[[ 0.9974, -0.0721],
         [ 0.0721,  0.9974]],

        [[ 0.9978, -0.0667],
         [ 0.0667,  0.9978]],

        [[ 0.9992,  0.0408],
         [-0.0408,  0.9992]]])

In [46]:
(rots.matmul(rots.transpose(-1,-2)) - torch.eye(2)).abs().sum()

tensor(3.5763e-07)

For more information, have a look at:
* PyTorch [docs](https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html) 
* Numba [docs](https://numba.readthedocs.io/en/stable/index.html)

### Writing native extensions 

Tutorials worth working through include: 
* [numba & CUDA](https://numba.readthedocs.io/en/stable/cuda/index.html)
* [CUDA made easy](https://developer.nvidia.com/blog/even-easier-introduction-cuda)
* [CUDA guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html)

Further performance gains can be obtained by writing C++ extensions directly with LibTorch:
* [PyTorch C++ extensions](https://pytorch.org/tutorials/advanced/cpp_extension.html)