# Exemplo do uso de Python em ambiente de Processamento de Alto Desempenho (PAD)

Exemplo mostrando abordagem de PAD no ambiente Python, para um estudo de caso de computação científica, usando o supercomputador Santos Dumont (SD). Para este exemplo foi selecionada a biblioteca / compilador Numba rodando em CPU e em GPU, usando os nós de login B-710 (sdumont11, sdumont13, sdumont14) e o nó de login Sequana (sdumont18).

> Nota: o nó de login não deve ser utilizado para rodar os programas finais; utilize-o apenas para compilar e fazer testes curtos.

# Configuração do ambiente

## Na máquina local

Pré-requisito: distribuição Python (Anaconda ou Intel) instalada e funcionando na máquina local

* https://www.anaconda.com/products/individual
* https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/distribution-for-python.html

### Exemplo (na máquina local)

Distribuição Intel (que inclui outros compiladores e ferramentas Intel)

* https://software.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top/installation/install-using-package-managers/apt.html

```
...
sudo apt install intel-basekit intel-hpckit -y
sudo chmod -R 777 /opt/intel/oneapi/intelpython
source /opt/intel/oneapi/setvars.sh
conda install jupyterlab --yes
jupyter-lab
```

### Configuração do VPN e SSH

* https://github.com/efurlanm/hpc/blob/master/sd/access_configuring.ipynb
* https://www.hostinger.com.br/tutoriais/conexao-ssh-sem-senha

### Configuração do JupyterLab

* https://github.com/efurlanm/hpc/blob/master/sd/access_using_jupyterlab.ipynb

## No Santos Dumont

### Primeira sessão, no nó B-710
```
ssh sd
hostname > hostname_sd.txt
screen -S session_b710
module load anaconda3
jupyter-lab --no-browser --port=58889 --ip=0.0.0.0 --NotebookApp.token=''
```

### Segunda sessão, no nó sequana
```
Ctrl+a Ctrl+d
ssh sdumont18
module load anaconda3
jupyter-lab --no-browser --port=58890 --ip=0.0.0.0 --NotebookApp.token=''
```
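>Notas:\
>  1 - o sdumont18 não tem o comando screen\
>  2 - ao sair do SDumont, não esquecer de encerrar os servidores\
>  3 - `--ip=0.0.0.0` combinado com `--NotebookApp.token=''` deixa o servidor Jupyter acessível, sem autenticação, a qualquer usuário que alcance a porta; encerre-o assim que terminar ou defina um token/senha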

### Conexão das instâncias Jupyter à máquina local
Na máquina local:
```
scp -q sd:hostname_sd.txt ~/
export HOSTNAME_SD=$(cat ~/hostname_sd.txt)
ssh sd -L 8889:$HOSTNAME_SD:58889 -NTf
ssh sd -L 8890:sdumont18:58890 -NTf
sshfs sd:/ /mnt/sd -o workaround=rename,uid=1000,gid=1000
```

---

# Nó B-710

In [1]:
! lscpu | head -n 15 | grep "Model \|CPU(s):\|Thre\|Core\|NUMA\|MHz"

CPU(s):                24
Thread(s) per core:    1
Core(s) per socket:    12
NUMA node(s):          2
Model name:            Intel(R) Xeon(R) CPU E5-2695 v2 @ 2.40GHz
CPU MHz:               2435.302


# Comparação entre as implementações Fortran e Python

## Fortran 90 sequencial

In [1]:
%%writefile heat_seq.f90
program stencil
    ! Sequential 2-D heat-diffusion benchmark.
    !
    ! An n x n grid, stored with one extra cell on every side, is relaxed
    ! with a 5-point stencil.  After every sweep a fixed amount of energy is
    ! added at three source cells.  The program prints the sum of the
    ! interior cells ("Heat") and the CPU time reported by cpu_time ("Time").
    implicit none
    integer, parameter  :: dp = kind(1.0d0)     ! double-precision kind
    integer, parameter  :: nsources = 3
    integer, parameter  :: n = 2400             ! n x n grid
    integer, parameter  :: energy = 1           ! energy to be injected per iteration
    integer, parameter  :: niters = 250         ! number of iterations
    integer             :: iters, i, j, k
    integer             :: extent, first, last  ! "extent" avoids shadowing the intrinsic size()
    integer, dimension(nsources, 2) :: sources
    real(dp), allocatable :: aold(:,:), anew(:,:)
    real(dp)            :: t_start, t_end, heat

    extent = n + 2      ! interior plus one extra cell on each side
    first  = 2          ! first interior index
    last   = n + 1      ! last interior index

    allocate(aold(extent, extent))
    allocate(anew(extent, extent))
    aold = 0.0_dp
    anew = 0.0_dp

    ! Source positions; 1 is added when they are used as array indices.
    sources(1,:) = (/ n/2,   n/2   /)
    sources(2,:) = (/ n/3,   n/3   /)
    sources(3,:) = (/ n*4/5, n*8/9 /)   ! 8/9 as in Balaji

    call cpu_time(t_start)

    ! Each pass performs two sweeps (aold -> anew -> aold), so the newest
    ! state always ends up in aold.  NOTE: because the loop steps by 2, an
    ! odd niters is effectively rounded up to the next even sweep count.
    do iters = 1, niters, 2

        ! odd iteration: anew <- stencil(aold)
        ! (j outer, i inner: the first index is contiguous in memory)
        do j = first, last
            do i = first, last
                anew(i,j) = 0.5_dp * (aold(i,j) + 0.25_dp *  &
                    (aold(i-1,j) + aold(i+1,j) + aold(i,j-1) + aold(i,j+1)))
            enddo
        enddo

        do k = 1, nsources
            anew(sources(k,1)+1, sources(k,2)+1) =  &
                anew(sources(k,1)+1, sources(k,2)+1) + energy
        enddo

        ! even iteration: aold <- stencil(anew)
        do j = first, last
            do i = first, last
                aold(i,j) = 0.5_dp * (anew(i,j) + 0.25_dp *  &
                    (anew(i-1,j) + anew(i+1,j) + anew(i,j-1) + anew(i,j+1)))
            enddo
        enddo

        do k = 1, nsources
            aold(sources(k,1)+1, sources(k,2)+1) =  &
                aold(sources(k,1)+1, sources(k,2)+1) + energy
        enddo

    enddo

    ! Total heat of the interior; kept inside the timed region as before.
    heat = sum(aold(first:last, first:last))

    call cpu_time(t_end)

    ! Edit descriptors are comma-separated as the standard requires; the
    ! printed text is unchanged.
    write(*, "('Heat = ', f0.4, ' | ')", advance="no") heat
    write(*, "('Time = ', f0.4)") t_end - t_start

    deallocate(aold)
    deallocate(anew)

end program stencil

Writing heat_seq.f90


In [2]:
! gfortran --version

GNU Fortran (GCC) 4.8.5 20150623 (Red Hat 4.8.5-36)
Copyright (C) 2015 Free Software Foundation, Inc.

GNU Fortran comes with NO WARRANTY, to the extent permitted by law.
You may redistribute copies of GNU Fortran
under the terms of the GNU General Public License.
For more information about these matters, see the file named COPYING



In [3]:
! gfortran  -O3  -o heat_seq  heat_seq.f90

In [4]:
! ./heat_seq

Heat = 750.0000 | Time = 2.4168


## Python sequencial

In [5]:
! python --version

Python 3.8.5


In [6]:
import numpy as np
from time import time

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations

# working arrays: the n x n grid plus one extra cell on every side
heat         = np.zeros((1), np.float64)     # system total heat
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
# (row, column) of the three energy sources; np.intp is NumPy's index type,
# so the positions cannot silently wrap around the way int16 does once
# n*8//9 exceeds 32767
sources      = np.array([ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ], dtype=np.intp)

# computationally intensive core
def kernel(anew, aold):
    """Write one relaxation step of `aold` into the interior of `anew`.

    Every interior cell of `anew` becomes half of (the matching `aold` cell
    plus a quarter of the sum of its four direct neighbours).  The outermost
    rows and columns of `anew` are not written.  Returns None.
    """
    inner  = slice(1, -1)      # interior rows / columns
    ahead  = slice(2, None)    # interior shifted by +1
    behind = slice(None, -2)   # interior shifted by -1
    centre = aold[inner, inner]
    # same left-to-right summation order as the one-line form
    neighbours = aold[ahead, inner] + aold[behind, inner] + aold[inner, ahead] + aold[inner, behind]
    anew[inner, inner] = 0.5 * (centre + 0.25 * neighbours)

# main routine
from time import perf_counter    # monotonic, high-resolution clock for interval timing

t2 = 0.0               # accumulated time spent inside kernel()
t0 = perf_counter()    # start of the whole run
# Each pass performs two sweeps (aold -> anew -> aold), so the newest state
# always ends up in aold.  NOTE: because the range steps by 2, an odd niters
# is effectively rounded up to the next even sweep count.
for _ in range(0, niters, 2) :
    t3 = perf_counter()
    kernel(anew, aold)    # odd iteration
    t2 += perf_counter() - t3
    anew[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
    t3 = perf_counter()
    kernel(aold, anew)    # even iteration
    t2 += perf_counter() - t3
    aold[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
heat[0] = np.sum( aold[1:-1, 1:-1] )  # system total heat
t1 = perf_counter()    # end of the whole run

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f" % (heat[0], t1-t0, t2) )

Heat: 750.0000 | Time: 24.3972 | Kernel: 24.3698


## Numba sequencial (1 thread)

In [4]:
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads, threading_layer

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations
# initialize the data arrays: the n x n grid plus one extra cell on every side
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
# initialize three heat sources as (row, column) pairs; np.intp is NumPy's
# index type, so the positions cannot silently wrap around the way int16
# does once n*8//9 exceeds 32767
sources      = np.array([ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ], dtype=np.intp)
heat         = 0.0   # system total heat

# computationally intensive core
# The decorator passes an explicit signature (two 2-D float64 arrays) and
# enables the fastmath, parallel and nogil options.
# NOTE(review): per the Numba documentation, supplying a signature makes njit
# compile at decoration time, which would keep compilation out of the timed
# loop - confirm.
# NOTE(review): fastmath relaxes strict IEEE semantics, so the last digits may
# differ from the plain NumPy version - confirm this is acceptable.
@njit('(float64[:,:],float64[:,:])', fastmath=True, parallel=True, nogil=True)
def kernel(anew, aold) :
    """Write one relaxation step of `aold` into the interior of `anew`.

    Every interior cell of `anew` becomes half of (the matching `aold` cell
    plus a quarter of the sum of its four direct neighbours).  The outermost
    rows and columns of `anew` are not written.  Returns nothing.
    """
    anew[1:-1,1:-1]=1/2.0*(aold[1:-1,1:-1]+1/4.0*(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2]))

# main routine
from time import perf_counter    # monotonic, high-resolution clock for interval timing

set_num_threads(1)     # ask Numba to use a single thread
t2 = 0.0               # accumulated time spent inside kernel()
t0 = perf_counter()    # start of the whole run
# Each pass performs two sweeps (aold -> anew -> aold), so the newest state
# always ends up in aold.  NOTE: because the range steps by 2, an odd niters
# is effectively rounded up to the next even sweep count.
for _ in range(0, niters, 2) :
    t3 = perf_counter()
    kernel(anew, aold)    # odd iteration
    t2 += perf_counter() - t3
    anew[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
    t3 = perf_counter()
    kernel(aold, anew)    # even iteration
    t2 += perf_counter() - t3
    aold[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
heat = np.sum( aold[1:-1, 1:-1] )    # system total heat
t1   = perf_counter()    # end of the whole run

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f" % (heat, t1-t0, t2) )
print("Threading layer chosen: %s | Thread count: %s" % (threading_layer(), get_num_threads()) )

Heat: 750.0000 | Time: 3.3275 | Kernel: 3.2966
Threading layer chosen: tbb | Thread count: 1


## 4 threads

Obs.: o hyper-threading e o turbo boost devem estar desligados; caso contrário, o ganho de velocidade com o aumento do número de threads pode não ser observado.

In [2]:
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads, threading_layer

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations

# working arrays: the n x n grid plus one extra cell on every side
heat         = np.zeros((1), np.float64)     # system total heat
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
# (row, column) of the three energy sources; np.intp is NumPy's index type,
# so the positions cannot silently wrap around the way int16 does once
# n*8//9 exceeds 32767
sources      = np.array([ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ], dtype=np.intp)

# computationally intensive core
# The decorator passes an explicit signature (two 2-D float64 arrays) and
# enables the fastmath, parallel and nogil options.
# NOTE(review): per the Numba documentation, supplying a signature makes njit
# compile at decoration time, which would keep compilation out of the timed
# loop - confirm.
# NOTE(review): fastmath relaxes strict IEEE semantics, so the last digits may
# differ from the plain NumPy version - confirm this is acceptable.
@njit('(float64[:,:],float64[:,:])', fastmath=True, parallel=True, nogil=True)
def kernel(anew, aold) :
    """Write one relaxation step of `aold` into the interior of `anew`.

    Every interior cell of `anew` becomes half of (the matching `aold` cell
    plus a quarter of the sum of its four direct neighbours).  The outermost
    rows and columns of `anew` are not written.  Returns nothing.
    """
    anew[1:-1,1:-1]=1/2.0*(aold[1:-1,1:-1]+1/4.0*(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2]))

# main routine
from time import perf_counter    # monotonic, high-resolution clock for interval timing

set_num_threads(4)     # ask Numba to use four threads
t2 = 0.0               # accumulated time spent inside kernel()
t0 = perf_counter()    # start of the whole run
# Each pass performs two sweeps (aold -> anew -> aold), so the newest state
# always ends up in aold.  NOTE: (niters+1)//2 passes means an odd niters is
# effectively rounded up to the next even sweep count.
for _ in range((niters+1)//2) :
    t3 = perf_counter()
    kernel(anew, aold)    # odd iteration
    t2 += perf_counter() - t3
    anew[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
    t3 = perf_counter()
    kernel(aold, anew)    # even iteration
    t2 += perf_counter() - t3
    aold[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
heat[0] = np.sum( aold[1:-1, 1:-1] )  # system total heat
t1 = perf_counter()    # end of the whole run

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f" % (heat[0], t1-t0, t2) )
print("Threading layer chosen: %s | Thread count: %s" % (threading_layer(), get_num_threads()) )

Heat: 750.0000 | Time: 1.2146 | Kernel: 1.1831
Threading layer chosen: tbb | Thread count: 4


## 16 threads

In [3]:
import numpy as np
from time import time
from numba import njit, set_num_threads, get_num_threads, threading_layer

# parameters
n            = 2400    # n x n grid
energy       = 1.0     # energy to be injected per iteration
niters       = 250     # number of iterations

# working arrays: the n x n grid plus one extra cell on every side
heat         = np.zeros((1), np.float64)     # system total heat
anew         = np.zeros((n + 2,  n + 2), np.float64)
aold         = np.zeros((n + 2,  n + 2), np.float64)
# (row, column) of the three energy sources; np.intp is NumPy's index type,
# so the positions cannot silently wrap around the way int16 does once
# n*8//9 exceeds 32767
sources      = np.array([ [n//2, n//2], [n//3, n//3], [n*4//5, n*8//9] ], dtype=np.intp)

# computationally intensive core
# The decorator passes an explicit signature (two 2-D float64 arrays) and
# enables the fastmath, parallel and nogil options.
# NOTE(review): per the Numba documentation, supplying a signature makes njit
# compile at decoration time, which would keep compilation out of the timed
# loop - confirm.
# NOTE(review): fastmath relaxes strict IEEE semantics, so the last digits may
# differ from the plain NumPy version - confirm this is acceptable.
@njit('(float64[:,:],float64[:,:])', fastmath=True, parallel=True, nogil=True)
def kernel(anew, aold) :
    """Write one relaxation step of `aold` into the interior of `anew`.

    Every interior cell of `anew` becomes half of (the matching `aold` cell
    plus a quarter of the sum of its four direct neighbours).  The outermost
    rows and columns of `anew` are not written.  Returns nothing.
    """
    anew[1:-1,1:-1]=1/2.0*(aold[1:-1,1:-1]+1/4.0*(aold[2:,1:-1]+aold[:-2,1:-1]+aold[1:-1,2:]+aold[1:-1,:-2]))

# main routine
from time import perf_counter    # monotonic, high-resolution clock for interval timing

set_num_threads(16)    # ask Numba to use sixteen threads
t2 = 0.0               # accumulated time spent inside kernel()
t0 = perf_counter()    # start of the whole run
# Each pass performs two sweeps (aold -> anew -> aold), so the newest state
# always ends up in aold.  NOTE: (niters+1)//2 passes means an odd niters is
# effectively rounded up to the next even sweep count.
for _ in range((niters+1)//2) :
    t3 = perf_counter()
    kernel(anew, aold)    # odd iteration
    t2 += perf_counter() - t3
    anew[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
    t3 = perf_counter()
    kernel(aold, anew)    # even iteration
    t2 += perf_counter() - t3
    aold[sources[:, 0], sources[:, 1]] += energy    # inject energy at the sources
heat[0] = np.sum( aold[1:-1, 1:-1] )  # system total heat
t1 = perf_counter()    # end of the whole run

# show result
print("Heat: %0.4f | Time: %0.4f | Kernel: %0.4f" % (heat[0], t1-t0, t2) )
print("Threading layer chosen: %s | Thread count: %s" % (threading_layer(), get_num_threads()) )

Heat: 750.0000 | Time: 0.7777 | Kernel: 0.7479
Threading layer chosen: tbb | Thread count: 16
