# Jetson Nano를 사용해서 Federated Learning으로 학습 과제

* 이전 실습에서 Django를 가지고 서버를 만들고 Federated learning을 진행해보는 실습을 진행하였다.
* 이번 과제에서는 실제 Jetson Nano를 사용해서 Federated Learning을 진행해야 한다.

* ```Jetson```클래스를 사용해서 모든 Jetson nano에 동시에 커맨드를 보낸다.
* 현재 Jetson Nano는 20101 ~ 20106, 20111 ~ 20116, 20121 ~ 20126, 20131 ~ 20136 포트가 사용 가능함

In [1]:
try:
    import fabric
except:
    !pip install fabric
    
try:
    from IPython.display import clear_output
except:
    pass

In [2]:
import logging, socket, paramiko.ssh_exception
from fabric import Connection, Config, SerialGroup, ThreadingGroup, exceptions, runners
from fabric.exceptions import GroupException
from random import random

In [3]:
import paramiko
import time
import threading

class Jetson:
    def __init__(self, min_port, max_port):
        self.address = "147.47.200.209"
        self.username, self.password = "jetson", "jetson"
        self.ports = [i for i in range(int(min_port), int(max_port)+1) if 1<=i%10<=6]
        self.ssh_ports = []
        self.connections = []
        
    def check(self):
        for port in self.ports:
            con = Connection(f'{self.username}@{self.address}:{port}', connect_kwargs ={"password":self.password})
            command = 'ls'
            print(f'----------------{port}----------------')
            try:
                con.run(command)
                self.ssh_ports.append(port)
                self.connections.append(con)
            except:
                print('ERROR')

        print("Available ports", self.ssh_ports)
        return len(self.ssh_ports)
            
    
    
    def send_command(self, command):
        for port, con in zip(self.ssh_ports, self.connections): 
            print(f'----------------{port}----------------')
            try:
                con.run(command)

            except:
                print('ERROR')

                        
    def start_fed(self, experiment, max_round, num_samples, num_clients):
        for i, (port, con) in enumerate(zip(self.ssh_ports, self.connections)):
            command = f'docker exec temp python3 /ambient_fl/client.py --round {max_round} --num {num_samples} --id {i} --exp {experiment}'
            print(f'----------------{port}----------------')
            try:
                t=threading.Thread(target=con.run,args=(command,))
                t.start()
                time.sleep(0.5)
            except:
                print('ERROR')

* 명령어 실행에 필요한 변수들을 정의한다
    * ```MIN_PORT``` : 최소 포트값 (default = 20101)
    * ```MAX_PORT``` : 최대 포트값 (default = 20136)
    * ```EXPERIMENT``` : experiment 번호
    * ```MAX_ROUND``` : 학습할 round
    * ```NUM_SAMPLES``` : 각 클라이언트에서 학습에 사용할 수 있는 데이터의 개수 (experiment 3, 4에 대해서는 무시)
    * ```CLIENT_NUM``` : 클라이언트 수

In [4]:
MIN_PORT = 20101
MAX_PORT = 20106
EXPERIMENT = 1
MAX_ROUND = 5
NUM_SAMPLES = 600

In [5]:
jetson = Jetson(min_port = MIN_PORT, max_port=MAX_PORT)
CLIENT_NUM = jetson.check() # 통신 전에 무조건 실행되야 함

----------------20101----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FedML-IoT
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20102----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20103----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20104----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20105----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20106----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
Available ports [20101, 

* 아래 커맨드를 통해 서버를 초기화 시키고, 클라이언트 수, 실험, 전체 round 수를 서버에 알려줄 수 있다.
* 초기화가 성공한 경우 ```<Response [200]> Initialized server>``` 라고 뜰 것이다

In [6]:
import requests
init = requests.get(f"http://147.47.200.178:9103/initialize/{CLIENT_NUM}/{EXPERIMENT}/{MAX_ROUND}")
print(init, init.text)

<Response [200]> Initialized server


# 전체 컨테이너 보기

In [7]:
jetson.send_command("docker ps -a")

----------------20101----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED       STATUS       PORTS     NAMES
745c86df9b29   crazyboy9103/jetson_fl:latest   "/bin/bash"   4 hours ago   Up 3 hours             temp
----------------20102----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED       STATUS       PORTS     NAMES
0510db3c21e4   crazyboy9103/jetson_fl:latest   "/bin/bash"   4 hours ago   Up 3 hours             temp
----------------20103----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED       STATUS       PORTS     NAMES
033ab674ef12   crazyboy9103/jetson_fl:latest   "/bin/bash"   4 hours ago   Up 3 hours             temp
----------------20104----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED       STATUS       PORTS     NAMES
22cd5de151c2   crazyboy9103/jetson_fl:latest   "/bin/bash"   4 hours ago   Up 3 hours             temp
----------------2010

# 실행중인 컨테이너 정지
* 학습 중인 컨테이너 정지

In [19]:
jetson.send_command("docker stop $(docker ps -a -q)")

----------------20101----------------


Exception in thread Thread-244:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<decorator-gen-3>", line 2, in run
  File "/usr/local/lib/python3.8/dist-packages/fabric/connection.py", line 30, in opens
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/fabric/connection.py", line 723, in run
    return self._run(self._remote_runner(), command, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/invoke/context.py", line 102, in _run
    return runner.run(command, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/invoke/runners.py", line 380, in run
    return self._run_body(command, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/invoke/runners.py", line 442, in _run_body
    return self.make_promise() if self._asynchronous else self._finish()
 

e7cef17d1784
----------------20102----------------
ff49389345de
----------------20103----------------
e83d26850302
----------------20104----------------
d0e0dd1b2ee2
----------------20105----------------
14771fdb2ec7
----------------20106----------------
d254e301da94


# 정지된 컨테이너 실행

In [20]:
jetson.send_command("docker start temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# docker 이미지 최신으로 업데이트 (불필요)

In [10]:
jetson.send_command("docker pull crazyboy9103/jetson_fl:latest")

----------------20101----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:197f27a2870b47e5a25492e776a1abed997191069fe8f99310958be9b1a101ba
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20102----------------
latest: Pulling from crazyboy9103/jetson_fl
04da93b342eb: Already exists
b235194751de: Already exists
606a67bb8db9: Already exists
9ce7ce1da17c: Already exists
f5299db1221c: Already exists
cb893097de39: Already exists
a36863a728ec: Already exists
86dd6e5994e2: Already exists
15a5811e1a7b: Already exists
b1cdeb9e69c9: Already exists
f0f57d03cad8: Already exists
f84ceb6e8887: Already exists
905b1329c1d4: Already exists
cfb2938be99f: Already exists
bf60857fb496: Already exists
0aac5305d11a: Already exists
08c23323368d: Already exists
93752947af53: Already exists
4d4b03e45e85: Already exists
890acf9522e1: Already exists
4285cec48fa4: Already exists
b2134f7c52d6: Already exists
fc89162c3e71: Al

# temp라는 이름의 컨테이너를 실행

In [13]:
jetson.send_command("docker run -d -ti --name temp --gpus all crazyboy9103/jetson_fl:latest")

----------------20101----------------
e7cef17d17840c08025ad7064e2aa492d76b65d05d4795a80b84c7df11b2c434
----------------20102----------------
ff49389345de2b6923d89a4ee5bf30e33c596d4b8b0736557ff690c0d4434c1a
----------------20103----------------
e83d268503022f3517c170150c85befd6921a06ebc540e8b898d5da9f0403732
----------------20104----------------
d0e0dd1b2ee2915be5a887994072a9d7e6e53212d3b4a0e3a6a148e2624d3ff4
----------------20105----------------
14771fdb2ec7c502da113ca1c0bdc1bdb39cfd6b597d2045ff306eadf29826d9
----------------20106----------------
d254e301da94c1b0a332c3e50e99b75d60ff05c35fcb047676276c7f01a21c32


* 아래 커맨드를 통해 ```MIN_PORT``` 부터 ```MAX_PORT```까지 federated learning을 시작한다. (결과는 서버에 쌓임) 

# 컨테이너 삭제
* 위에서 생성한 temp라는 컨테이너를 삭제한다

In [12]:
jetson.send_command("docker rm temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# Federated Learning 시작

In [21]:
jetson.start_fed(experiment=EXPERIMENT, 
                 max_round=MAX_ROUND,
                 num_samples=NUM_SAMPLES,
                 num_clients=CLIENT_NUM)

----------------20101----------------
----------------20102----------------
----------------20103----------------
----------------20104----------------
----------------20105----------------
----------------20106----------------


2021-11-18 08:56:02.311697: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:02.372031: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:02.954717: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:03.623364: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:03.974601: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:04.413150: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:09.688450: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic libr

2021-11-18 08:56:10.342004: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:10.342315: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1024aa10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 08:56:10.342380: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2021-11-18 08:56:10.342791: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:10.342922: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemorySize: 3.87GiB deviceMemoryBandwidth: 194.55MiB/s
2021-11-18 08:56:10.343001: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Suc

2021-11-18 08:56:10.969285: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-11-18 08:56:10.970261: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-11-18 08:56:10.974773: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10
2021-11-18 08:56:10.978200: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.10
2021-11-18 08:56:10.979164: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-11-18 08:56:10.979430: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:10.979703: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA n

2021-11-18 08:56:11.331382: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:11.331601: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:11.331681: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1884] Adding visible gpu devices: 0
2021-11-18 08:56:11.331774: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 08:56:12.479819: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-11-18 08:56:12.488156: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:12.488316: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 com

2021-11-18 08:56:15.078746: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:15.079036: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 08:56:15.079210: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1428] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 720 MB memory) -> physical GPU (device: 0, name: NVIDIA Tegra X1, pci bus id: 0000:00:00.0, compute capability: 5.3)
2021-11-18 08:56:15.161932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1283] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-11-18 08:56:15.162024: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1289]      0 
2021-11-18 08:56:15.162053: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1302] 0:   N 
2021-11-18 08:56:15.162424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 doe

global round 0
current round 0
Client 5needs update
train started
global round 0
current round 1
need wait
global round 0
current round 1
need wait
global round 1
current round 1
Client 5needs update
train started
global round 2
current round 2
Client 5needs update
train started
global round 2
current round 3
need wait
global round 3
current round 3
Client 5needs update
train started
global round 4
current round 4
Client 5needs update
train started
global round 4
current round 5
Client 5 finished
global round 0
current round 0
Client 4needs update
train started
global round 0
current round 1
need wait
global round 1
current round 1
Client 4needs update
train started
global round 2
current round 2
Client 4needs update
train started
global round 2
current round 3
need wait
global round 3
current round 3
Client 4needs update
train started
global round 4
current round 4
Client 4needs update
train started
global round 5
current round 5
Client 4 finished
global round 0
current round 0
Client