# Jetson Nano를 사용한 Federated Learning 학습 과제

* 이전 실습에서는 Django로 서버를 만들고 Federated Learning을 진행해 보았다.
* 이번 과제에서는 실제 Jetson Nano를 사용해서 Federated Learning을 진행해야 한다.

* ```Jetson``` 클래스를 사용해서 연결 가능한 모든 Jetson Nano에 한 번에(포트 순서대로) 커맨드를 보낸다.
* 현재 Jetson Nano는 20101 ~ 20106, 20111 ~ 20116, 20121 ~ 20126, 20131 ~ 20136 포트가 사용 가능함

In [42]:
try:
    import fabric
except:
    !pip install fabric
    
try:
    from IPython.display import clear_output
except:
    pass

In [43]:
import logging, socket, paramiko.ssh_exception
from fabric import Connection, Config, SerialGroup, ThreadingGroup, exceptions, runners
from fabric.exceptions import GroupException
from random import random

In [45]:
import paramiko
import time
import threading

class Jetson:
    """Send shell commands to a group of Jetson Nano boards over SSH.

    Each board is reached through the same host address on a different
    port. ``check()`` must be called first: it probes every candidate port
    and records the reachable ones, which is what ``send_command()`` and
    ``start_fed()`` iterate over.
    """

    def __init__(self, min_port, max_port):
        """Compute the candidate ports in ``[min_port, max_port]``.

        Args:
            min_port: Lowest port to consider (int or int-like string).
            max_port: Highest port to consider, inclusive.
        """
        self.address = "147.47.200.209"
        self.username, self.password = "jetson", "jetson"
        # Only ports whose last digit is 1..6 are kept (e.g. 20101-20106,
        # 20111-20116), matching the port layout used by the boards.
        self.ports = [i for i in range(int(min_port), int(max_port)+1) if 1<=i%10<=6]
        self.ssh_ports = []    # ports that answered the probe in check()
        self.connections = []  # fabric connections, parallel to ssh_ports
        self.threads = []      # worker threads started by start_fed()

    def check(self):
        """Probe every candidate port with ``ls`` and record reachable boards.

        Safe to call repeatedly: the list of reachable ports/connections is
        rebuilt from scratch on every call instead of being appended to.

        Returns:
            The number of reachable boards.
        """
        # Reset first so that a second call does not duplicate every board
        # (which would also inflate the client count returned below).
        self.ssh_ports = []
        self.connections = []

        for port in self.ports:
            con = Connection(f'{self.username}@{self.address}:{port}', connect_kwargs ={"password":self.password})
            command = 'ls'
            print(f'----------------{port}----------------')
            try:
                con.run(command)
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # board. KeyboardInterrupt is deliberately not caught.
                print('ERROR', repr(exc))
                con.close()  # do not leave a half-open connection behind
                continue
            self.ssh_ports.append(port)
            self.connections.append(con)

        print("Available ports", self.ssh_ports)
        return len(self.ssh_ports)

    def send_command(self, command):
        """Run ``command`` on every reachable board, one board after another.

        A failure on one board is reported and does not stop the loop.

        Args:
            command: Shell command line to execute remotely.
        """
        for port, con in zip(self.ssh_ports, self.connections):
            print(f'----------------{port}----------------')
            try:
                con.run(command)
            except Exception as exc:
                print('ERROR', repr(exc))

    def _run_in_thread(self, port, con, command):
        """Thread target: run ``command`` and report a failure with its port."""
        try:
            con.run(command)
        except Exception as exc:
            print(f'----------------{port}----------------')
            print('ERROR', repr(exc))

    def start_fed(self, experiment, max_round, num_samples, num_clients):
        """Start the federated-learning client on every reachable board.

        Each client is launched in its own thread so that the boards train
        concurrently; the call returns without waiting for them. The started
        threads are kept in ``self.threads`` so a caller can ``join()`` them.

        Args:
            experiment: Experiment number passed as ``--exp``.
            max_round: Number of rounds passed as ``--round``.
            num_samples: Sample count passed as ``--num``.
            num_clients: Accepted for interface compatibility; not used here
                (client ids are derived from the order of reachable boards).
        """
        self.threads = []
        for i, (port, con) in enumerate(zip(self.ssh_ports, self.connections)):
            command = f'docker exec temp python3 /ambient_fl/client.py --round {max_round} --num {num_samples} --id {i} --exp {experiment}'
            print(f'----------------{port}----------------')
            # Errors raised by con.run happen inside the worker thread, so
            # they are handled in _run_in_thread rather than around start().
            t = threading.Thread(target=self._run_in_thread, args=(port, con, command))
            try:
                t.start()
            except Exception as exc:
                print('ERROR', repr(exc))
                continue
            self.threads.append(t)
            # Stagger the launches slightly, as the original did.
            time.sleep(0.5)

* 명령어 실행에 필요한 변수들을 정의한다
    * ```MIN_PORT``` : 최소 포트값 (default = 20101)
    * ```MAX_PORT``` : 최대 포트값 (default = 20136)
    * ```EXPERIMENT``` : experiment 번호
    * ```MAX_ROUND``` : 학습할 round
    * ```NUM_SAMPLES``` : 각 클라이언트에서 학습에 사용할 수 있는 데이터의 개수 (experiment 3, 4에 대해서는 무시)
    * ```CLIENT_NUM``` : 클라이언트 수 (직접 지정하지 않고, 아래 ```jetson.check()```가 반환하는 연결 가능한 Jetson Nano의 수로 설정됨)

In [46]:
MIN_PORT = 20101  # lowest SSH port to probe
MAX_PORT = 20106  # highest SSH port to probe (inclusive)
EXPERIMENT = 1  # experiment number sent to the server and to each client
MAX_ROUND = 2  # number of rounds to train
NUM_SAMPLES = 100  # samples available per client (the notes above say experiments 3 and 4 ignore it)

In [47]:
jetson = Jetson(min_port = MIN_PORT, max_port=MAX_PORT)
CLIENT_NUM = jetson.check() # Must always be run before any other communication: it records the reachable boards

----------------20101----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FedML-IoT
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20102----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20103----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20104----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20105----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20106----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
Available ports [20101, 

* 아래 커맨드를 통해 서버를 초기화 시키고, 클라이언트 수, 실험, 전체 round 수를 서버에 알려줄 수 있다.
* 초기화가 성공한 경우 ```<Response [200]> Initialized server``` 라고 뜰 것이다.

In [48]:
import requests

# Reset the server and tell it the client count, experiment number and total
# number of rounds. The timeout makes the cell fail instead of hanging
# forever when the server is unreachable.
init = requests.get(f"http://147.47.200.178:9103/initialize/{CLIENT_NUM}/{EXPERIMENT}/{MAX_ROUND}", timeout=10)
print(init, init.text)

<Response [200]> Initialized server


# 전체 컨테이너 보기

In [53]:
# List all containers (running and stopped) on every reachable board.
jetson.send_command("docker ps -a")

----------------20101----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS          PORTS     NAMES
745c86df9b29   crazyboy9103/jetson_fl:latest   "/bin/bash"   37 minutes ago   Up 12 minutes             temp
----------------20102----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS          PORTS     NAMES
0510db3c21e4   crazyboy9103/jetson_fl:latest   "/bin/bash"   37 minutes ago   Up 12 minutes             temp
----------------20103----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS          PORTS     NAMES
033ab674ef12   crazyboy9103/jetson_fl:latest   "/bin/bash"   37 minutes ago   Up 12 minutes             temp
----------------20104----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS          PORTS     NAMES
22cd5de151c2   crazyboy9103/jetson_fl:latest   "/bin/bash"   37 minutes ago   Up 

# 실행중인 컨테이너 정지
* 학습 중인 컨테이너 정지

In [49]:
# Stop every container returned by `docker ps -a -q` on each reachable board.
jetson.send_command("docker stop $(docker ps -a -q)")

----------------20101----------------
745c86df9b29
----------------20102----------------
0510db3c21e4
----------------20103----------------
033ab674ef12
----------------20104----------------
22cd5de151c2
----------------20105----------------
287f9b7f841b
----------------20106----------------
79e7b643d3e7


# 정지된 컨테이너 실행

In [50]:
# Start the existing container named "temp" on each reachable board.
jetson.send_command("docker start temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# docker 이미지 최신으로 업데이트 (불필요)

In [32]:
# Pull the latest crazyboy9103/jetson_fl image on each reachable board.
jetson.send_command("docker pull crazyboy9103/jetson_fl:latest")

----------------20101----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:a0aeacf86984dc7bf07707460e7c07af19cf0b8c4258fc99dad4f1182c460418
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20102----------------
latest: Pulling from crazyboy9103/jetson_fl
04da93b342eb: Already exists
b235194751de: Already exists
606a67bb8db9: Already exists
9ce7ce1da17c: Already exists
f5299db1221c: Already exists
cb893097de39: Already exists
a36863a728ec: Already exists
86dd6e5994e2: Already exists
15a5811e1a7b: Already exists
b1cdeb9e69c9: Already exists
f0f57d03cad8: Already exists
f84ceb6e8887: Already exists
905b1329c1d4: Already exists
cfb2938be99f: Already exists
bf60857fb496: Already exists
0aac5305d11a: Already exists
08c23323368d: Already exists
93752947af53: Already exists
4d4b03e45e85: Already exists
890acf9522e1: Already exists
4285cec48fa4: Already exists
b2134f7c52d6: Already exists
fc89162c3e71: Al

# temp라는 이름의 컨테이너를 백그라운드에서 실행

In [33]:
# Create and start a detached container named "temp" with GPU access on each reachable board.
jetson.send_command("docker run -d -ti --name temp --gpus all crazyboy9103/jetson_fl:latest")

----------------20101----------------
745c86df9b2997f89832fcb3075a291fff64562b5713cde86c2c0a2db1572117
----------------20102----------------
0510db3c21e4a8bb022b9d10fa51108f578e7a70ba697b0b257be0c5058bce5b
----------------20103----------------
033ab674ef1220642afdaf78a19331483ac00a1cee559c20ee56798a37e288c9
----------------20104----------------
22cd5de151c267fd92a8766e8b0c9e74ba1d3f15c92505ba795f605cc2eb1ae0
----------------20105----------------
287f9b7f841b970f91615123e5c6116e72760f01c0cbd9769159da9f01fb08b8
----------------20106----------------
79e7b643d3e7e572880b60f2124b5435433951fe16b99d5c0d583d48a7644575


* 맨 아래 "Federated Learning 시작" 셀의 커맨드를 통해 ```MIN_PORT```부터 ```MAX_PORT```까지의 Jetson Nano에서 federated learning을 시작한다. (결과는 서버에 쌓임)

# 컨테이너 삭제
* 위에서 생성한 temp라는 컨테이너를 삭제한다. (학습 커맨드는 temp 컨테이너 안에서 실행되므로, 삭제한 뒤에는 컨테이너를 다시 생성해야 학습을 시작할 수 있다)

In [30]:
# Remove the container named "temp" on each reachable board.
jetson.send_command("docker rm temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# Federated Learning 시작

In [52]:
# Launch the FL client on every reachable board; each client runs in its own
# thread, so this call returns before training has finished.
jetson.start_fed(experiment=EXPERIMENT, 
                 max_round=MAX_ROUND,
                 num_samples=NUM_SAMPLES,
                 num_clients=CLIENT_NUM)

----------------20101----------------
----------------20102----------------
----------------20103----------------
----------------20104----------------
----------------20105----------------
----------------20106----------------


2021-11-18 05:12:14.926578: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:15.133035: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:15.557516: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:15.847980: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:16.490560: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:17.006726: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:22.707336: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic libr

2021-11-18 05:12:23.303058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:23.303409: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x30e54830 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 05:12:23.303471: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2021-11-18 05:12:23.303881: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:23.304017: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemorySize: 3.87GiB deviceMemoryBandwidth: 194.55MiB/s
2021-11-18 05:12:23.304096: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Suc

2021-11-18 05:12:24.028053: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-11-18 05:12:24.068369: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2021-11-18 05:12:24.096749: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10
2021-11-18 05:12:24.125494: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.10
2021-11-18 05:12:24.114700: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:24.115196: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x94def80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 05:12:24.115306: I tensorflow/compiler/xla/service/service.cc:176]   StreamExe

2021-11-18 05:12:24.261019: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-11-18 05:12:24.261165: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:24.261342: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:24.261413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1884] Adding visible gpu devices: 0
2021-11-18 05:12:24.261486: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 05:12:25.384654: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-11-18 05:12:25.397121: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:25.3972

2021-11-18 05:12:27.592624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1283] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-11-18 05:12:27.592715: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1289]      0 
2021-11-18 05:12:27.592745: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1302] 0:   N 
2021-11-18 05:12:27.593088: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:27.593434: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 05:12:27.593608: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1428] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 635 MB memory) -> physical GPU (device: 0, name: NVIDIA Tegra X1, pci bus id: 0000:00:00.0, compute capability: 5.3)
2021-11-18 05:12:28.073525: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1283] Device interc

global round 0
current round 0
Client 3needs update
train started
global round 1
current round 1
Client 3needs update
train started
global round 2
current round 2
Client 3 finished
global round 0
current round 0
Client 1needs update
train started
global round 0
current round 1
need wait
global round 1
current round 1
Client 1needs update
train started
global round 0
current round 2
Client 1 finished
global round 0
current round 0
Client 2needs update
train started
global round 0
current round 1
need wait
global round 1
current round 1
Client 2needs update
train started
global round 0
current round 2
Client 2 finished
global round 0
current round 0
Client 0needs update
train started
global round 1
current round 1
Client 0needs update
train started
global round 0
current round 2
Client 0 finished
global round 0
current round 0
Client 5needs update
train started
global round 0
current round 1
need wait
global round 1
current round 1
Client 5needs update
train started
global round 0
curren