# Jetson Nano를 사용한 Federated Learning 학습 과제

* 이전 실습에서 Django를 가지고 서버를 만들고 Federated learning을 진행해보는 실습을 진행하였다.
* 이번 과제에서는 실제 Jetson Nano를 사용해서 Federated Learning을 진행해야 한다.

* ```Jetson``` 클래스를 사용해서 접속 가능한 모든 Jetson Nano에 같은 커맨드를 한 번에 보낸다. (커맨드는 포트 순서대로 차례로 실행된다.)
* 현재 Jetson Nano는 20101 ~ 20106, 20111 ~ 20116, 20121 ~ 20126, 20131 ~ 20136 포트가 사용 가능함

In [18]:
try:
    import fabric
except:
    !pip install fabric
    
try:
    from IPython.display import clear_output
except:
    pass

In [19]:
import logging, socket, paramiko.ssh_exception
from fabric import Connection, Config, SerialGroup, ThreadingGroup, exceptions, runners
from fabric.exceptions import GroupException
from random import random

In [20]:
import paramiko
import time
import threading

class Jetson:
    """Send shell commands to a group of Jetson Nano boards over SSH.

    All boards are reached through one host address, each on its own port.
    Call :meth:`check` first: it probes every candidate port and keeps the
    connections that respond. :meth:`send_command` and :meth:`start_fed`
    only talk to the boards recorded by the most recent :meth:`check`.
    """

    def __init__(self, min_port, max_port, address="147.47.200.209",
                 username="jetson", password="jetson"):
        """Build the list of candidate ports; no connection is opened here.

        Args:
            min_port: Lowest port to consider (inclusive); int or numeric string.
            max_port: Highest port to consider (inclusive); int or numeric string.
            address: Host through which the boards are reached.
            username: SSH login name.
            password: SSH password.
        """
        self.address = address
        self.username, self.password = username, password
        # Keep only ports whose last digit is 1-6 (20101-20106, 20111-20116, ...).
        self.ports = [
            port
            for port in range(int(min_port), int(max_port) + 1)
            if 1 <= port % 10 <= 6
        ]
        self.ssh_ports = []    # ports that answered the last check()
        self.connections = []  # open connections, parallel to ssh_ports
        self.threads = []      # client threads started by the last start_fed()

    def check(self):
        """Probe every candidate port with ``ls`` and record the ones that answer.

        Returns:
            int: Number of ports on which the probe command succeeded.
        """
        # Rebuild both lists so that calling check() again does not register
        # the same board twice.
        self.ssh_ports = []
        self.connections = []

        for port in self.ports:
            con = Connection(
                f'{self.username}@{self.address}:{port}',
                connect_kwargs={"password": self.password},
            )
            print(f'----------------{port}----------------')
            try:
                con.run('ls')
            except Exception as err:
                # Unreachable board or failed command: report why and release
                # the connection instead of leaking it.
                print(f'ERROR: {err!r}')
                con.close()
            else:
                self.ssh_ports.append(port)
                self.connections.append(con)

        print("Available ports", self.ssh_ports)
        return len(self.ssh_ports)

    def send_command(self, command):
        """Run ``command`` on every checked board, one after another.

        A failure on one board is reported and does not stop the remaining
        boards from receiving the command.

        Args:
            command: Shell command line to execute remotely.
        """
        for port, con in zip(self.ssh_ports, self.connections):
            print(f'----------------{port}----------------')
            try:
                con.run(command)
            except Exception as err:
                print(f'ERROR: {err!r}')

    def start_fed(self, experiment, max_round, num_samples, num_clients):
        """Start the federated-learning client on every checked board.

        Each client is launched in its own thread with ``docker exec temp
        python3 /ambient_fl/client.py``, using the board's position in the
        checked list as ``--id``. The threads are not joined, so this method
        returns while the clients are still running; they are kept in
        ``self.threads`` for callers that want to wait on them.

        Args:
            experiment: Value passed to the client as ``--exp``.
            max_round: Value passed to the client as ``--round``.
            num_samples: Value passed to the client as ``--num``.
            num_clients: Accepted for backward compatibility; not used.
        """
        self.threads = []
        for i, (port, con) in enumerate(zip(self.ssh_ports, self.connections)):
            command = f'docker exec temp python3 /ambient_fl/client.py --round {max_round} --num {num_samples} --id {i} --exp {experiment}'
            print(f'----------------{port}----------------')
            try:
                worker = threading.Thread(
                    target=self._run_reporting_errors,
                    args=(con, command, port),
                )
                worker.start()
                self.threads.append(worker)
                # Stagger the launches by one second per board.
                time.sleep(1)
            except Exception as err:
                print(f'ERROR: {err!r}')

    @staticmethod
    def _run_reporting_errors(con, command, port):
        """Run ``command`` on ``con`` and report a failure inside the worker thread."""
        try:
            con.run(command)
        except Exception as err:
            print(f'ERROR on port {port}: {err!r}')

* 명령어 실행에 필요한 변수들을 정의한다
    * ```MIN_PORT``` : 최소 포트값 (default = 20101)
    * ```MAX_PORT``` : 최대 포트값 (default = 20136)
    * ```EXPERIMENT``` : experiment 번호
    * ```MAX_ROUND``` : 학습할 round
    * ```NUM_SAMPLES``` : 각 클라이언트에서 학습에 사용할 수 있는 데이터의 개수 (experiment 3, 4에 대해서는 무시)
    * ```CLIENT_NUM``` : 클라이언트 수 (아래에서 ```jetson.check()```가 반환하는, 접속에 성공한 Jetson Nano의 수로 설정된다)

In [21]:
# First and last SSH port to probe; passed to Jetson(min_port, max_port) below.
MIN_PORT = 20101
MAX_PORT = 20106
# Experiment number: sent to the server on initialization and to each client as --exp.
EXPERIMENT = 1
# Number of training rounds: sent to the server on initialization and to each client as --round.
MAX_ROUND = 5
# Number of samples each client may train on (client --num).
NUM_SAMPLES = 600

In [22]:
# Build the controller for the chosen port range, then probe every port.
jetson = Jetson(min_port = MIN_PORT, max_port=MAX_PORT)
CLIENT_NUM = jetson.check() # must be run before any other communication: it opens the connections the other methods use

----------------20101----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FedML-IoT
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20102----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20103----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20104----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20105----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20106----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
Available ports [20101, 

* 아래 커맨드를 통해 서버를 초기화 시키고, 클라이언트 수, 실험, 전체 round 수를 서버에 알려줄 수 있다.
* 초기화가 성공한 경우 ```<Response [200]> Initialized server``` 라고 뜰 것이다

In [23]:
import requests

# Tell the server the client count, experiment and total number of rounds.
# The timeout keeps the cell from hanging forever when the server is unreachable.
init = requests.get(
    f"http://147.47.200.178:9103/initialize/{CLIENT_NUM}/{EXPERIMENT}/{MAX_ROUND}",
    timeout=30,
)
print(init, init.text)

<Response [200]> Initialized server


# 전체 컨테이너 보기

In [24]:
# List every container on each board, including stopped ones (-a).
jetson.send_command("docker ps -a")

----------------20101----------------
CONTAINER ID   IMAGE          COMMAND       CREATED        STATUS                     PORTS     NAMES
e7cef17d1784   36d6d3fef19b   "/bin/bash"   17 hours ago   Exited (0) 3 minutes ago             temp
----------------20102----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED        STATUS          PORTS     NAMES
ff49389345de   crazyboy9103/jetson_fl:latest   "/bin/bash"   17 hours ago   Up 15 minutes             temp
----------------20103----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED        STATUS          PORTS     NAMES
e83d26850302   crazyboy9103/jetson_fl:latest   "/bin/bash"   17 hours ago   Up 15 minutes             temp
----------------20104----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED        STATUS          PORTS     NAMES
d0e0dd1b2ee2   crazyboy9103/jetson_fl:latest   "/bin/bash"   17 hours ago   Up 15 minutes             temp


# 실행중인 컨테이너 정지
* 학습 중인 컨테이너 정지

In [25]:
# `docker ps -a -q` prints only container IDs; each of them is passed to `docker stop`.
jetson.send_command("docker stop $(docker ps -a -q)")

----------------20101----------------
e7cef17d1784
----------------20102----------------
ff49389345de
----------------20103----------------
e83d26850302
----------------20104----------------
d0e0dd1b2ee2
----------------20105----------------
14771fdb2ec7
----------------20106----------------
d254e301da94


# 정지된 컨테이너 실행

In [13]:
# Restart the existing (stopped) container named "temp" on each board.
jetson.send_command("docker start temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# docker 이미지 최신으로 업데이트 (불필요)

In [27]:
# Pull the latest crazyboy9103/jetson_fl image on each board.
jetson.send_command("docker pull crazyboy9103/jetson_fl:latest")

----------------20101----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:61a95436031e258a51574ec8987c321db264c5a682f1c59ba87691503edaec55
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20102----------------
latest: Pulling from crazyboy9103/jetson_fl
04da93b342eb: Already exists
b235194751de: Already exists
606a67bb8db9: Already exists
9ce7ce1da17c: Already exists
f5299db1221c: Already exists
cb893097de39: Already exists
a36863a728ec: Already exists
86dd6e5994e2: Already exists
15a5811e1a7b: Already exists
b1cdeb9e69c9: Already exists
f0f57d03cad8: Already exists
f84ceb6e8887: Already exists
905b1329c1d4: Already exists
cfb2938be99f: Already exists
bf60857fb496: Already exists
0aac5305d11a: Already exists
08c23323368d: Already exists
93752947af53: Already exists
4d4b03e45e85: Already exists
890acf9522e1: Already exists
4285cec48fa4: Already exists
b2134f7c52d6: Already exists
fc89162c3e71: Al

# temp라는 이름의 컨테이너를 생성하고 실행

In [28]:
# Create and start a detached container named "temp" from the jetson_fl image, with all GPUs exposed.
jetson.send_command("docker run -d -ti --name temp --gpus all crazyboy9103/jetson_fl:latest")

----------------20101----------------
05001744ca9723ce81e0473ff2d2cbbbd2980349b65bea800e96b62776435bfa
----------------20102----------------
84ad9bc76f7b1504dc890a7339edc91c57ee4155d714ff8db1acdb53fc7e6232
----------------20103----------------
d4eb8f51df1e5c97f9e20d818e0b4f9c78cc80380dc95ffb2902fc00c0f3c5cf
----------------20104----------------
f4d8e53866eef763893ce925445fdc1c65d400320535682732c41befe8c3aede
----------------20105----------------
a8174f76655d4a69c78ab7572ff5a4a1b6f9de23da6ec91139a6d311412ab2e3
----------------20106----------------
698dafbb3877ebbf5582d7a64cd0842b3d51d0f155b652db1a338c59e725b319


* 아래 "Federated Learning 시작" 절의 커맨드를 통해 ```MIN_PORT``` 부터 ```MAX_PORT```까지의 Jetson Nano에서 federated learning을 시작한다. (결과는 서버에 쌓임)

# 컨테이너 삭제
* 위에서 생성한 temp라는 컨테이너를 삭제한다 (실행 중인 컨테이너는 먼저 정지해야 하며, 삭제한 뒤 학습을 진행하려면 컨테이너를 다시 생성해야 한다)

In [26]:
# Remove the container named "temp" on each board.
jetson.send_command("docker rm temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


# Federated Learning 시작

In [29]:
# Launch client.py inside the "temp" container on every checked board.
# Each client runs in its own thread, so this call returns while the
# clients keep printing their logs below.
jetson.start_fed(experiment=EXPERIMENT, 
                 max_round=MAX_ROUND,
                 num_samples=NUM_SAMPLES,
                 num_clients=CLIENT_NUM)

----------------20101----------------
----------------20102----------------
----------------20103----------------


2021-11-19 01:38:10.307176: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2


----------------20104----------------


2021-11-19 01:38:10.723713: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2


----------------20105----------------


2021-11-19 01:38:11.813681: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2


----------------20106----------------


2021-11-19 01:38:13.037194: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-19 01:38:14.006387: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-19 01:38:14.766557: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-19 01:38:18.188096: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-11-19 01:38:18.204333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:18.204523: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemorySize: 3.87GiB deviceMemoryBandwidth: 194.55Mi

2021-11-19 01:38:19.125914: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:19.126044: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemorySize: 3.87GiB deviceMemoryBandwidth: 194.55MiB/s
2021-11-19 01:38:19.126135: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-19 01:38:19.126248: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2021-11-19 01:38:19.126321: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2021-11-19 01:38:19.126363: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.1

2021-11-19 01:38:20.906062: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.10
2021-11-19 01:38:20.910906: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-11-19 01:38:20.911402: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:20.911902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:20.912275: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1884] Adding visible gpu devices: 0
2021-11-19 01:38:20.938867: W tensorflow/core/platform/profile_utils/cpu_utils.cc:108] Failed to find bogomips or clock in /proc/cpuinfo; cannot determine CPU frequency
2021-11-19 01:38:20.939806: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3b5c5520 initialized for platform Host (this doe

2021-11-19 01:38:22.338467: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10
2021-11-19 01:38:22.338508: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.10
2021-11-19 01:38:22.338549: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-11-19 01:38:22.338697: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:22.338883: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:22.338961: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1884] Adding visible gpu devices: 0
2021-11-19 01:38:22.339069: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-19 01:38:

2021-11-19 01:38:25.774287: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1302] 0:   N 
2021-11-19 01:38:25.774682: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:25.775050: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-19 01:38:25.775278: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1428] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 366 MB memory) -> physical GPU (device: 0, name: NVIDIA Tegra X1, pci bus id: 0000:00:00.0, compute capability: 5.3)
2021-11-19 01:38:26.035975: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2021-11-19 01:38:26.393282: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1283] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-11-19 01:38:26.393371: I tensorflow/core/c