# Jetson Nano를 사용한 Federated Learning 학습 과제

* 이전 실습에서는 Django로 서버를 만들고 Federated Learning을 진행해 보았다.
* 이번 과제에서는 실제 Jetson Nano를 사용해서 Federated Learning을 진행해야 한다.

* ```Jetson```클래스를 사용해서 모든 Jetson nano에 커맨드를 보낸다.
* 현재 Jetson은 20101 ~ 20106, 20111 ~ 20116, 20121 ~ 20126, 20131 ~ 20136 포트가 사용 가능함

In [1]:
try:
    import fabric
except:
    !pip install fabric
    
try:
    from IPython.display import clear_output
except:
    pass

In [2]:
import logging, socket, paramiko.ssh_exception
from fabric import Connection, Config, SerialGroup, ThreadingGroup, exceptions, runners
from fabric.exceptions import GroupException
from random import random

In [8]:
import paramiko
import time
import argparse
import requests
from tqdm import tqdm
import threading
import time

class Jetson:
    """Send shell commands to a group of Jetson Nano boards over SSH.

    Every board is addressed as ``username@address:port``; the boards differ
    only by port. ``check()`` must be called first: it is the only method that
    fills ``ssh_ports`` / ``connections``, which the other methods iterate.

    Attributes:
        ports: candidate ports derived from ``min_port``..``max_port``.
        ssh_ports: ports that answered the probe in the last ``check()``.
        connections: open connections, index-aligned with ``ssh_ports``.
        threads: worker threads started by the last ``start_fed()`` call.
    """

    def __init__(self, min_port, max_port, address="147.47.200.209",
                 username="jetson", password="jetson"):
        """Compute the candidate port list; no network access happens here.

        Args:
            min_port: first port of the range (int or numeric string).
            max_port: last port of the range, inclusive.
            address: host used for every board.
            username: SSH user name.
            password: SSH password.
        """
        self.address = address
        self.username, self.password = username, password
        # Only ports whose last digit is 1..6 are kept
        # (e.g. 20101-20106, 20111-20116, ...).
        self.ports = [
            port
            for port in range(int(min_port), int(max_port) + 1)
            if 1 <= port % 10 <= 6
        ]
        self.ssh_ports = []
        self.connections = []
        self.threads = []

    def check(self):
        """Probe every candidate port with ``ls`` and keep the ones that work.

        Returns:
            The number of ports that answered successfully.
        """
        # Start from scratch so that re-running check() does not register the
        # same port twice (which would also inflate the returned client count).
        self.ssh_ports = []
        self.connections = []

        for port in self.ports:
            con = Connection(f'{self.username}@{self.address}:{port}', connect_kwargs ={"password":self.password})
            print(f'----------------{port}----------------')
            try:
                con.run('ls')
            except Exception as err:
                # Exception (not a bare except) keeps Ctrl-C working while a
                # board is unreachable.
                print(f'ERROR: {err}')
                # Do not leave a half-open connection behind for a bad port.
                con.close()
            else:
                self.ssh_ports.append(port)
                self.connections.append(con)

        print("Available ports", self.ssh_ports)
        return len(self.ssh_ports)

    def send_command(self, command):
        """Run ``command`` on every checked board, one after another.

        A failure on one board is printed and does not stop the others.
        """
        for port, con in zip(self.ssh_ports, self.connections):
            print(f'----------------{port}----------------')
            try:
                con.run(command)
            except Exception as err:
                print(f'ERROR: {err}')

    def _run_reporting(self, con, command, port):
        """Thread target: run ``command`` on ``con`` and print any failure."""
        try:
            con.run(command)
        except Exception as err:
            print(f'ERROR on port {port}: {err}')

    def start_fed(self, experiment, max_round, num_samples, num_clients):
        """Start the federated-learning client on every checked board.

        Each board gets its own thread, so the call returns without waiting
        for the clients to finish. The started threads are kept in
        ``self.threads`` so a caller can ``join()`` them.

        Args:
            experiment: experiment number passed as ``--exp``.
            max_round: number of rounds passed as ``--round``.
            num_samples: sample count passed as ``--num``.
            num_clients: accepted for interface compatibility; it is not used
                when building the client command.
        """
        self.threads = []
        for client_id, (port, con) in enumerate(zip(self.ssh_ports, self.connections)):
            command = f'docker exec temp python3 /ambient_fl/client.py --round {max_round} --num {num_samples} --id {client_id} --exp {experiment}'
            print(f'----------------{port}----------------')
            # The remote call runs inside the worker thread, so its errors are
            # handled there; this try only covers failing to start the thread.
            worker = threading.Thread(target=self._run_reporting, args=(con, command, port))
            try:
                worker.start()
            except Exception as err:
                print(f'ERROR: {err}')
            else:
                self.threads.append(worker)

* 명령어 실행에 필요한 변수들을 정의한다
    * ```MIN_PORT``` : 최소 포트값 (default = 20101)
    * ```MAX_PORT``` : 최대 포트값 (default = 20136)
    * ```EXPERIMENT``` : experiment 번호
    * ```MAX_ROUND``` : 학습할 round
    * ```NUM_SAMPLES``` : 각 클라이언트에서 학습에 사용할 수 있는 데이터의 개수 (experiment 3, 4에 대해서는 무시)
    * ```CLIENT_NUM``` : 클라이언트 수

In [9]:
# First and last SSH port to scan (inclusive); Jetson keeps only ports ending in 1-6.
MIN_PORT = 20101
MAX_PORT = 20106
EXPERIMENT = 1  # experiment number sent to the server and to every client
MAX_ROUND = 5  # number of training rounds
NUM_SAMPLES = 600  # number of samples each client may use for training

In [10]:
jetson = Jetson(min_port = MIN_PORT, max_port=MAX_PORT)
# check() must always run before any other call: it opens the connections the
# other methods use and returns how many boards answered.
CLIENT_NUM = jetson.check()

----------------20101----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FedML-IoT
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20102----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20103----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20104----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20105----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
----------------20106----------------
ambient_fl
Desktop
Documents
Downloads
examples.desktop
FL
jetson
jetson_network.sh
Music
Pictures
Public
Templates
Videos
Available ports [20101, 

* 아래 커맨드를 통해 서버를 초기화 시키고, 클라이언트 수, 실험, 전체 round 수를 서버에 알려줄 수 있다.
* 초기화가 성공한 경우 ```<Response [200]> Initialized server``` 라고 뜰 것이다

In [115]:
import requests

# Reset the FL server and tell it the client count, experiment number and
# total number of rounds. The timeout keeps the cell from hanging forever
# when the server cannot be reached.
init = requests.get(
    f"http://147.47.200.178:9103/initialize/{CLIENT_NUM}/{EXPERIMENT}/{MAX_ROUND}",
    timeout=30,
)
print(init, init.text)

<Response [200]> Initialized server


In [6]:
# Stop every container (running or not) listed by `docker ps -a -q` on each board.
jetson.send_command("docker stop $(docker ps -a -q)")

----------------20101----------------
e67b09ca440a
----------------20102----------------
f5a94b1e067e
----------------20103----------------
8ba4ee27ca05
----------------20104----------------
657c33f85423
----------------20105----------------
f5138f72af99
----------------20106----------------
1328e4e79959


In [109]:
# Fetch the latest client image on each board.
jetson.send_command("docker pull crazyboy9103/jetson_fl:latest")

----------------20101----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:f85a28e7d790b5e099f8c996970ef9e60438bffd28f2cd70e974dbfc547fb46a
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20102----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:f85a28e7d790b5e099f8c996970ef9e60438bffd28f2cd70e974dbfc547fb46a
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20103----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:f85a28e7d790b5e099f8c996970ef9e60438bffd28f2cd70e974dbfc547fb46a
Status: Image is up to date for crazyboy9103/jetson_fl:latest
docker.io/crazyboy9103/jetson_fl:latest
----------------20104----------------
latest: Pulling from crazyboy9103/jetson_fl
Digest: sha256:f85a28e7d790b5e099f8c996970ef9e60438bffd28f2cd70e974dbfc547fb46a
Status: Image is up to date for crazyboy9103/j

In [110]:
# Create and start a detached container named `temp` with GPU access on each board.
jetson.send_command("docker run -d -ti --name temp --gpus all crazyboy9103/jetson_fl:latest")

----------------20101----------------
e67b09ca440a7ca4953e114b0b77d55548e1bce4573c3f48ce376cb3eab311a0
----------------20102----------------
f5a94b1e067e1b811ac16e51055058d25b7e9a1d00464bf686ecfdc3898e049b
----------------20103----------------
8ba4ee27ca05fa4f9d0de2cc438466e6011eec3357e6f6898195956010f4cb5e
----------------20104----------------
657c33f85423b44f459da014236003188e52b02c1272147ae8d03eee0c3072fb
----------------20105----------------
f5138f72af99d23d6f0e3f6a95f523f922398a014347bb57aeae2a7e24ff6b42
----------------20106----------------
1328e4e799598f5171b0e2f603cf13b95222aa796ed6c1254bc5cba9aaa2a1bd


In [11]:
# List all containers, including stopped ones, to verify the state of `temp`.
jetson.send_command("docker ps -a")

----------------20101----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS                     PORTS     NAMES
e67b09ca440a   crazyboy9103/jetson_fl:latest   "/bin/bash"   11 minutes ago   Exited (0) 8 minutes ago             temp
----------------20102----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS                     PORTS     NAMES
f5a94b1e067e   crazyboy9103/jetson_fl:latest   "/bin/bash"   11 minutes ago   Exited (0) 7 minutes ago             temp
----------------20103----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS                     PORTS     NAMES
8ba4ee27ca05   crazyboy9103/jetson_fl:latest   "/bin/bash"   11 minutes ago   Exited (0) 7 minutes ago             temp
----------------20104----------------
CONTAINER ID   IMAGE                           COMMAND       CREATED          STATUS                     PORTS     NAMES
657c

* 아래 커맨드를 통해 ```MIN_PORT``` 부터 ```MAX_PORT```까지 federated learning을 시작한다. (결과는 서버에 쌓임) 

In [11]:
# Start the existing `temp` container; start_fed runs the client inside it via `docker exec`.
jetson.send_command("docker start temp")

----------------20101----------------
temp
----------------20102----------------
temp
----------------20103----------------
temp
----------------20104----------------
temp
----------------20105----------------
temp
----------------20106----------------
temp


In [12]:
# Launch client.py inside the `temp` container on every checked board, one
# thread per board; the call returns without waiting for training to finish.
jetson.start_fed(experiment=EXPERIMENT, 
                 max_round=MAX_ROUND,
                 num_samples=NUM_SAMPLES,
                 num_clients=CLIENT_NUM)

----------------20101----------------
----------------20102----------------
----------------20103----------------
----------------20104----------------
----------------20105----------------
----------------20106----------------


2021-11-18 03:06:12.999610: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:13.079914: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:14.431469: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:14.607756: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:14.904928: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:15.031684: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.2
2021-11-18 03:06:18.949773: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic libr

2021-11-18 03:06:19.504302: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:19.504605: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x27efe770 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 03:06:19.504669: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2021-11-18 03:06:19.505200: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:19.505368: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemorySize: 3.87GiB deviceMemoryBandwidth: 194.55MiB/s
2021-11-18 03:06:19.505461: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Suc

2021-11-18 03:06:23.390808: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2021-11-18 03:06:23.455205: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:23.455557: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1c0f3350 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 03:06:23.455624: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2021-11-18 03:06:23.456264: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:23.456442: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemory

2021-11-18 03:06:24.062864: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2021-11-18 03:06:24.150079: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:24.150584: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x40558350 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-18 03:06:24.150644: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2021-11-18 03:06:24.151034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:24.151163: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1742] Found device 0 with properties: 
pciBusID: 0000:00:00.0 name: NVIDIA Tegra X1 computeCapability: 5.3
coreClock: 0.9216GHz coreCount: 1 deviceMemory

2021-11-18 03:06:29.605484: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:29.605766: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 does not support NUMA - returning NUMA node zero
2021-11-18 03:06:29.605935: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1428] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 306 MB memory) -> physical GPU (device: 0, name: NVIDIA Tegra X1, pci bus id: 0000:00:00.0, compute capability: 5.3)
2021-11-18 03:06:29.685857: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1283] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-11-18 03:06:29.685947: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1289]      0 
2021-11-18 03:06:29.685977: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1302] 0:   N 
2021-11-18 03:06:29.686358: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1046] ARM64 doe