# Jetson Nano를 사용해서 Federated Learning으로 학습 과제
* 이전 실습에서 Django를 가지고 서버를 만들고 Federated learning을 구현해보는 실습을 진행하였다.
* 이번 과제에서는 Jetson Nano를 사용해서 Federated Learning을 진행해야 한다.

* 우선 필요한 패키지를 설치한다.

In [12]:
!pip install -q -r ../requirements.txt

In [11]:
%%writefile jetson_client.py
# Usage : python jetson_client.py --ip IP --p PORT
import matplotlib.pyplot as plt
import argparse
import json
import os
import threading
import time
from random import random
import numpy as np
import requests
import tensorflow as tf

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Quiet tensorflow error messages

class NumpyEncoder(json.JSONEncoder): # inherits JSONEncoder 
    def default(self, o):
        if isinstance(o, np.ndarray):
            return o.tolist()
        return json.JSONEncoder.default(self, o)

class Client:
    def __init__(self, max_round: int, time_delay = 5, suppress=True, num_samples=600, cliend_id = 0, experiment = 1):
        """
        @params: 
            experiment : Desired data split type (1~4)
            max_round : the maximum number of rounds that should be trained (arbitrary integer)
            model : the NN model type (either 'ann' or 'cnn')
            time_delay : the time delay until the next local check (arbitrary positive integer) 
                        (Need to increase this value if one round of training takes much longer than current time_delay. 
                        The reason is that any network communication until next round after the client has already uploaded 
                        the parameters for current round increases network overhead. Thus, higher time_delay will make communication
                        more stable while increasing the absolute time it takes. Requires careful selection of this value.)
            suppress : boolean value to print the logs
        
        @return: 
            None : Initializes the variables
                   Setup the urls for communication
                   Fetch client's id from the server
                   Downloads MNIST dataset and splits
                   Build model
        """
        
        '''
        Urls
        '''
        
        base_url = f"http://{IP}:{PORT}/" # Base Url
        self.put_weight_url = base_url + f"put_weight/{client_id}"
        self.get_weight_url = base_url + "get_server_weight" # Url that we send or fetch weight parameters
        self.round_url = base_url + "get_server_round" 
        self.put_accuracy_url = base_url + f"put_accuracy/{client_id}"
        
        '''
        Initial setup
        '''
        self.experiment = experiment
        self.client_id = client_id
        self.time_delay = time_delay
        self.suppress = suppress
        self.global_round = self.request_global_round()
        self.current_round = 0
        self.max_round = max_round # Set the maximum number of rounds
        
        '''
        Downloads MNIST dataset and prepares (train_x, train_y), (test_x, test_y)
        '''
        self.train_images, self.train_labels = None, None
        self.test_images, self.test_labels = None, None
        self.prepare_images()
        
        self.train_index_list = None
        self.test_index_list = None
        self.split_train_images = []
        self.split_train_labels = []
        
        self.local_data_num = 0
        self.data_split(num_samples=num_samples)
        
        '''
        Builds model
        '''
        self.model = None
        self.build_cnn_model()
        
    def prepare_images(self):
        """
        @params: 
            model : 'ann' or 'cnn'. They need slightly different format for the input. For cnn, we add additional dimension for channel
        
        @return: 
            None : Prepares MNIST images in the required format for each model
            
        """
        mnist = tf.keras.datasets.mnist
        (self.train_images, self.train_labels), (self.test_images, self.test_labels) = mnist.load_data()
        self.train_images, self.test_images = self.train_images / 255, self.test_images / 255
        
        # For CNN, add dummy channel to feed the images to CNN
        self.train_images=self.train_images.reshape(-1,28, 28, 1)
        self.test_images=self.test_images.reshape(-1,28, 28, 1)
            
    
    def build_cnn_model(self):
        """
        @params: 
            None
        
        @return: 
            None : saves the CNN model in self.model variable 
        """
        #This model definition must be same in the server (Federated.py)
        self.model = tf.keras.models.Sequential([
            tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
            tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

        self.model.compile(optimizer=tf.keras.optimizers.SGD(),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                      metrics=['accuracy'])
        
    def data_split(self, num_samples):
        """
        @params: 
            num_samples : The number of sample images in each client. This value is used for equally
                          sized dataset
        
        @return: 
            None : Split the dataset depending on the self.experiment value
           
                If self.experiment is 1: Uniform data split: We take equal amount of data from each class (iid)
                If self.experiment is 2: Random data split1: We take equal amount of data, but not uniformly distributed across classes
                If self.experiment is 3: Random data split2: We take different amount of data and not uniformly distributed across classes
                If self.experiment is 4: Skewed: We take disproportionate amount of data for some classes
                        
        """
        if self.train_index_list is None or self.test_index_list is None:
            self.train_index_list = [[], [], [], [], [], [], [], [], [], []]
            self.test_index_list = [[], [], [], [], [], [], [], [], [], []]
            for i, v in enumerate(self.train_labels):
                self.train_index_list[v].append(i)

            for i, v in enumerate(self.test_labels):
                self.test_index_list[v].append(i)

        
        self.split_train_images = []
        self.split_train_labels = []
        
        if self.experiment == 1: #uniform data split
            self.local_data_num = num_samples
            
            for i in range(len(self.train_index_list)):
                indices = self.train_index_list[i]
                random_indices = np.random.choice(indices, size=num_samples//10)
                
                self.split_train_images.extend(self.train_images[random_indices])
                self.split_train_labels.extend(self.train_labels[random_indices])
            

        elif self.experiment == 2: # Randomly selected, equally sized dataset
            self.local_data_num = num_samples
            random_indices = np.random.choice([i for i in range(len(self.train_labels))], size=num_samples)
            self.split_train_images = self.train_images[random_indices]
            self.split_train_labels = self.train_labels[random_indices]

            counts = [0 for _ in range(10)]
            
            for label in self.train_labels[random_indices]:
                counts[label] += 1
            
        elif self.experiment == 3: # Randomly selected, differently sized dataset
            n = np.random.randint(1, num_samples)
            self.local_data_num = n
            random_indices = np.random.choice([i for i in range(len(self.train_labels))], size=n)
            self.split_train_images = self.train_images[random_indices]
            self.split_train_labels = self.train_labels[random_indices]
            

            counts = [0 for _ in range(10)]
            
            for label in self.train_labels[random_indices]:
                counts[label] += 1
  
        elif self.experiment == 4: #Skewed
            temp = [i for i in range(10)]
            skewed_numbers = np.random.choice(temp, np.random.randint(1, 10))
            non_skewed_numbers = list(set(temp)-set(skewed_numbers))
            N = 0
            
            counts = [0 for _ in range(10)]
            
            for i in skewed_numbers:
                n = np.random.randint(50, 60)
                N += n
                
                indices = self.train_index_list[i]
                random_indices = np.random.choice(indices, size=n)
                
                self.split_train_images.extend(self.train_images[random_indices])
                self.split_train_labels.extend(self.train_labels[random_indices])
                
                counts[i] += n
            
                
            for i in non_skewed_numbers:
                n = np.random.randint(1, 10)
                N += n
                
                indices = self.train_index_list[i]
                random_indices = np.random.choice(indices, size=n)
                
                self.split_train_images.extend(self.train_images[random_indices])
                self.split_train_labels.extend(self.train_labels[random_indices])
                
                counts[i] += n
            
            self.local_data_num = N
        
        self.split_train_images = np.array(self.split_train_images)
        self.split_train_labels = np.array(self.split_train_labels)
        self.update_total_num_data(self.local_data_num)    

        
        
    def update_total_num_data(self, num_data):
        """
        num_data : the number of training images that the current client has
        
        update the total number of training images that is stored in the server
        """
        local_num_data_to_json = json.dumps(num_data)
        requests.put(self.total_num_data_url, data=local_num_data_to_json)

    
    def request_global_round(self):
        """
        result : Current global round that the server is in
        """
        result = requests.get(self.round_url)
        result = result.json()
        return result
    
    def request_global_weight(self):
        """
        global_weight : Up-to-date version of the model parameters
        """
        result = requests.get(self.weight_url)
        result_data = result.json()
        
        global_weight = None
        if result_data is not None:
            global_weight = []
            for i in range(len(result_data)):
                temp = np.array(result_data[i], dtype=np.float32)
                global_weight.append(temp)
            
        return global_weight

    def upload_local_weight(self, local_weight=[]):
        """
        local_weight : the local weight that current client has converged to
        
        Add current client's weights to the server (Server accumulates these from multiple clients and computes the global weight)
        """
        local_weight_to_json = json.dumps(local_weight, cls=NumpyEncoder)
        requests.put(self.put_weight_url, data=local_weight_to_json)
        
    def upload_local_accuracy(self, accuracy):
        accuracy_dic = {'accuracy': accuracy}
        accuracy_in_json = json.dumps(accuracy_dic)
        requests.get(self.put_accuracy_url, data = accuracy_in_json)
        
    def validation(self, local_weight=[]):
        """
        local_weight : the current client's weights
        
        acc : test accuracy of the current client's model
        """
        if local_weight is not None:
            self.model.set_weights(local_weight)
            acc = self.model.evaluate(self.test_images, self.test_labels, verbose=0 if self.suppress else 1)
            self.upload_local_accuracy(acc)
            e = {out: acc[i] for i, out in enumerate(self.model.metrics_names)}

            return acc
        
    def train_local_model(self):
        """
        local_weight : local weight of the current client after training
        """
        global_weight = self.request_global_weight()
        if global_weight != None:
            global_weight = np.array(global_weight)
            self.model.set_weights(global_weight)
        
        self.model.fit(self.split_train_images, self.split_train_labels, epochs=10, batch_size=16, verbose=0)
        local_weight = self.model.get_weights()
        return local_weight
    
    def task(self):
        """
        Federated learning task
        1. If the current round is larger than the max round that we set, finish
        2. If the global round = current client's round, the client needs update
        3. Otherwise, we need to wait until other clients to finish
        """
        
        #this is for executing on multiple devices
        self.global_round = self.request_global_round()

        if self.current_round >= self.max_round:
            print(f"Client {self.fed_id} finished")
            return 

        if self.global_round == self.current_round: #need update 
            global_weight = self.request_global_weight()
            local_weight = self.train_local_model()
            acc = self.validation(local_weight)
            self.upload_local_weight(local_weight)
            self.current_round += 1
            time.sleep(self.time_delay)
            return self.task()

        else: #need to wait until other clients finish
            time.sleep(self.time_delay * 2)
            return self.task()
        
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Usage --ip {ip} --p {port} --max {max round} --delay {time delay} --num {num samples}")
    parser.add_argument("--ip", type=str, help="base ip address", default="127.0.0.1")
    parser.add_argument("--max", type=int, help="max round", default=5)
    parser.add_argument("--delay", type=int, help="time delay", default=5)
    parser.add_argument("--num", type=int, help="num samples", default=600)
    parser.add_argument("--id", type=int, help="client id", default=0)
    parser.add_argumetn("--exp", type=int, help="experiment number", default=1) #2,3,4
    args = parser.parse_args()

    client = Client(max_round = args.max, 
                    time_delay = args.delay, 
                    num_samples = args.num,  
                    suppress=True, 
                    cliend_id = args.id, 
                    experiment = args.exp)
    client.task()

Overwriting jetson_client.py


In [3]:
%%writefile jetson_server.py
import paramiko
import getpass
from ping3 import ping
import time
import argparse
import requests
from tqdm import tqdm


class Jetson:
    def __init__(self, min_port, max_port):
        assert int(min_port) < int(max_port), "max port must be >= min port"
        self.address = "147.47.200.209"
        self.host_address = args.serverip
        self.username, self.password = "jetson", "jetson"
        self.ports = [i for i in range(int(min_port), int(max_port)+1)]        
        self.available = []
        
    def check(self):
        # check which clients are online 
        cli = paramiko.SSHClient()
        cli.set_missing_host_key_policy(paramiko.AutoAddPolicy)       
        
        for port in self.ports:
            try:
                cli.connect(hostname=self.address, port=port, username=self.username, password=self.password)
                stdin, stdout, stderr = cli.exec_command("ls")
                lines = stdout.readlines()
                self.available.append(port)

            except:
                print(f"Port {port} Error")
                continue
                
        cli.close() 
        

    def start_fed(self, experiment, max_round, time_delay, num_samples):
        self.cli = paramiko.SSHClient()
        self.cli.set_missing_host_key_policy(paramiko.AutoAddPolicy)       
        
        for i, port in tqdm(enumerate(self.available), desc="Sending commands"):
            self.cli.connect(hostname=self.address, port=port, username=self.username, password=self.password)
            command = f"python jetson_client.py --ip {self.host_address} --max {max_round} --delay {time_delay} --num {num_samples} --id {i} --exp {experiment}"
            if i == 0:
                print(f"Sending command: {command}")
            stdin, stdout, stderr = self.cli.exec_command(command)
        self.cli.close()

    def init_jetson_nanos(self):
        self.check()
            
if __name__ =="__main__":
    parser = argparse.ArgumentParser(description="Usage --ip {ip} --p {port} --max {max round} --delay {time delay} --num {num samples}")
    parser.add_argument("--minp", type=int, help="min port", default=20101)
    parser.add_argument("--maxp", type=int, help="max port", default=20106)
    parser.add_argument("--mr", type=int, help="total # of rounds to run", default=5)
    parser.add_argument("--delay", type=int, help="time delay", default=5)
    parser.add_argument("--num", type=int, help="num samples", default=600)
    parser.add_argument("--exp", type=int, help="experiment number", default=1) #2,3,4
    parser.add_argument("--serverip", type=str, help="server ip address", default="localhost:22222")
    args = parser.parse_args()
    
    ###### VARIABLES ######
    MIN_PORT = args.minp
    MAX_PORT = args.maxp
    EXPERIMENT = args.exp
    MAX_ROUND = args.mr
    TIME_DELAY = args.delay
    CLIENT_NUM = 1 + (MAX_PORT-MIN_PORT)
    SERVER_ADD = args.serverip
    assert (CLIENT_NUM > 0)
    ###### INITIALIZE SERVER ######
    import requests
    init = requests.get(f"http://{SERVER_ADD}/initialize/{CLIENT_NUM}/{EXPERIMENT}/{MAX_ROUND}")
    reset = requests.get(f"http://{SERVER_ADD}/reset")
    print(init, init.text)

    jetson = Jetson(min_port = MIN_PORT, max_port=MAX_PORT)
    jetson.init_jetson_nanos()
    jetson.start_fed(experiment=args.exp, 
                     max_round=args.mr,
                     time_delay=args.delay, 
                     num_samples=args.num)

Overwriting jetson_server.py


In [4]:
!python3 jetson_server.py --minp 20131 --maxp 20136 --mr 5 --delay 5 --num 600 --exp 1 --serverip "147.47.200.178:22222"

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/urllib3/connection.py", line 169, in _new_conn
    conn = connection.create_connection(
  File "/usr/local/lib/python3.8/dist-packages/urllib3/util/connection.py", line 96, in create_connection
    raise err
  File "/usr/local/lib/python3.8/dist-packages/urllib3/util/connection.py", line 86, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 394, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/usr/local/lib/python3.8/dist-packages/urllib3/connection.py", line 234, in request
    super(HTTPConnect

# HW 10
* 실습에서 컴퓨터 하나로 Federated learning을 진행해 보았다.
* 이번 과제는 미리 설치해 놓은 여러 대의 Jetson Nano를 사용해서 실제로 네트워크 상에서 Federated learning을 진행하는 것이다. 
* 4개의 실험 (experiment:1,2,3,4)을 진행해보고, 결과를 비교하여 보고서로 작성하여 제출. (형식 자유)
* 보고서에는 실습에서의 결과와의 비교가 포함되어야 한다. (?) 