In [1]:
import glob
import os
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from multiprocessing import Process
import gc

import flwr as fl
from flwr.server.strategy import FedAvg
import tensorflow as tf
import sklearn
import time

import numpy as np
import pandas as pd
from pandas import DataFrame

import collections

#!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
# demonstration of calculating metrics for a neural network model using sklearn
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from flwr.common.logger import log
from flwr.server.client_manager import ClientManager
from flwr.server.client_proxy import ClientProxy
from flwr.common import (
    EvaluateIns,
    EvaluateRes,
    FitIns,
    FitRes,
    MetricsAggregationFn,
    NDArrays,
    Parameters,
    Scalar,
    ndarrays_to_parameters,
    parameters_to_ndarrays,
)

# Make TensorFlow log less verbose
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

2024-02-15 18:03:27.916676: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-15 18:03:28.259076: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-15 18:03:29.064355: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-02-15 18:03:29.064432: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# argumentos
n = len(sys.argv)
print("Total arguments passed:", n)
iteracoes = 0
cycle_index = 1
finalIterations = 0
checkpoint_iteration = 0
if(n > 0):
    for value in sys.argv:
        print("arg:", value)
        if("iterations=" in value):
            try:
                iteracoes = int(value.replace("iterations=",""))
            except:
                print("no")
        
        if("cycle=" in value):
            try:
                cycle_index = int(value.replace("cycle=",""))
            except:
                print("no")
        if("checkpoint_iteration=" in value):
            try:
                iteracoes = int(value.replace("iterations=",""))
            except:
                print("no")
print("iteracoes:",iteracoes)      
print("cycle:",cycle_index)   

Total arguments passed: 3
arg: /home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ipykernel_launcher.py
arg: -f
arg: /home/guilherme/.local/share/jupyter/runtime/kernel-84608fe3-8da3-4d07-b54d-4129e8145b56.json
iteracoes: 0
cycle: 1


In [3]:
# input folder
#inputFolders = "../02-transformed-data-new-testes/dados2019/"
inputFolderPath = "../data_2019_processed/"

# General configuration
NUMBER_OF_ITERATIONS_FINAL = 200
    
NUM_EPOCHS = 1
BATCH_SIZE = 32
VERBOSE = 0

# usado para experimentos
if(iteracoes > 0):
    NUMBER_OF_ITERATIONS_FINAL = iteracoes
    
NUMBER_OF_ITERATIONS = NUMBER_OF_ITERATIONS_FINAL

# output folder
outputFolder = "result_unbalanced_epoch_"+str(NUM_EPOCHS)+"_rounds_"+str(NUMBER_OF_ITERATIONS_FINAL)+"_cycle_"+str(cycle_index)

#outputFolder = "test_checkpoint"
checkPointFolder = outputFolder+"/checkpoints"

# last cycle
last_cycle_index = cycle_index - 1
lastCycleOutputFolder = "result_unbalanced_epoch_"+str(NUM_EPOCHS)+"_rounds_"+str(200)+"_cycle_"+str(last_cycle_index)
lastCycleOutputFolder = "result_unbalanced_epoch_"+str(NUM_EPOCHS)+"_rounds_"+str(NUMBER_OF_ITERATIONS_FINAL)+"_cycle_"+str(last_cycle_index)
iferredCycleDataFolder = lastCycleOutputFolder+"/inferred_datasets"

# train file name modifier
fileSufixTrain = "" # _smote for smote

fl.common.logger.configure(identifier="myFlowerExperiment", filename="log_"+outputFolder+".txt")

# usado para checkpoints
if(checkpoint_iteration > 0):
    NUMBER_OF_ITERATIONS_FINAL = checkpoint_iteration
    
NUMBER_OF_ITERATIONS = NUMBER_OF_ITERATIONS_FINAL

In [4]:
print("Checking whether the folder exists or not")
isExist = os.path.exists(outputFolder)
if not isExist:
    # Create a new directory because it does not exist
    os.makedirs(outputFolder)
    print("The new directory is created!")
else:
    print("The directory exists!")

Checking whether the folder exists or not
The new directory is created!


In [5]:
print("Checking whether the checkpoint folder exists or not")
isExist = os.path.exists(checkPointFolder)
if not isExist:
    # Create a new directory because it does not exist
    os.makedirs(checkPointFolder)
    print("The new checkpoint directory is created!")
else:
    print("The checkpoint directory exists!")

Checking whether the checkpoint folder exists or not
The new checkpoint directory is created!


In [6]:
print("check whether the cycle is 0 > or not, if so, the folder of inference must exist")
if(cycle_index > 0):
    isExist = os.path.exists(iferredCycleDataFolder)
    print(iferredCycleDataFolder)
    if not isExist:
        print("The folder of inference not exists!")
        sys.exit("The folder of inference not exists!")
    else:
        print("The checkpoint directory exists!")

check whether the cycle is 0 > or not, if so, the folder of inference must exist
result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets
The checkpoint directory exists!


In [7]:
# selected features
inputFeatures = ["activity","location","day_of_week","light","phone_lock","proximity","sound","time_to_next_alarm", "minutes_day"]
outputClasses = ["awake","asleep"]
#outputClasses = ["class"]

In [8]:
# client datasets used on the training process (75% of data)
trainFolders =  ['0Jf4TH9Zzse0Z1Jjh7SnTOe2MMzeSnFi7feTnkG6vgs',
                '0tdmm6rwW3KquQ73ATYYJ5JkpMtvbppJ0VzA2GExdA', 
                '2cyV53lVyUtlMj0BRwilEWtYJwUiviYoL48cZBPBq0', 
                '2J22RukYnEbKTk7t+iUVDBkorcyL5NKN6TrLe89ys', 
                #['5FLZBTVAPwdq9QezHE2sVCJIs7p+r6mCemA2gp9jATk'], #does not have the file
                '7EYF5I04EVqisUJCVNHlqn77UAuOmwL2Dahxd3cA', 
                'a9Qgj8ENWrHvl9QqlXcIPKmyGMKgbfHk9Dbqon1HQP4', 
                'ae4JJBZDycEcY8McJF+3BxyvZ1619y03BNdCxzpZTc', 
                'Ch3u5Oaz96VSrQbf0z31X6jEIbeIekkC0mwPzCdeJ1U', 
                'CH8f0yZkZL13zWuE9ks1CkVJRVrr+jsGdUXHrZ6YeA', 
                'DHO1K4jgiwZJOfQTrxvKE2vn7hkjamigroGD5IaeRc', 
                #'DHPqzSqSttiba1L3BD1cptNJPjSxZ8rXxF9mY3za6WA', # does not have asleep data
                'dQEFscjqnIlug8Tgq97JohhSQPG2DEOWJqS86wCrcY', 
                'HFvs2CohmhHte+AaCzFasjzegGzxZKPhkrX23iI6Xo', 
                'jgB9E8v3Z6PKdTRTCMAijBllA9YEMtrmHbe4qsbmJWw', 
                'JkY++R7E8myldLN3on6iQ78Ee78zCbrLuggfwGju3I', 
                'K4SLohf+TN1Ak8Dn8iE3Lme7rEMPISfppB2sXfHX8', 
                'oGaWetJJJEWHuvYdWYo826SQxfhCExVVQ2da8LE1Y7Q', 
                'pyt24oiDAHsmgWMvkFKz2fn2pwcHiXchd6KchLM', 
                #'PZCf1nfvhR+6fk+7+sPNMYOgb8BAMmtQtfoRS83Suc', # does not have asleep data
                'QUNCATForxzK0HHw46LrGOMWh0eVA8Y5XWEiUXX+cQ', 
                #'rIl2UK9+bQ+tzpFdbJAdbBxEa5GbgrgC030yEaENLw', 
                #'RoBW3cDOO9wWRMPO2twQff83MPc+OXn6gJ+a1DafreI', 
                'SH3kQeyd5volraxw8vOyhlowNqWBPr1IJ9URNXUL4']
                #'VVpwFNMrEglveh6MDN8lrRzTy5OwzglD4FURfM4A2is', 
                #'Wa1mcNmbh66S7VS6GIzyfCFMD3SGhbtDQyFP1ywJEsw', 
                #'XCKRE0BWRHxfP1kZIihgtT+jUjSp2GE8v5ZlhcIhVmA', 
                #'YI5Y79K6GXqAUoGP6PNyII8WKlAoel4urDxWSVVOvBw', 
                #'ypklj+8GJ15rOIH1lpKQtFJOuK+VdvyCuBPqhY3aoM', 
                #'ZSsAZ0Pq+MCqFrnjsRFn5Ua09pMCVaOV9c8ZuYb7XQY']
            
# client datasets used on the training process (25% of data)
testFolders =  [#'0Jf4TH9Zzse0Z1Jjh7SnTOe2MMzeSnFi7feTnkG6vgs',
                #'0tdmm6rwW3KquQ73ATYYJ5JkpMtvbppJ0VzA2GExdA', 
                #'2cyV53lVyUtlMj0BRwilEWtYJwUiviYoL48cZBPBq0', 
                #'2J22RukYnEbKTk7t+iUVDBkorcyL5NKN6TrLe89ys', 
                #['5FLZBTVAPwdq9QezHE2sVCJIs7p+r6mCemA2gp9jATk'], #does not have the file
                #'7EYF5I04EVqisUJCVNHlqn77UAuOmwL2Dahxd3cA', 
                #'a9Qgj8ENWrHvl9QqlXcIPKmyGMKgbfHk9Dbqon1HQP4', 
                #'ae4JJBZDycEcY8McJF+3BxyvZ1619y03BNdCxzpZTc', 
                #'Ch3u5Oaz96VSrQbf0z31X6jEIbeIekkC0mwPzCdeJ1U', 
                #'CH8f0yZkZL13zWuE9ks1CkVJRVrr+jsGdUXHrZ6YeA', 
                #'DHO1K4jgiwZJOfQTrxvKE2vn7hkjamigroGD5IaeRc', 
                #'DHPqzSqSttiba1L3BD1cptNJPjSxZ8rXxF9mY3za6WA', # does not have asleep data
                #'dQEFscjqnIlug8Tgq97JohhSQPG2DEOWJqS86wCrcY', 
                #'HFvs2CohmhHte+AaCzFasjzegGzxZKPhkrX23iI6Xo', 
                #'jgB9E8v3Z6PKdTRTCMAijBllA9YEMtrmHbe4qsbmJWw', 
                #'JkY++R7E8myldLN3on6iQ78Ee78zCbrLuggfwGju3I', 
                #'K4SLohf+TN1Ak8Dn8iE3Lme7rEMPISfppB2sXfHX8', 
                #'oGaWetJJJEWHuvYdWYo826SQxfhCExVVQ2da8LE1Y7Q', 
                #'pyt24oiDAHsmgWMvkFKz2fn2pwcHiXchd6KchLM', 
                #'PZCf1nfvhR+6fk+7+sPNMYOgb8BAMmtQtfoRS83Suc', # does not have asleep data
                #'QUNCATForxzK0HHw46LrGOMWh0eVA8Y5XWEiUXX+cQ', 
                'rIl2UK9+bQ+tzpFdbJAdbBxEa5GbgrgC030yEaENLw', 
                'RoBW3cDOO9wWRMPO2twQff83MPc+OXn6gJ+a1DafreI', 
                #'SH3kQeyd5volraxw8vOyhlowNqWBPr1IJ9URNXUL4'] 
                'VVpwFNMrEglveh6MDN8lrRzTy5OwzglD4FURfM4A2is', 
                'Wa1mcNmbh66S7VS6GIzyfCFMD3SGhbtDQyFP1ywJEsw', 
                'XCKRE0BWRHxfP1kZIihgtT+jUjSp2GE8v5ZlhcIhVmA', 
                'YI5Y79K6GXqAUoGP6PNyII8WKlAoel4urDxWSVVOvBw', 
                'ypklj+8GJ15rOIH1lpKQtFJOuK+VdvyCuBPqhY3aoM', 
                'ZSsAZ0Pq+MCqFrnjsRFn5Ua09pMCVaOV9c8ZuYb7XQY']

In [9]:
def generateMetrics(y_test,yhat_probs):
    # predict crisp classes for test set deprecated
    #yhat_classes = model.predict_classes(X_test, verbose=0)
    #yhat_classes = np.argmax(yhat_probs,axis=1)
    yhat_classes = yhat_probs.round()
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat_classes)
    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat_classes)
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat_classes)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, yhat_classes)
    # kappa
    kappa = cohen_kappa_score(y_test, yhat_classes)
    # ROC AUC
    auc = roc_auc_score(y_test, yhat_probs)
    # confusion matrix
    matrix = confusion_matrix(y_test, yhat_classes)
    #print(matrix)
    
    array = []
    results = dict()
    results['accuracy'] = accuracy
    results['precision'] = precision
    results['recall'] = recall
    results['f1_score'] = f1
    results['cohen_kappa_score'] = kappa
    results['roc_auc_score'] = auc
    results['matrix'] = ("[[ " +str(matrix[0][0]) + " " +str(matrix[0][1]) +"][ " +str(matrix[1][0]) + " " + str(matrix[1][1]) +"]]") # array.append(np.array(matrix,dtype=object))
    results['TP'] = matrix[0][0]
    results['FP'] = matrix[0][1]
    results['FN'] = matrix[1][0]
    results['TN'] = matrix[1][1]
    
    array.append(accuracy)
    array.append(precision)
    array.append(recall)
    array.append(f1)
    array.append(kappa)
    array.append(auc)
    array.append("[[ " +str(matrix[0][0]) + " " +str(matrix[0][1]) +"][ " +str(matrix[1][0]) + " " + str(matrix[1][1]) +"]]") # array.append(np.array(matrix,dtype=object))
    array.append(matrix[0][0]) # TP
    array.append(matrix[0][1]) # FP
    array.append(matrix[1][0]) # FN
    array.append(matrix[1][1]) # TN
    
    return results, array

# y_test     = Array with real values
# yhat_probs = Array with predicted values
def printMetrics(y_test,yhat_probs):
    # generate metrics
    results, array= generateMetrics(y_test,yhat_probs)

    # accuracy: (tp + tn) / (p + n)
    accuracy = results['accuracy']
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = results['precision']
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = results['recall'] 
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = results['f1_score']
    print('F1 score: %f' % f1)
    # kappa
    kappa = results['cohen_kappa_score']
    print('Cohens kappa: %f' % kappa)
    # ROC AUC
    auc = results['roc_auc_score']
    print('ROC AUC: %f' % auc)
    # confusion matrix
    print("Confusion Matrix")
    matrix = results['matrix']
    print(matrix)
    
    return results, array

def generateGlobalMetrics(metrics):
    accuracy,precision,recall,f1_score,cohen_kappa_score,roc_auc_score = 0,0,0,0,0,0
    for metric in metrics:
        accuracy = accuracy + metric['accuracy']
        precision = precision + metric['precision']
        recall = recall + metric['recall']
        f1_score = f1_score + metric['f1_score']
        cohen_kappa_score = cohen_kappa_score + metric['cohen_kappa_score']
        roc_auc_score = roc_auc_score + metric['roc_auc_score']
        
    # mean
    size = len(metrics)
    print(size)
    accuracy = accuracy / size
    precision = precision / size
    recall = recall / size
    f1_score = f1_score / size
    cohen_kappa_score = cohen_kappa_score / size
    roc_auc_score = roc_auc_score / size
    
    return [accuracy,precision,recall,f1_score,cohen_kappa_score,roc_auc_score]

def showGlobalMetrics(metrics):
    res = generateGlobalMetrics(metrics)
    
    accuracy = res[0]
    precision = res[1]
    recall = res[2]
    f1_score = res[3]
    cohen_kappa_score = res[4]
    roc_auc_score = res[5]
    
    #show:\
    print("accuracy: ",accuracy)
    print("precision: ",precision)
    print("recall: ",recall)
    print("f1_score: ",f1_score)
    print("cohen_kappa_score: ",cohen_kappa_score)
    print("roc_auc_score: ",roc_auc_score)
    
    return res

In [10]:
# take the list of directories and concat them
def loadDataFromFolders(foldersToLoad,inputFolders,fileType = ""):
    print(len(foldersToLoad), "datasets")
    for i in range(0,len(foldersToLoad)):
        currentFolder = foldersToLoad[i]
        print(i , "-", currentFolder,inputFolders+"student_"+currentFolder+"_transformed"+fileType+".csv")
        #print(trainingDataSet[i])
        if(i == 0):
            temp_data = pd.read_csv(inputFolders+"student_"+currentFolder+"_transformed"+fileType+".csv")
        else:
            dataset = pd.read_csv(inputFolders+"student_"+currentFolder+"_transformed"+fileType+".csv")
            temp_data = pd.concat([temp_data, dataset])
    # return the dataset        
    return temp_data

# take the list of directories and concat them
def loadDataFromLastCycleFoldersOnList():
    clientList = []
    foldersToLoad = trainFolders
    print(len(foldersToLoad), "datasets")
    for i in range(0,len(foldersToLoad)):
        currentFolder = foldersToLoad[i]
        print(i , "-", currentFolder,iferredCycleDataFolder+"/student_"+currentFolder+".csv")
        #print(trainingDataSet[i])
        temp_data = pd.read_csv(iferredCycleDataFolder+"/student_"+currentFolder+".csv")
        temp_data['class'] = temp_data['awake']
        temp_data['class'] = temp_data['class'].astype('int32')
        
        mapper = {0: 'asleep', 1: 'awake'}

        temp_data['class'] = temp_data['class'].map(mapper)
        #temp_data['class'] = temp_data['class'].apply(lambda tpl: [mapper.get(x) for x in tpl])

        #temp_data['fullcoursenames'] = [[mapper.get(x) for x in tpl] for tpl in temp_data['itemsets']]

        #del temp_data['awake']
        #del temp_data['asleep']
        
        print("Adding to the list: ", temp_data.shape)
        clientList.append(temp_data)
    # return the dataset        
    return clientList

# take the list of directories and concat them
def loadDataFromFoldersOnList(foldersToLoad,inputFolders,fileType = ""):
    clientList = []
    print(len(foldersToLoad), "datasets")
    for i in range(0,len(foldersToLoad)):
        currentFolder = foldersToLoad[i]
        print(i , "-", currentFolder,inputFolders+"student_"+currentFolder+"_transformed"+fileType+".csv")
        #print(trainingDataSet[i])
        temp_data = pd.read_csv(inputFolders+"student_"+currentFolder+"_transformed"+fileType+".csv")
        print("Adding to the list: ", temp_data.shape)
        clientList.append(temp_data)
    # return the dataset        
    return clientList

In [11]:
print("Preparing test data")
 
# test data comprising 25% of the data. It must be fixed to all models being evaluated
#X_test  = pd.read_csv(inputFolders+"test/allData-classification-numeric-normalized.csv")
X_test = loadDataFromFolders(testFolders,inputFolderPath,"")

print()
# undestand the dataset by looking on their infos
print(X_test.info())

X_test

Preparing test data
8 datasets
0 - rIl2UK9+bQ+tzpFdbJAdbBxEa5GbgrgC030yEaENLw ../data_2019_processed/student_rIl2UK9+bQ+tzpFdbJAdbBxEa5GbgrgC030yEaENLw_transformed.csv
1 - RoBW3cDOO9wWRMPO2twQff83MPc+OXn6gJ+a1DafreI ../data_2019_processed/student_RoBW3cDOO9wWRMPO2twQff83MPc+OXn6gJ+a1DafreI_transformed.csv
2 - VVpwFNMrEglveh6MDN8lrRzTy5OwzglD4FURfM4A2is ../data_2019_processed/student_VVpwFNMrEglveh6MDN8lrRzTy5OwzglD4FURfM4A2is_transformed.csv
3 - Wa1mcNmbh66S7VS6GIzyfCFMD3SGhbtDQyFP1ywJEsw ../data_2019_processed/student_Wa1mcNmbh66S7VS6GIzyfCFMD3SGhbtDQyFP1ywJEsw_transformed.csv
4 - XCKRE0BWRHxfP1kZIihgtT+jUjSp2GE8v5ZlhcIhVmA ../data_2019_processed/student_XCKRE0BWRHxfP1kZIihgtT+jUjSp2GE8v5ZlhcIhVmA_transformed.csv
5 - YI5Y79K6GXqAUoGP6PNyII8WKlAoel4urDxWSVVOvBw ../data_2019_processed/student_YI5Y79K6GXqAUoGP6PNyII8WKlAoel4urDxWSVVOvBw_transformed.csv
6 - ypklj+8GJ15rOIH1lpKQtFJOuK+VdvyCuBPqhY3aoM ../data_2019_processed/student_ypklj+8GJ15rOIH1lpKQtFJOuK+VdvyCuBPqhY3aoM_transformed.csv


Unnamed: 0,activity,location,timestamp,time_to_next_alarm,sound,proximity,phone_lock,light,day_of_week,minutes_day,timestamp_text,class
0,0.75,1.0,0.000000e+00,0.000000,0.515992,1.0,0.0,0.000000,1.000000,0.678249,2018-05-14 16:16:08+00:00,asleep
1,0.25,1.0,3.211282e-07,0.000000,0.542171,0.0,1.0,0.000007,1.000000,0.678944,2018-05-14 16:17:39+00:00,asleep
2,0.25,1.0,6.422564e-07,0.000000,0.515992,0.0,1.0,0.000000,1.000000,0.679639,2018-05-14 16:18:39+00:00,asleep
3,0.00,1.0,6.422564e-07,0.000000,0.515992,0.0,1.0,0.000000,1.000000,0.680334,2018-05-14 16:19:09+00:00,asleep
4,0.25,1.0,6.422564e-07,0.000000,0.531341,0.0,1.0,0.000000,1.000000,0.681028,2018-05-14 16:20:09+00:00,asleep
...,...,...,...,...,...,...,...,...,...,...,...,...
23747,0.25,1.0,5.819100e-03,0.000099,0.000000,1.0,1.0,0.000236,0.166667,0.510076,2018-06-13 12:14:37+00:00,awake
23748,0.25,1.0,5.819743e-03,0.000694,0.000000,1.0,1.0,0.000325,0.166667,0.512856,2018-06-13 12:18:08+00:00,awake
23749,0.25,1.0,5.819743e-03,0.000595,0.000000,1.0,1.0,0.000325,0.166667,0.513551,2018-06-13 12:19:08+00:00,awake
23750,0.25,1.0,5.820064e-03,0.000595,0.000000,1.0,1.0,0.000354,0.166667,0.513551,2018-06-13 12:19:38+00:00,awake


In [12]:
print("Preparing X_train data")
# load cliend data
if(cycle_index == 0):
    print("Gething data from training folder")
    clientList = loadDataFromFoldersOnList(trainFolders,inputFolderPath,fileSufixTrain)
else:
    print("Gething data from human inference folder")
    clientList = loadDataFromLastCycleFoldersOnList()

NUMBER_OF_CLIENTS = len(clientList)
print("Total",(len(clientList)))

Preparing X_train data
Gething data from human inference folder
19 datasets
0 - 0Jf4TH9Zzse0Z1Jjh7SnTOe2MMzeSnFi7feTnkG6vgs result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets/student_0Jf4TH9Zzse0Z1Jjh7SnTOe2MMzeSnFi7feTnkG6vgs.csv
Adding to the list:  (17993, 12)
1 - 0tdmm6rwW3KquQ73ATYYJ5JkpMtvbppJ0VzA2GExdA result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets/student_0tdmm6rwW3KquQ73ATYYJ5JkpMtvbppJ0VzA2GExdA.csv
Adding to the list:  (11561, 12)
2 - 2cyV53lVyUtlMj0BRwilEWtYJwUiviYoL48cZBPBq0 result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets/student_2cyV53lVyUtlMj0BRwilEWtYJwUiviYoL48cZBPBq0.csv
Adding to the list:  (3383, 12)
3 - 2J22RukYnEbKTk7t+iUVDBkorcyL5NKN6TrLe89ys result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets/student_2J22RukYnEbKTk7t+iUVDBkorcyL5NKN6TrLe89ys.csv
Adding to the list:  (19389, 12)
4 - 7EYF5I04EVqisUJCVNHlqn77UAuOmwL2Dahxd3cA result_unbalanced_epoch_1_rounds_200_cycle_0/inferred_datasets/student_7EYF5I04EVqisUJCV

In [13]:
#clientList

In [14]:
#for t in clientList:
#    print(collections.Counter(t['class']))

In [15]:
#train =  loadDataFromFoldersOnList(trainFolders,inputFolderPath,fileSufixTrain)

In [16]:
#train

In [17]:
# one-hot encoding function
def transform_output_nominal_class_into_one_hot_encoding(dataset):
    # create two classes based on the single class
    one_hot_encoded_data = pd.get_dummies(dataset['class'])
    #print(one_hot_encoded_data)
    dataset['awake'] = one_hot_encoded_data['awake']
    dataset['asleep'] = one_hot_encoded_data['asleep']
    
    return dataset

# one-hot encoding function
def transform_output_numerical_class_into_one_hot_encoding(dataset):
    # create two classes based on the single class
    one_hot_encoded_data = pd.get_dummies(dataset['class'])
    #print(one_hot_encoded_data)
    dataset['awake'] = one_hot_encoded_data[0]
    dataset['asleep'] = one_hot_encoded_data[1]
    
    return dataset

# transform output to one_hot_encoding for the testing dataset
X_test = transform_output_nominal_class_into_one_hot_encoding(X_test)

# transform output to one_hot_encoding for the input dataset
for i in range(0,len(clientList)):
    clientList[i] = transform_output_nominal_class_into_one_hot_encoding(clientList[i])
    #print (clientList[i])
    

X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134888 entries, 0 to 23751
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   activity            134888 non-null  float64
 1   location            134888 non-null  float64
 2   timestamp           134888 non-null  float64
 3   time_to_next_alarm  134888 non-null  float64
 4   sound               134888 non-null  float64
 5   proximity           134888 non-null  float64
 6   phone_lock          134888 non-null  float64
 7   light               134888 non-null  float64
 8   day_of_week         134888 non-null  float64
 9   minutes_day         134888 non-null  float64
 10  timestamp_text      134888 non-null  object 
 11  class               134888 non-null  object 
 12  awake               134888 non-null  bool   
 13  asleep              134888 non-null  bool   
dtypes: bool(2), float64(10), object(2)
memory usage: 13.6+ MB


In [18]:
def transform_data_type(dataframe):
    
    # transform inputs
    for column in inputFeatures:
        dataframe[column] = dataframe[column].astype('float32')
    
    # transform outputs
    for column in outputClasses:
        dataframe[column] = dataframe[column].astype('float32')
    
    return dataframe

# transforms the data
X_test = transform_data_type(X_test)

X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134888 entries, 0 to 23751
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   activity            134888 non-null  float32
 1   location            134888 non-null  float32
 2   timestamp           134888 non-null  float64
 3   time_to_next_alarm  134888 non-null  float32
 4   sound               134888 non-null  float32
 5   proximity           134888 non-null  float32
 6   phone_lock          134888 non-null  float32
 7   light               134888 non-null  float32
 8   day_of_week         134888 non-null  float32
 9   minutes_day         134888 non-null  float32
 10  timestamp_text      134888 non-null  object 
 11  class               134888 non-null  object 
 12  awake               134888 non-null  float32
 13  asleep              134888 non-null  float32
dtypes: float32(11), float64(1), object(2)
memory usage: 9.8+ MB


In [19]:
print("Prepering the test dataset")
# selects the data to train and test
X_test_data = X_test[inputFeatures]
y_test_label = X_test[outputClasses]

# transtorm data to tensor slices
#client_test_dataset = tf.data.Dataset.from_tensor_slices((X_test_data.values, y_test_label.values))

#client_test_dataset = client_test_dataset.repeat(NUM_EPOCHS).batch(BATCH_SIZE, drop_remainder=True)
#client_test_dataset = client_test_dataset.repeat(NUM_EPOCHS).batch(BATCH_SIZE)

#print(client_test_dataset.element_spec)
#client_test_dataset

Prepering the test dataset


In [20]:
#print("preparing the training datasets")
#federated_training_data = []
# transform the data
#for i in range(0,len(clientList)):
#    # selects the data to train and test
#    data   = clientList[i][inputFeatures]
#    labels = clientList[i][outputClasses]
#    # transform the data to tensor slices
#    client_train_dataset = tf.data.Dataset.from_tensor_slices((data.values, labels.values))
    # apply the configs
#    client_train_dataset = client_train_dataset.repeat(NUM_EPOCHS).batch(BATCH_SIZE)
    # transform the data to
 #   federated_training_data.append(client_train_dataset)

In [21]:
print("creating model")

def create_keras_model():
    return tf.keras.models.Sequential([
      tf.keras.layers.InputLayer(input_shape=(9,)),
      #tf.keras.layers.Dense(9, activation=tf.keras.activations.relu), 
      tf.keras.layers.Dense(16, activation=tf.keras.activations.relu),
      tf.keras.layers.Dense(8, activation=tf.keras.activations.relu),
      tf.keras.layers.Dense(2, activation=tf.keras.activations.softmax)
      #tf.keras.layers.Dense(2, activation=tf.nn.sigmoid)
    ])

keras_model = create_keras_model()
#keras_model.summary()
keras_model.summary()

creating model
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                160       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 2)                 18        
                                                                 
Total params: 314
Trainable params: 314
Non-trainable params: 0
_________________________________________________________________


2024-02-15 18:03:42.780401: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-15 18:03:42.814383: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2024-02-15 18:03:42.814405: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-02-15 18:03:42.815568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (o

In [22]:
# Load model and data (MobileNetV2, CIFAR-10)
#model = keras_model
#model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
#(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

In [23]:
def evaluate_and_save_results(keras_model,X_test_data, y_test_label, current_round_index, 
                              clientId, prefix_string = "Results", lossValue = -1):
     # predict values
    yhat_probs = keras_model.predict(X_test_data,verbose=VERBOSE)
    
    # as we deal with a classification problem with one hot encoding, we must round the values to 0 and 1.
    yhat_probs_rounded = yhat_probs.round()
    
    # create a dataframe with the predicted data
    y_predicted_df = pd.DataFrame(data=yhat_probs_rounded,columns=['awake','asleep']) 
    #y_test_label_label = pd.DataFrame(data=y_test_label,columns=['awake','asleep']) 
    
    roundData = []

    columns = ['client','round','loss','class','accuracy','precision','recall', 
               'f1_score','cohen_kappa_score','roc_auc_score','confusion_matrix',
               'TP','FP','FN','TN']
    
    # Instantiate the list that will contain the results
    listOfMetrics = list()
    
    #print('awake')    
    #res,resA = printMetrics(y_test_label['awake'],y_predicted_df['awake'])
    res,resA = generateMetrics(y_test_label['awake'],y_predicted_df['awake'])
    listOfMetrics.append(res)
    
    classData = np.concatenate(([clientId,current_round_index,lossValue,'awake'], resA))
    roundData.append(classData)
    
    #print('')
    #print('asleep')
    #res,resA = printMetrics(y_test_label['asleep'],y_predicted_df['asleep'])
    res,resA = generateMetrics(y_test_label['asleep'],y_predicted_df['asleep'])
    listOfMetrics.append(res)
    # new data
    classData = np.concatenate(([clientId,current_round_index,lossValue,'asleep'], resA))
    roundData.append(classData)
    
    #print('Global')
    #resA = showGlobalMetrics(listOfMetrics) #return [accuracy,precision,recall,f1_score,cohen_kappa_score,roc_auc_score
    resA = generateGlobalMetrics(listOfMetrics) #return [accuracy,precision,recall,f1_score,cohen_kappa_score,roc_auc_score
    # new data
    classData = np.concatenate(([clientId,current_round_index,lossValue,'avg'], resA))
    roundData.append(classData)
    
    dataMetrics = pd.DataFrame(data=roundData,columns=columns) 
    # write file
    if(clientId >= 0):
        outputMetricFile = outputFolder+"/"+prefix_string+"_MLP_client_" + str(clientId) + "_round_" + str(current_round_index) + ".csv"
    else:
        outputMetricFile = outputFolder+"/global_model_MLP_metrics.csv"
        outputMetricFile = outputFolder+"/"+prefix_string+".csv"
        # print global model results
        if(os.path.isfile(outputMetricFile)):
            dataset = pd.read_csv(outputMetricFile)
            dataMetrics = pd.concat([dataset, dataMetrics], axis=0)
        # Perform garbage collection
        gc.collect()
        
    dataMetrics.to_csv(outputMetricFile, sep=',', encoding='utf-8', index=False)

In [24]:
print("Loading checkpoint model",checkPointFolder+"/round-*")
list_of_files = [fname for fname in glob.glob(checkPointFolder+"/round-*")]
last_round_checkpoint = -1
latest_round_file = None
model_check_point = None
filename_np = None
filename_h5 = None

if len(list_of_files) > 0:
    latest_round_file = max(list_of_files, key=os.path.getctime)
    print("Loading pre-trained model from: ", latest_round_file)
    if(len(latest_round_file) > 0):
        # load the name
        last_round = latest_round_file.replace(checkPointFolder+"/round-","")
        last_round = last_round.replace("-weights.npz","")
        last_round = last_round.replace("-weights.h5","")
        print("Last round: ",last_round)
    
        last_round_checkpoint = int(last_round)
        filename_h5 = checkPointFolder+"/round-"+last_round+"-weights.h5"
        filename_np = checkPointFolder+"/round-"+last_round+"-weights.npz"
else:
    print("No checkpoint found")

    #check_point_model = tf.keras.models.load_model(latest_round_file)
    

Loading checkpoint model result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-*
No checkpoint found


In [25]:
last_round_checkpoint

-1

In [26]:
#if latest_round_file is not None:
#    keras_model.load_weights(latest_round_file)

In [27]:
NUMBER_OF_ITERATIONS

5

In [28]:
if(last_round_checkpoint > -1):
    NUMBER_OF_ITERATIONS = NUMBER_OF_ITERATIONS_FINAL - (last_round_checkpoint)

    print("Number of iteractions after the checkpoint: ",NUMBER_OF_ITERATIONS)

In [29]:
class SaveModelStrategy(fl.server.strategy.FedAvg):
    def aggregate_fit(
        self,
        server_round: int,
        results: List[Tuple[fl.server.client_proxy.ClientProxy, fl.common.FitRes]],
        failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
    ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:

        # Call aggregate_fit from base class (FedAvg) to aggregate parameters and metrics
        aggregated_parameters, aggregated_metrics = super().aggregate_fit(server_round, results, failures)
        
        #print("TEsteeee", aggregated_parameters)
        if aggregated_parameters is not None:
            # Convert `Parameters` to `List[np.ndarray]`
            aggregated_ndarrays: List[np.ndarray] = fl.common.parameters_to_ndarrays(aggregated_parameters)

            # Save aggregated_ndarrays
            print(f"Saving round {server_round} aggregated_ndarrays...")
            fileName = f"{checkPointFolder}/round-{server_round}-weights.npz"
            print(fileName)
            #print(aggregated_parameters)
            print()
            np.savez(fileName, *aggregated_ndarrays)
            #np.savez(fileName+"2", *aggregated_parameters)
            #keras_model = create_keras_model()
            #keras_model.set_weights(aggregated_parameters)
            #keras_model.save_weights(fileName)

        return aggregated_parameters, aggregated_metrics

In [30]:
print("Declarating client function")

# Define a Flower client
class FlowerISABELASleepClient(fl.client.NumPyClient):

    def __init__(self, clientId, model, X_train_data, y_train_label,round_index=0):
        self.round_index = round_index
        self.clientId = clientId
        self.model = model
        self.X_train_data = X_train_data
        self.y_train_label = y_train_label


    def get_parameters(self, config):
        """Return current weights."""
        return self.model.get_weights()

    def fit(self, parameters, config):
               
        """Fit model and return new weights as well as number of training examples."""
        self.model.set_weights(parameters)
        
        
        # Evaluate local model parameters on the local test data
        loss, accuracy = self.model.evaluate(self.X_train_data, self.y_train_label,verbose=VERBOSE)       
        # print model results (verify the quality from the proxy data)
        evaluate_and_save_results(self.model,self.X_train_data, self.y_train_label, self.round_index, self.clientId,"local_data_before_fit",loss)
 
        # use the checkpoint if it is not -1
        #if(last_round_checkpoint == self.round_index and last_round_checkpoint != -1):
        #    print("loading checkpoint: ",filename_h5, " to client ",self.clientId)
        #    print("loading", latest_round_file)
        #    self.model = tf.keras.models.load_weights(filename_h5)
            
        self.model.fit(self.X_train_data, self.y_train_label, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,verbose=VERBOSE)

        # Evaluate local model parameters on the local test data
        loss, accuracy = self.model.evaluate(X_test_data, y_test_label,verbose=VERBOSE)

        # print model results
        evaluate_and_save_results(self.model,X_test_data, y_test_label, self.round_index, self.clientId,"proxy_data_after_fit",loss)
        return self.model.get_weights(), len(self.X_train_data), {}
    

Declarating client function


In [31]:
def get_evaluate_fn( model):
    
    def evaluate(
        server_round: int, parameters: NDArrays, config: Dict[str, Scalar]
    ) -> Optional[Tuple[float, Dict[str, Scalar]]]:
        
        current_round = server_round
        
        if(last_round_checkpoint > -1):
            current_round = server_round + last_round_checkpoint
            
        print("Evaluating global model round",current_round)
        
        model.set_weights(parameters)
        
        # Evaluate local model parameters on the local test data
        loss, accuracy = model.evaluate(X_test_data, y_test_label,verbose=VERBOSE)

        # only saves if the server_round + last_round_checkpoint != last_round_checkpoint to avaid duble metrics
        if(current_round > last_round_checkpoint):
            # print model results
            evaluate_and_save_results(model,X_test_data, y_test_label, current_round, -1,"global_model_metrics_after_agregation",loss)

            #checkpoint
            fileName = f"{checkPointFolder}/round-{current_round}-weights.h5"
            model.save_weights(fileName)
        else:
            print("Round already evaluated")
        
        return loss, { 'accuracy': accuracy }
    return evaluate

In [32]:
import ray
import math
# Create an instance of the model and get the parameters

# Specify client resources if you need GPU (defaults to 1 CPU and 0 GPU)
client_resources = None
#if DEVICE.type == "cuda":

client_resources = {"num_cpus": 1}

#keras_model = create_keras_model()
keras_model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])



def client_fn(cid) -> FlowerISABELASleepClient:
    print("starting client: "+str(cid),type(cid))
    #convert client ID to int
    clientId = int(cid)
    print("starting client: ", type(clientId))

    data   = clientList[clientId][inputFeatures]
    labels = clientList[clientId][outputClasses]
    
    print("Creating client model to client: "+str(cid))
    print("Data X: "+str(len(data)))
    print("Data Y: "+str(len(labels)))
    
    file_global_model = outputFolder+"/global_model_metrics_after_agregation.csv"
    index_round = 0 
    
    # get last
    if(os.path.isfile(file_global_model)):
        dataset = pd.read_csv(file_global_model)
        index_round = dataset["round"].max() + 1
        del dataset
    
    # update the index round in the previous checkpoint
    if(last_round_checkpoint > -1 and (index_round == last_round_checkpoint)):
        index_round = last_round_checkpoint
    
    print("Creating client model to client: "+str(cid),"round",index_round)
    # Load and compile a Keras model for CIFAR-10
    model = create_keras_model()
    model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
    
    proxyClient = FlowerISABELASleepClient(clientId,model,data,labels,index_round)
    
    return proxyClient

strategy = SaveModelStrategy(
    min_available_clients=NUMBER_OF_CLIENTS,
    evaluate_fn=get_evaluate_fn(keras_model)
) # (same arguments as FedAvg here)

# load checkpoint
if(filename_h5 is not None):
    
    #npzFile = np.load(filename_np)
    keras_model.load_weights(filename_h5)
    
    initial_parameters = keras_model.get_weights() 
    # Convert the weights (np.ndarray) to parameters (bytes)
    init_param = fl.common.ndarrays_to_parameters(initial_parameters)

    strategy = SaveModelStrategy(
        min_available_clients=NUMBER_OF_CLIENTS,
        evaluate_fn=get_evaluate_fn(keras_model),
        initial_parameters = init_param
    ) # (same arguments as FedAvg here)

# Start simulation
fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUMBER_OF_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=NUMBER_OF_ITERATIONS),  # Just three rounds
    client_resources=client_resources,
    strategy = strategy
)

INFO flwr 2024-02-15 18:03:49,890 | app.py:178 | Starting Flower simulation, config: ServerConfig(num_rounds=5, round_timeout=None)
2024-02-15 18:03:51,824	INFO worker.py:1621 -- Started a local Ray instance.
INFO flwr 2024-02-15 18:03:52,946 | app.py:213 | Flower VCE: Ray initialized with resources: {'CPU': 16.0, 'node:__internal_head__': 1.0, 'node:172.30.126.159': 1.0, 'object_store_memory': 2084240179.0, 'memory': 4168480359.0}
INFO flwr 2024-02-15 18:03:52,947 | app.py:219 | Optimize your simulation with Flower VCE: https://flower.dev/docs/framework/how-to-run-simulations.html
INFO flwr 2024-02-15 18:03:52,948 | app.py:242 | Flower VCE: Resources for each Virtual Client: {'num_cpus': 1}
INFO flwr 2024-02-15 18:03:52,963 | app.py:288 | Flower VCE: Creating VirtualClientEngineActorPool with 16 actors
INFO flwr 2024-02-15 18:03:52,965 | server.py:89 | Initializing global parameters
INFO flwr 2024-02-15 18:03:52,966 | server.py:276 | Requesting initial parameters from one random clien

[2m[36m(DefaultActor pid=24189)[0m starting client: 4 <class 'str'>
[2m[36m(DefaultActor pid=24189)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24189)[0m Creating client model to client: 4
[2m[36m(DefaultActor pid=24189)[0m Data X: 2753
[2m[36m(DefaultActor pid=24189)[0m Data Y: 2753
[2m[36m(DefaultActor pid=24189)[0m Creating client model to client: 4 round 0
Evaluating global model round 0


[2m[36m(DefaultActor pid=24189)[0m   client = check_clientfn_returns_client(client_fn(cid))


2


INFO flwr 2024-02-15 18:04:05,787 | server.py:94 | initial parameters (loss, other metrics): 0.6753955483436584, {'accuracy': 0.6329399347305298}
INFO flwr 2024-02-15 18:04:05,788 | server.py:104 | FL starting
DEBUG flwr 2024-02-15 18:04:05,789 | server.py:222 | fit_round 1: strategy sampled 19 clients (out of 19)
[2m[36m(DefaultActor pid=24189)[0m   client = check_clientfn_returns_client(client_fn(cid))


[2m[36m(DefaultActor pid=24189)[0m starting client: 15 <class 'str'>
[2m[36m(DefaultActor pid=24189)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24189)[0m Creating client model to client: 15
[2m[36m(DefaultActor pid=24189)[0m Data X: 21669
[2m[36m(DefaultActor pid=24189)[0m Data Y: 21669
[2m[36m(DefaultActor pid=24189)[0m Creating client model to client: 15 round 0
[2m[36m(DefaultActor pid=24187)[0m starting client: 10 <class 'str'>
[2m[36m(DefaultActor pid=24187)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24187)[0m Creating client model to client: 10
[2m[36m(DefaultActor pid=24187)[0m Data X: 26020
[2m[36m(DefaultActor pid=24187)[0m Data Y: 26020
[2m[36m(DefaultActor pid=24187)[0m Creating client model to client: 10 round 0


[2m[36m(DefaultActor pid=24187)[0m   client = check_clientfn_returns_client(client_fn(cid))


[2m[36m(DefaultActor pid=24189)[0m 2
[2m[36m(DefaultActor pid=24171)[0m starting client: 13 <class 'str'>[32m [repeated 14x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(DefaultActor pid=24171)[0m starting client:  <class 'int'>[32m [repeated 14x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 13[32m [repeated 14x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Data X: 23244[32m [repeated 14x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Data Y: 23244[32m [repeated 14x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 13 round 0[32m [repeated 14x across cluster][0m


ERROR flwr 2024-02-15 18:04:13,877 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:04:13,882 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:04:21,984 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

[2m[36m(DefaultActor pid=24174)[0m 2[32m [repeated 16x across cluster][0m


ERROR flwr 2024-02-15 18:04:28,147 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

DEBUG flwr 2024-02-15 18:04:29,913 | server.py:236 | fit_round 1 received 12 results and 7 failures


Saving round 1 aggregated_ndarrays...
result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-1-weights.npz

Evaluating global model round 1


INFO flwr 2024-02-15 18:04:36,627 | server.py:125 | fit progress: (1, 0.7319059371948242, {'accuracy': 0.7041471600532532}, 30.83866578800007)
DEBUG flwr 2024-02-15 18:04:36,629 | server.py:173 | evaluate_round 1: strategy sampled 19 clients (out of 19)


2


[2m[36m(DefaultActor pid=24171)[0m   _warn_prf(average, modifier, msg_start, len(result))
[2m[36m(DefaultActor pid=24186)[0m   client = check_clientfn_returns_client(client_fn(cid))
[2m[36m(DefaultActor pid=24171)[0m   client = check_clientfn_returns_client(client_fn(cid))


[2m[36m(DefaultActor pid=24171)[0m starting client: 12 <class 'str'>
[2m[36m(DefaultActor pid=24171)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 12
[2m[36m(DefaultActor pid=24171)[0m Data X: 31873
[2m[36m(DefaultActor pid=24171)[0m Data Y: 31873
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 12 round 0
[2m[36m(DefaultActor pid=24186)[0m 2[32m [repeated 11x across cluster][0m


ERROR flwr 2024-02-15 18:04:37,233 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:04:37,239 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 388fabd5122dfa924f937a4401000000, name=DefaultActor.__init__, pid=24184, memory used=0.33GB) was running was 7.07GB / 7.44GB (0.950416), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.12	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24184	0.33	ray::DefaultActor.run
24186	0

DEBUG flwr 2024-02-15 18:04:37,267 | server.py:187 | evaluate_round 1 received 0 results and 19 failures
DEBUG flwr 2024-02-15 18:04:37,268 | server.py:222 | fit_round 2: strategy sampled 19 clients (out of 19)
ERROR flwr 2024-02-15 18:04:38,160 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get

ERROR flwr 2024-02-15 18:04:38,237 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: d581abe9f194281cae040ffe01000000, name=DefaultActor.__init__, pid=24187, memory used=0.30GB) was running was 7.07GB / 7.44GB (0.950222), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.10	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.31	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:04:38,286 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:04:38,292 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:04:38,299 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

[2m[36m(DefaultActor pid=24185)[0m starting client: 12 <class 'str'>[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m starting client:  <class 'int'>[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Creating client model to client: 12[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Data X: 31873[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Data Y: 31873[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Creating client model to client: 12 round 0[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m 2[32m [repeated 10x across cluster][0m


[2m[33m(raylet)[0m [2024-02-15 18:04:51,751 E 23772 23772] (raylet) node_manager.cc:3084: 6 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67, IP: 172.30.126.159) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.30.126.159`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(DefaultActor pid=24186)[0m 2
[2m[36m(DefaultActor pid=24181)[0m 2


DEBUG flwr 2024-02-15 18:04:56,508 | server.py:236 | fit_round 2 received 10 results and 9 failures


Saving round 2 aggregated_ndarrays...
result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-2-weights.npz

Evaluating global model round 2


INFO flwr 2024-02-15 18:05:03,523 | server.py:125 | fit progress: (2, 0.8276991248130798, {'accuracy': 0.7227329611778259}, 57.73447790199998)
DEBUG flwr 2024-02-15 18:05:03,525 | server.py:173 | evaluate_round 2: strategy sampled 19 clients (out of 19)


2


[2m[36m(DefaultActor pid=24182)[0m   client = check_clientfn_returns_client(client_fn(cid))[32m [repeated 25x across cluster][0m
[2m[36m(DefaultActor pid=24179)[0m   _warn_prf(average, modifier, msg_start, len(result))


[2m[36m(DefaultActor pid=24171)[0m starting client: 5 <class 'str'>
[2m[36m(DefaultActor pid=24171)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 5
[2m[36m(DefaultActor pid=24171)[0m Data X: 26567
[2m[36m(DefaultActor pid=24171)[0m Data Y: 26567
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 5 round 0
[2m[36m(DefaultActor pid=24185)[0m 2[32m [repeated 8x across cluster][0m


ERROR flwr 2024-02-15 18:05:03,936 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:03,948 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:03,954 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

DEBUG flwr 2024-02-15 18:05:03,982 | server.py:187 | evaluate_round 2 received 0 results and 19 failures
DEBUG flwr 2024-02-15 18:05:03,983 | server.py:222 | fit_round 3: strategy sampled 19 clients (out of 19)
ERROR flwr 2024-02-15 18:05:04,517 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get

ERROR flwr 2024-02-15 18:05:04,522 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 1aabde1e63fd801f3a80b1b001000000, name=DefaultActor.__init__, pid=24178, memory used=0.33GB) was running was 7.07GB / 7.44GB (0.950049), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 5520f083efbdea86985d6fe1a9f8ec4a8d92792d52c2c3639df180d8) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-5520f083efbdea86985d6fe1a9f8ec4a8d92792d52c2c3639df180d8*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:04,528 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: d581abe9f194281cae040ffe01000000, name=DefaultActor.__init__, pid=24187, memory used=0.30GB) was running was 7.07GB / 7.44GB (0.950222), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.10	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.31	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:04,535 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:04,542 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: d581abe9f194281cae040ffe01000000, name=DefaultActor.__init__, pid=24187, memory used=0.30GB) was running was 7.07GB / 7.44GB (0.950222), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.10	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.31	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:06,868 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 8080131699fdec87227ce5a101000000, name=DefaultActor.__init__, pid=24181, memory used=0.39GB) was running was 7.07GB / 7.44GB (0.950481), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 8f9f4202b7c08e54248e55344f63a4dff0d1bcebccfc900a981c5fe4) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-8f9f4202b7c08e54248e55344f63a4dff0d1bcebccfc900a981c5fe4*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.84	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24171	0.40	ray::DefaultActor.run
24182	0

[2m[36m(DefaultActor pid=24176)[0m starting client: 2 <class 'str'>[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m starting client:  <class 'int'>[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m Creating client model to client: 2[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m Data X: 3383[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m Data Y: 3383[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m Creating client model to client: 2 round 0[32m [repeated 22x across cluster][0m
[2m[36m(DefaultActor pid=24176)[0m 2[32m [repeated 10x across cluster][0m


DEBUG flwr 2024-02-15 18:05:24,128 | server.py:236 | fit_round 3 received 9 results and 10 failures


Saving round 3 aggregated_ndarrays...
result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-3-weights.npz

Evaluating global model round 3


INFO flwr 2024-02-15 18:05:30,994 | server.py:125 | fit progress: (3, 0.8657560348510742, {'accuracy': 0.7227032780647278}, 85.20560923000016)
DEBUG flwr 2024-02-15 18:05:30,996 | server.py:173 | evaluate_round 3: strategy sampled 19 clients (out of 19)


2


[2m[36m(DefaultActor pid=24179)[0m   client = check_clientfn_returns_client(client_fn(cid))[32m [repeated 23x across cluster][0m


[2m[36m(DefaultActor pid=24186)[0m starting client: 5 <class 'str'>
[2m[36m(DefaultActor pid=24186)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24186)[0m Creating client model to client: 5
[2m[36m(DefaultActor pid=24186)[0m Data X: 26567
[2m[36m(DefaultActor pid=24186)[0m Data Y: 26567
[2m[36m(DefaultActor pid=24186)[0m Creating client model to client: 5 round 0
[2m[36m(DefaultActor pid=24186)[0m 2[32m [repeated 8x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m starting client: 17 <class 'str'>
[2m[36m(DefaultActor pid=24171)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 17
[2m[36m(DefaultActor pid=24171)[0m Data X: 22059
[2m[36m(DefaultActor pid=24171)[0m Data Y: 22059
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 17 round 0


ERROR flwr 2024-02-15 18:05:31,416 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:31,418 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: c15383f49961ecb4a37d265501000000, name=DefaultActor.__init__, pid=24180, memory used=0.37GB) was running was 7.12GB / 7.44GB (0.95787), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: b5d7f31e8e1c5600a4998934a7df6c8f6b9f159ae5138dfa91eb3fcf) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-b5d7f31e8e1c5600a4998934a7df6c8f6b9f159ae5138dfa91eb3fcf*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.83	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24186	0.38	ray::DefaultActor.run
24171	0.

ERROR flwr 2024-02-15 18:05:31,421 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: b69812a30d49bad7a23056ae01000000, name=DefaultActor.__init__, pid=24177, memory used=0.36GB) was running was 7.07GB / 7.44GB (0.950478), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor
24182	0.37	

ERROR flwr 2024-02-15 18:05:31,428 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:32,227 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 1aabde1e63fd801f3a80b1b001000000, name=DefaultActor.__init__, pid=24178, memory used=0.33GB) was running was 7.07GB / 7.44GB (0.950049), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 5520f083efbdea86985d6fe1a9f8ec4a8d92792d52c2c3639df180d8) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-5520f083efbdea86985d6fe1a9f8ec4a8d92792d52c2c3639df180d8*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:32,235 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: b69812a30d49bad7a23056ae01000000, name=DefaultActor.__init__, pid=24177, memory used=0.36GB) was running was 7.07GB / 7.44GB (0.950478), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor
24182	0.37	

ERROR flwr 2024-02-15 18:05:32,287 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 388fabd5122dfa924f937a4401000000, name=DefaultActor.__init__, pid=24184, memory used=0.33GB) was running was 7.07GB / 7.44GB (0.950416), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.12	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24184	0.33	ray::DefaultActor.run
24186	0

ERROR flwr 2024-02-15 18:05:32,295 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:32,306 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: b69812a30d49bad7a23056ae01000000, name=DefaultActor.__init__, pid=24177, memory used=0.36GB) was running was 7.07GB / 7.44GB (0.950478), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor
24182	0.37	

[2m[36m(DefaultActor pid=24182)[0m 2[32m [repeated 9x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m starting client: 17 <class 'str'>[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m starting client:  <class 'int'>[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Creating client model to client: 17[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Data X: 22059[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Data Y: 22059[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24185)[0m Creating client model to client: 17 round 0[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m 2
[2m[36m(DefaultActor pid=24179)[0m 2


DEBUG flwr 2024-02-15 18:05:50,569 | server.py:236 | fit_round 4 received 9 results and 10 failures


Saving round 4 aggregated_ndarrays...
result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-4-weights.npz

Evaluating global model round 4


[2m[33m(raylet)[0m [2024-02-15 18:05:51,752 E 23772 23772] (raylet) node_manager.cc:3084: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67, IP: 172.30.126.159) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.30.126.159`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
INFO flwr 2024-02-15 18:05:57,475 | server.py:125 | fit progress: (4, 0.90

2


[2m[36m(DefaultActor pid=24182)[0m   client = check_clientfn_returns_client(client_fn(cid))[32m [repeated 21x across cluster][0m


[2m[36m(DefaultActor pid=24182)[0m starting client: 7 <class 'str'>
[2m[36m(DefaultActor pid=24182)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24182)[0m Creating client model to client: 7
[2m[36m(DefaultActor pid=24182)[0m Data X: 33903
[2m[36m(DefaultActor pid=24182)[0m Data Y: 33903
[2m[36m(DefaultActor pid=24182)[0m Creating client model to client: 7 round 0
[2m[36m(DefaultActor pid=24172)[0m 2[32m [repeated 7x across cluster][0m
[2m[36m(DefaultActor pid=24172)[0m starting client: 2 <class 'str'>
[2m[36m(DefaultActor pid=24172)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24172)[0m Creating client model to client: 2
[2m[36m(DefaultActor pid=24172)[0m Data X: 3383
[2m[36m(DefaultActor pid=24172)[0m Data Y: 3383
[2m[36m(DefaultActor pid=24172)[0m Creating client model to client: 2 round 0


ERROR flwr 2024-02-15 18:05:57,891 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:57,896 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:57,906 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:57,911 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: d581abe9f194281cae040ffe01000000, name=DefaultActor.__init__, pid=24187, memory used=0.30GB) was running was 7.07GB / 7.44GB (0.950222), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-ca86cd785378531d2b883fd9a151213ec5b7e9009ad88559c186049f*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.10	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.31	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:58,466 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:58,521 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: b69812a30d49bad7a23056ae01000000, name=DefaultActor.__init__, pid=24177, memory used=0.36GB) was running was 7.07GB / 7.44GB (0.950478), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-eb87cbf60b1e6dc3a911ba590051b73508b005f44b4692d9a1345e71*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.13	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24189	0.37	ray::DefaultActor
24182	0.37	

ERROR flwr 2024-02-15 18:05:58,534 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:05:58,541 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 8080131699fdec87227ce5a101000000, name=DefaultActor.__init__, pid=24181, memory used=0.39GB) was running was 7.07GB / 7.44GB (0.950481), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 8f9f4202b7c08e54248e55344f63a4dff0d1bcebccfc900a981c5fe4) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-8f9f4202b7c08e54248e55344f63a4dff0d1bcebccfc900a981c5fe4*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.84	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24171	0.40	ray::DefaultActor.run
24182	0

ERROR flwr 2024-02-15 18:05:58,554 | ray_client_proxy.py:162 | Task was killed due to the node running low on memory.
Memory on the node (IP: 172.30.126.159, ID: 6a6218aaeb17a738f81f0d76aa128037ca9b5ff2352dd37d55158a67) where the task (actor ID: 388fabd5122dfa924f937a4401000000, name=DefaultActor.__init__, pid=24184, memory used=0.33GB) was running was 7.07GB / 7.44GB (0.950416), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.30.126.159`. To see the logs of the worker, use `ray logs worker-7183a21e8c6dc6d32f920bd6e9b8a0be3c757fc658e03caf33a1bb11*out -ip 172.30.126.159. Top 10 memory users:
PID	MEM(GB)	COMMAND
23616	1.12	/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/bin/python -m ipykernel_launcher -f /h...
24184	0.33	ray::DefaultActor.run
24186	0

[2m[36m(DefaultActor pid=24171)[0m 2[32m [repeated 10x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m starting client: 2 <class 'str'>[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m starting client:  <class 'int'>[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 2[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Data X: 3383[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Data Y: 3383[32m [repeated 19x across cluster][0m
[2m[36m(DefaultActor pid=24171)[0m Creating client model to client: 2 round 0[32m [repeated 19x across cluster][0m


DEBUG flwr 2024-02-15 18:06:16,313 | server.py:236 | fit_round 5 received 9 results and 10 failures


Saving round 5 aggregated_ndarrays...
result_unbalanced_epoch_1_rounds_5_cycle_1/checkpoints/round-5-weights.npz

Evaluating global model round 5


INFO flwr 2024-02-15 18:06:23,093 | server.py:125 | fit progress: (5, 0.8721485733985901, {'accuracy': 0.7343499660491943}, 137.3044667480001)
DEBUG flwr 2024-02-15 18:06:23,095 | server.py:173 | evaluate_round 5: strategy sampled 19 clients (out of 19)


2


[2m[36m(DefaultActor pid=24182)[0m   client = check_clientfn_returns_client(client_fn(cid))[32m [repeated 21x across cluster][0m


[2m[36m(DefaultActor pid=24185)[0m 2[32m [repeated 8x across cluster][0m
[2m[36m(DefaultActor pid=24182)[0m starting client: 15 <class 'str'>
[2m[36m(DefaultActor pid=24182)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24182)[0m Creating client model to client: 15
[2m[36m(DefaultActor pid=24182)[0m Data X: 21669
[2m[36m(DefaultActor pid=24182)[0m Data Y: 21669
[2m[36m(DefaultActor pid=24182)[0m Creating client model to client: 15 round 0
[2m[36m(DefaultActor pid=24179)[0m starting client: 5 <class 'str'>
[2m[36m(DefaultActor pid=24179)[0m starting client:  <class 'int'>
[2m[36m(DefaultActor pid=24179)[0m Creating client model to client: 5
[2m[36m(DefaultActor pid=24179)[0m Data X: 26567
[2m[36m(DefaultActor pid=24179)[0m Data Y: 26567
[2m[36m(DefaultActor pid=24179)[0m Creating client model to client: 5 round 0


ERROR flwr 2024-02-15 18:06:23,510 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:06:23,516 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:06:23,523 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

ERROR flwr 2024-02-15 18:06:23,531 | ray_client_proxy.py:161 | Traceback (most recent call last):
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_client_proxy.py", line 151, in _submit_job
    res, updated_context = self.actor_pool.get_client_result(self.cid, timeout)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 425, in get_client_result
    return self._fetch_future_result(cid)
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/flwr/simulation/ray_transport/ray_actor.py", line 306, in _fetch_future_result
    res_cid, res, updated_context = ray.get(
  File "/home/guilherme/cpu-tensorflow-marcelo/nvidia-smi/envs/flower/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/gui

History (loss, centralized):
	round 0: 0.6753955483436584
	round 1: 0.7319059371948242
	round 2: 0.8276991248130798
	round 3: 0.8657560348510742
	round 4: 0.9044705033302307
	round 5: 0.8721485733985901
History (metrics, centralized):
{'accuracy': [(0, 0.6329399347305298), (1, 0.7041471600532532), (2, 0.7227329611778259), (3, 0.7227032780647278), (4, 0.7454184293746948), (5, 0.7343499660491943)]}

In [33]:
clientList[0][inputFeatures]

[2m[36m(DefaultActor pid=24182)[0m   client = check_clientfn_returns_client(client_fn(cid))


Unnamed: 0,activity,location,day_of_week,light,phone_lock,proximity,sound,time_to_next_alarm,minutes_day
0,0.00,0.0,0.000000,0.000617,0.0,1.0,0.000000,0.982143,0.597637
1,0.00,0.0,0.000000,0.000617,0.0,1.0,0.000000,0.982143,0.597637
2,0.25,0.5,0.000000,0.000583,0.0,1.0,0.604408,0.982044,0.598332
3,0.25,0.5,0.000000,0.005117,0.0,1.0,0.604408,0.982044,0.598332
4,0.25,0.5,0.000000,0.000700,0.0,1.0,0.601849,0.981944,0.599027
...,...,...,...,...,...,...,...,...,...
17988,0.25,1.0,0.166667,0.000000,1.0,0.0,0.586128,0.983333,0.551077
17989,0.00,1.0,0.166667,0.000000,1.0,0.0,0.572395,0.983234,0.551772
17990,0.75,1.0,0.166667,0.000000,1.0,0.0,0.572395,0.983135,0.552467
17991,0.00,1.0,0.166667,0.000000,1.0,0.0,0.615209,0.983135,0.553162


In [117]:
clientList[0][outputClasses]

Unnamed: 0,awake,asleep
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True
...,...,...
17988,False,True
17989,False,True
17990,False,True
17991,False,True
