<a href="https://colab.research.google.com/github/damjimenezgu/Click-Here-to-Find-My_Repositories/blob/main/Federated-Learning/ECG/01_Sliders_for_distribution_and_number_of_nodes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [9]:
from __future__ import print_function
import numpy as np
import random
import cv2
import os
from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K

from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

# from fl_mnist_implementation_tutorial_utils import *


import os
import sys
import time
import pickle

import numpy as np, os, sys, joblib

import pandas as pd
from tqdm.auto import tqdm

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style="darkgrid", rc = {'figure.figsize':(5,7)})

from sklearn.impute import SimpleImputer

import plotly.graph_objects as go

from sklearn.model_selection import StratifiedKFold

# Garbage Collector - use it like gc.collect()
import gc

# Custom Callback To Include in Callbacks List At Training Time
class GarbageCollectorCallback(tf.keras.callbacks.Callback):
    """Keras callback that runs Python's garbage collector after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # `epoch` and `logs` are part of the Keras hook signature; they are
        # not used here.
        gc.collect()

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from keras.layers import LSTM

from google.colab import drive
# Mount drive from Google
drive.mount('/content/gdrive')

import math
from ipywidgets import interact, interactive, fixed, interact_manual, FloatSlider, Layout, IntSlider, FloatLogSlider
import ipywidgets as widgets
import matplotlib.ticker as mticker

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Define parameters

In [10]:
# Main path (project root on the mounted Google Drive)
root_path = '/content/gdrive/MyDrive/ECG-MSc-Thesis/'
# IPython magic: change the notebook's working directory to the project root
%cd $root_path

# Sub-folders derived from the project root
code_path = root_path + 'code'
models_path = root_path + 'models/Daniel/federated_models'
output_path = root_path + 'output/Daniel/federated_outputs'
data_path = root_path + 'data'

# Define random state for reproducibility (read as a global by the functions below)
random_state = 0

# root_path = '/content/gdrive/MyDrive/ECG-MSc-Thesis/code/Daniel/imbalanced_data_workaround/'

# Define colors to use in plots (the plots below index into this list, e.g. colors[0], colors[2], colors[6])
colors = ["#00cfcc","#e6013b","#007f88","#00cccd","#69e0da","darkblue","#FFFFFF"]

/content/gdrive/MyDrive/ECG-MSc-Thesis


# Define functions

In [11]:
def NormalizeData(value, min_val = 0, max_val = 1):
    """Min-max rescale ``value`` from the range [min_val, max_val] to [0, 1].

    Args:
        value: number (or array-like supporting arithmetic) to rescale.
        min_val: lower bound of the source range.
        max_val: upper bound of the source range.

    Returns:
        ``(value - min_val) / (max_val - min_val)``.
    """
    offset = value - min_val
    span = max_val - min_val
    return offset / span

# Calculates the JSD for multiple probability distributions
def jsd(prob_dists):
    """Return the Jensen-Shannon divergence of several probability distributions.

    The divergence is computed with uniform weights as the entropy of the
    mixture minus the mean of the individual entropies (base-2 logarithm).
    When more than two distributions are given, the value is rescaled by
    ``log2(len(prob_dists))``.

    Args:
        prob_dists: sequence of equally long probability vectors
            (lists or 1-D numpy arrays).

    Returns:
        The divergence as a non-negative float.
    """
    n_dists = len(prob_dists)
    weight = 1 / n_dists                    # Set weights to be uniform
    mixture = np.zeros(len(prob_dists[0]))
    mean_entropy = 0
    for dist in prob_dists:
        mixture += np.array(dist) * weight
        mean_entropy += weight * entropy(dist, normalize=False)

    divergence = entropy(mixture, normalize=False) - mean_entropy

    if n_dists > 2:
        divergence = NormalizeData(divergence, min_val = 0, max_val = math.log2(n_dists))

    # The divergence is mathematically >= 0, but floating-point round-off can
    # leave a tiny negative value when the distributions are (nearly)
    # identical. Callers take np.sqrt() of the result, which would then be NaN.
    return max(divergence, 0.0)

# Entropy function
def entropy(prob_dist, normalize=True):
    """Return the Shannon entropy (base 2) of a discrete distribution.

    Args:
        prob_dist: sequence of probabilities (list, tuple or 1-D numpy
            array). Entries equal to zero are skipped.
        normalize: when True, divide by ``log2(len(prob_dist))`` so that a
            uniform distribution yields 1.

    Returns:
        The (optionally normalized) entropy. With ``normalize=True`` a
        distribution with fewer than two outcomes yields 0.0, because the
        maximum attainable entropy is zero and the ratio is undefined.
    """
    raw_entropy = -sum(p * math.log2(p) for p in prob_dist if p != 0)
    if not normalize:
        return raw_entropy

    # len() works for lists as well as numpy arrays (``.shape`` does not).
    n_outcomes = len(prob_dist)
    if n_outcomes <= 1:
        return 0.0
    return raw_entropy / math.log2(n_outcomes)

from scipy.stats import dirichlet

def load(paths, verbose=-1):
    '''Read grayscale images and derive each label from its parent directory.

    Expects the images of each class in a separate directory, e.g. all
    digits of class 0 in the directory named 0.

    args:
        paths: sequence of image file paths
        verbose: if > 0, print a progress line every `verbose` images
    return:
        (data, labels) - data holds the flattened images divided by 255,
        labels holds the name of each image's parent directory
    '''
    data, labels = [], []
    for count, imgpath in enumerate(paths, start=1):
        # read as single-channel image and flatten it into a 1-D vector
        pixels = np.array(cv2.imread(imgpath, cv2.IMREAD_GRAYSCALE)).flatten()
        # the class label is the second-to-last path component
        class_name = imgpath.split(os.path.sep)[-2]
        data.append(pixels/255)
        labels.append(class_name)
        # progress report (never emitted for the very first image)
        if verbose > 0 and count > 1 and count % verbose == 0:
            print("[INFO] processed {}/{}".format(count, len(paths)))
    return data, labels






def randomList(m, n, random_state_fn):
    """Distribute ``n`` unit increments over a list of ``m`` counters.

    For each increment the global NumPy RNG is re-seeded (starting at
    ``random_state_fn`` and advancing by 100 per step), one Poisson(30)
    value is drawn, and the counter at ``draw % m`` is incremented.

    Args:
        m: number of counters in the returned list.
        n: total number of increments, i.e. the sum of the returned list.
        random_state_fn: seed used for the first draw.

    Returns:
        A list of ``m`` non-negative ints whose sum is ``n``.
    """
    counts = [0] * m
    for step in range(n):
        # Seed sequence: random_state_fn, random_state_fn + 100, ...
        np.random.seed(random_state_fn + 100 * step)
        bucket = np.random.poisson(30, 1)[0] % m
        counts[bucket] += 1
    return counts
    
def create_clients(image_list, label_list, num_clients=5, initial='clients', oversampled_data=False, generate_iid = True):
    ''' Split a dataset into one data shard per federated client.

        Three strategies are implemented:
          * oversampled_data=False, generate_iid=True: shuffle the
            (sample, label) pairs with the global `random_state` and cut them
            into `num_clients` contiguous shards of equal size.
          * oversampled_data=False, generate_iid=False: for every client,
            draw per-class counts with `randomList`, turn them into sampling
            weights and draw the shard with `DataFrame.sample`.
          * oversampled_data=True: use the test folds of a `StratifiedKFold`
            split as shards.

        args: 
            image_list: a list of numpy arrays of training images
            label_list: a list of labels, one per entry of image_list
            num_clients: number of federated members (clients)
            initial: the clients' name prefix, e.g. clients_1
            oversampled_data: select the stratified k-fold strategy
            generate_iid: when oversampled_data is False, choose between the
                shuffled equal split (True) and the weighted sampling (False)

        return: a tuple (clients, ids_list) where
            clients: dictionary mapping each client name to its shard, a list
                of (sample, label) tuples
            ids_list: one entry per client holding the index of the sampled
                rows; only filled by the weighted-sampling branch, otherwise
                an empty list
    '''
    #create a list of client names
    client_names = ['{}_{}'.format(initial, i+1) for i in range(num_clients)]
    num_classes = len(np.unique(label_list))

    #pair every sample with its label (any shuffling happens in the branches below)
    data = list(zip(image_list, label_list))
    #shard size per client (integer division)
    size = len(data)//num_clients
    # List to append the position of the recordings extracted
    ids_list = []    
    
    if oversampled_data == False:
      if generate_iid:
        # Seeded in-place shuffle, then cut into num_clients slices of `size`
        # elements; elements beyond size*num_clients are not assigned.
        random.seed(random_state)
        random.shuffle(data)
    
        shards = [data[i:i + size] for i in range(0, size*num_clients, size)]

        #number of clients must equal number of shards
        assert(len(shards) == len(client_names))
      else:
        shards = []
        # Seed used for this client; advanced by 100 after every client
        random_state_loop = random_state

        # List to append the position of the recordings extracted
        ids_list = []

        for i in range(num_clients):
          
          # Per-class counts summing to `size`
          take_from = randomList(num_classes, size, random_state_loop)
          
          classes = np.unique(label_list)
          
          # Shuffle the class order so the counts are paired with classes in a seeded random order
          random.seed(random_state_loop)
          random.shuffle(classes)
          
          freq = pd.DataFrame({'class':classes,
                      'nostoextract': take_from })
          
          data_df = pd.DataFrame(data)

          # Rename the last column (the label) to 'class'
          data_df.columns = [*data_df.columns[:-1], 'class']

          # Sampling weight of each class: its count divided by the shard size
          w = list(freq.nostoextract/size)
          
          # Per-row weight: look up the weight of the row's class
          p = [v for i in list(data_df['class']) for k, v in zip(list(freq['class']),w) if k == i]
          
          data_df['p'] = p
          # Draw `size` rows using the per-row weights
          sample = data_df.sample(n=size, weights=data_df.p, random_state=random_state_loop)
          
          # Get the index (name) of the recordings sampled
          ids_list.append(sample.index)
          
          del data_df['p']
          random_state_loop += 100
          
          # Column 0 holds the samples, column 1 the labels
          X = sample.iloc[:, 0].values.tolist()
          y = sample.iloc[:, 1].values.tolist()

          shards.append(list(zip(X,y)))

    else:
      shards = []

      skf = StratifiedKFold(n_splits=num_clients)
      skf.get_n_splits(image_list, label_list)

      X = np.array(image_list)
      y = np.array(label_list)
      # StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
      for train_index, test_index in skf.split(X, y):
        
        X_train_ver, X_test_ver = X[train_index], X[test_index]
        y_train_ver, y_test_ver = y[train_index], y[test_index]
        X_test_ver = list(X_test_ver)
        y_test_ver = list(y_test_ver)

        # Only the test fold of each split becomes a client shard
        shards.append(list(zip(X_test_ver,y_test_ver)))

    return {client_names[i] : shards[i] for i in range(len(client_names))}, ids_list




def batch_data(data_shard, bs=32, for_lstm = False):
    '''Turn a client's data shard into a shuffled, batched tf.data.Dataset.

    args:
        data_shard: iterable of (data, label) pairs constituting a client's
            shard; with for_lstm=True it must offer .shape and .reshape
        bs: batch size
        for_lstm: when True, the shard is first reshaped to
            (shape[0], shape[2]) and the data part is later reshaped to
            (n, 1, width) before the dataset is built
    return:
        tf.data.Dataset shuffled with a buffer of len(labels) and batched by bs
    '''
    if for_lstm:
        n_rows, width = data_shard.shape[0], data_shard.shape[2]
        data_shard = data_shard.reshape(n_rows, width).tolist()

    # split the shard into parallel tuples of samples and targets
    samples, targets = zip(*data_shard)

    if for_lstm:
        stacked = np.array(samples)
        samples = stacked.reshape(stacked.shape[0], 1, stacked.shape[1]).tolist()

    tensors = (list(samples), list(targets))
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    return dataset.shuffle(len(targets)).batch(bs)


class DNN:
    """Builder for a fully connected classifier with three hidden layers."""

    @staticmethod
    def build(shape, classes):
        """Create the (uncompiled) Keras model.

        Args:
            shape: number of input features; used as input_shape=(shape,).
            classes: number of units of the softmax output layer.

        Returns:
            A Sequential model with three Dense(500, relu) layers followed by
            a Dense(classes, softmax) layer.
        """
        # Seeded weight initializer shared by the hidden layers
        initializer = tf.keras.initializers.GlorotUniform(seed = random_state)
        hidden_kwargs = dict(activation='relu', kernel_initializer=initializer)

        model = Sequential()
        # The first hidden layer also declares the input shape
        model.add(Dense(500, input_shape = (shape,), **hidden_kwargs))
        for _ in range(2):
            model.add(Dense(500, **hidden_kwargs))

        # Output layer (uses the layer's default initializer)
        model.add(Dense(classes, activation="softmax"))
        return model


class myLSTM:
    """Builder for a small LSTM classifier."""

    @staticmethod
    def build(shape, classes):
      """Create the (uncompiled) Keras model.

      Args:
          shape: size of the last input axis; the model input is declared
              as (1, shape).
          classes: used both as the number of LSTM units and as the number
              of softmax output units.

      Returns:
          A Sequential model: Input -> LSTM(classes) -> Dense(classes, softmax).
      """
      # Define weights initializer
      # NOTE(review): `initializer` is created but never passed to any layer
      # below, so the layers use their default initializers - confirm
      # whether it was meant to be used as kernel_initializer.
      initializer = tf.keras.initializers.GlorotUniform(seed = random_state)
      # Define the model
      model = Sequential()
      model.add(tf.keras.layers.Input(shape=(1,shape,), dtype=tf.float32, ragged=True))
      model.add(LSTM(classes))
      # model.add(Dropout(0.05))

      model.add(Dense(classes, activation="softmax"))
      
      return model


def weight_scalling_factor(clients_trn_data, client_name):
    """Return the share of the training data held by one client.

    Counts are estimated as (number of batches) * (batch size), where the
    batch size is read from the first batch of `client_name`'s dataset.

    Args:
        clients_trn_data: dict mapping client names to batched datasets.
        client_name: key of the client whose share is computed.

    Returns:
        local_count / global_count.
    """
    def _n_batches(name):
        # number of batches in the dataset of client `name`
        return tf.data.experimental.cardinality(clients_trn_data[name]).numpy()

    #get the bs from the first batch of this client's dataset
    first_batch = list(clients_trn_data[client_name])[0]
    bs = first_batch[0].shape[0]

    #total training data points across all clients
    total_batches = sum([_n_batches(name) for name in list(clients_trn_data.keys())])
    global_count = total_batches*bs

    #data points held by the requested client
    local_count = _n_batches(client_name)*bs
    return local_count/global_count


def scale_model_weights(weight, scalar):
    '''Multiply every entry of a model's weight list by `scalar`.

    args:
        weight: indexable sequence of per-layer weights
        scalar: scaling factor applied to each entry
    return:
        a new list with the scaled entries, in the same order
    '''
    return [scalar * weight[idx] for idx in range(len(weight))]



def sum_scaled_weights(scaled_weight_list):
    '''Sum the clients' scaled weights layer by layer.

    This is equivalent to the scaled average of the weights.

    args:
        scaled_weight_list: one list of per-layer scaled weights per client
    return:
        list with one summed tensor per layer
    '''
    # zip(*...) groups the same layer of every client together
    return [
        tf.math.reduce_sum(layer_across_clients, axis=0)
        for layer_across_clients in zip(*scaled_weight_list)
    ]


def test_model(X_test, Y_test,  model):
    """Evaluate a classifier on a test set.

    Args:
        X_test: test features passed to ``model.predict``.
        Y_test: true class indices.
        model: model whose ``predict`` returns per-class scores; the
            predicted class is the arg-max over axis 1.

    Returns:
        Tuple ``(acc, pre, rec, f1s, ove)``: accuracy, weighted precision,
        weighted recall, weighted F1 and the mean of those four values.
    """
    y_pred = tf.argmax(model.predict(X_test), axis=1)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall and weights the classes by predicted
    # instead of true support.
    acc = accuracy_score(Y_test, y_pred)
    pre = precision_score(Y_test, y_pred, average='weighted', zero_division = 0)
    rec = recall_score(Y_test, y_pred, average='weighted', zero_division = 0)
    f1s = f1_score(Y_test, y_pred, average='weighted', zero_division = 0)
    ove = (acc + pre + rec + f1s) / 4

    return acc, pre, rec, f1s, ove

# Load data

In [12]:
# Import curated data (semicolon-separated CSV located in data_path)
df_selected_data = pd.read_csv(data_path + '/all_datasets_federated.csv', sep = ";")

# Get feature names: every column except 'db_name', 'label' and 'id'
feature_names = [x for x in df_selected_data.columns if x not in ['db_name', 'label', 'id']]

# Separate features and labels

In [13]:
# Get features (all columns whose name is in feature_names)
features = df_selected_data.loc[:,df_selected_data.columns.isin(feature_names)]

# Get labels as a plain Python list
labels = list(df_selected_data['label'])

# Scaling (normalizing) data

In [14]:
# Replace NaN values with mean values (SimpleImputer with its default settings)
imputer = SimpleImputer().fit(features)
features = imputer.transform(features)
# features.columns = feature_names

# Define the scaler (RobustScaler)
scaler = RobustScaler()
# Fit the scaler, transform the data and convert the result to a list of rows
features = scaler.fit_transform(features).tolist()
# features.columns = feature_names

# Sliders

In [15]:
# Define number of local nodes to be used
# NOTE(review): not referenced by the slider cells visible below, which take
# the node count from their own IntSlider - confirm where it is used.
local_nodes_glob = 4

In [16]:
def get_spaced_colors(n):
    """Return exactly ``n`` evenly spaced colors as '#rrggbb' hex strings.

    The colors are ``0, interval, 2*interval, ...`` with
    ``interval = int(255**3 / n)``, so the first color is always '#000000'.

    Args:
        n: number of colors to generate.

    Returns:
        A list of ``n`` lowercase hex color strings; empty when ``n <= 0``.
    """
    max_value = 16581375 #255**3
    if n <= 0:
        return []
    interval = int(max_value / n)
    # Index-based generation guarantees n entries. Stepping a range up to
    # max_value yields an extra color whenever max_value is not divisible
    # by n.
    return ["#{:06x}".format(step * interval) for step in range(n)]

# Define colors for classes (one per class in the stacked bar plot)
# NOTE(review): 27 is hard-coded here while n_classes is computed from the
# labels further down - confirm both agree for the loaded dataset.
colors_classes = get_spaced_colors(27)

## Diagnoses' distribution across local nodes

In [17]:
# Define number of classes (distinct values in labels); read as a global by the slider functions
n_classes = len(np.unique(labels))

def update(val):
    """Set the slider's value text to the natural log of its current value.

    NOTE(review): `slider` is not defined anywhere in the visible source and
    `val` is unused; this looks like a leftover matplotlib-Slider callback -
    confirm whether it is still needed.
    """
    amp = np.log(slider.val)
    slider.valtext.set_text(amp)

def slider_stacked_distro(Alpha, Local_Nodes):
    """Plot a stacked bar chart of simulated class shares per local node.

    One Dirichlet sample with concentration ``Alpha`` for each of the
    ``n_classes`` classes is drawn per node (seeded with the global
    ``random_state``), and the square root of ``jsd`` over the samples is
    shown on the figure as the Jensen-Shannon distance.

    Args:
        Alpha: Dirichlet concentration value used for every class.
        Local_Nodes: number of nodes (rows) to simulate.

    Returns:
        An empty tuple.
    """
    # One probability vector per node
    concentration = [Alpha] * n_classes
    node_mix = np.array(dirichlet.rvs(concentration, size = Local_Nodes, random_state = random_state))

    # Distance = square root of the divergence
    js_distance = np.sqrt(jsd(node_mix))

    # Percentages per node, with a 1-based 'Local Node' column in front
    class_names = list(np.unique(labels))
    shares = pd.DataFrame(node_mix).reset_index() * 100
    shares['index'] = (shares['index'] / 100 + 1).astype(int)
    shares.columns = ['Local Node'] + class_names

    # Stacked bars, legend to the right of the axes
    shares.plot(x = 'Local Node', kind = 'bar', stacked = True, color = colors_classes,
                figsize = (15,7), fontsize = 20, rot = 0, ylim = (0,110))
    plt.legend(loc = 'center left', bbox_to_anchor = (1.0, 0.5), fontsize = 12)
    plt.xlabel('Local Node', fontsize = 20)
    plt.ylabel('Participation (%)', fontsize = 20)
    plt.title("Diagnoses' distribution across local nodes", fontsize = 25)

    badge = "Jensen-Shannon dist. = "+str(round(js_distance,2))
    plt.text(-0.3, 103.5, badge, fontsize = 20, backgroundcolor = colors[2], color = colors[6])
    plt.show()
    return()
    
# Earlier slider configurations, kept for reference:
# interact(slider_stacked_distro, Alpha=(0.001,1000,0.001), Local_Nodes=(1,10,1))
# interact(slider_stacked_distro, Alpha=FloatSlider(min = 0.01, max=100, step=0.01, value=100, layout=Layout(width='1000px')), Local_Nodes=IntSlider(min = 1, max=10, step=1, value=4, layout=Layout(width='1000px')))
# Interactive widget: logarithmic slider for Alpha (min/max are given as exponents) and an integer slider for the number of nodes
interact(slider_stacked_distro, Alpha = FloatLogSlider(min = -3, max=3, value = 1000, layout = Layout(width='1000px'), readout_format='.4')
                              , Local_Nodes = IntSlider(min = 1, max=10, step = 1, value  = 4, layout = Layout(width='1000px')))


interactive(children=(FloatLogSlider(value=1000.0, description='Alpha', layout=Layout(width='1000px'), max=3.0…

<function __main__.slider_stacked_distro(Alpha, Local_Nodes)>

## \# of recordings across Diagnoses and local nodes

In [18]:
def slider_scatter_distro(Alpha, Local_Nodes):
    """Plot a bubble chart of simulated class shares per local node.

    One Dirichlet sample with concentration ``Alpha`` for each of the
    ``n_classes`` classes is drawn per node (seeded with the global
    ``random_state``). Each (node, class) pair becomes a marker whose size
    is proportional to the class share times a per-node data size derived
    from ``len(features)``.

    Args:
        Alpha: Dirichlet concentration value used for every class.
        Local_Nodes: number of nodes (rows) to simulate.

    Returns:
        An empty tuple.
    """
    # One probability vector per node
    concentration = [Alpha] * n_classes
    node_mix = np.array(dirichlet.rvs(concentration, size = Local_Nodes, random_state = random_state))

    # Distance = square root of the divergence
    js_distance = np.sqrt(jsd(node_mix))

    # Wide table: percentages per node, with a 1-based 'Local Node' column
    shares = pd.DataFrame(node_mix).reset_index() * 100
    shares['index'] = (shares['index'] / 100 + 1).astype(int)
    shares.columns = ['Local Node'] + list(np.unique(labels))

    # Long table: one row per (node, class), sorted descending
    class_columns = list(shares.columns[shares.columns != 'Local Node'])
    long_shares = pd.melt(shares, id_vars = 'Local Node', value_vars = class_columns)
    long_shares.sort_values(by = ['variable','Local Node'], ascending = False, inplace = True)

    # Define the size of the data (with a final correction of 0.4 just to see better the plot)
    data_size = (int(len(features)*0.9) // Local_Nodes) * 0.4
    marker_sizes = (long_shares['value'] / 100) * data_size

    long_shares.plot.scatter(x = 'Local Node', y = 'variable', s = marker_sizes, figsize = (15,8), fontsize = 18,
                             xlim = (0.5,Local_Nodes + 0.5), ylim = (-2,29), color = colors[0])
    plt.xlabel('Local Node', fontsize = 20)
    plt.ylabel('Diagnoses', fontsize = 20)
    plt.title("# of recordings across Diagnoses and local nodes", fontsize = 25)

    badge = "Jensen-Shannon dist. = "+str(round(js_distance,2))
    plt.text(0.6, 27.2, badge, fontsize = 20, backgroundcolor = colors[2], color = colors[6])
    # One x tick per node
    plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(1))
    plt.show()
    return()
    
# Interactive widget: logarithmic slider for Alpha (min/max are given as exponents) and an integer slider for the number of nodes
interact(slider_scatter_distro, Alpha = FloatLogSlider(min = -3, max=3, value = 1000, layout = Layout(width='1000px'), readout_format='.4')
                              , Local_Nodes = IntSlider(min = 1, max=10, step = 1, value  = 4, layout = Layout(width='1000px')))


interactive(children=(FloatLogSlider(value=1000.0, description='Alpha', layout=Layout(width='1000px'), max=3.0…

<function __main__.slider_scatter_distro(Alpha, Local_Nodes)>

## Distributions separated per node

In [19]:
def slider_divided_distro(Alpha, Local_Nodes):
    """Plot one horizontal bar chart of simulated class shares per node.

    One Dirichlet sample with concentration ``Alpha`` for each of the
    ``n_classes`` classes is drawn per node (seeded with the global
    ``random_state``). The nodes are drawn side by side in a single row of
    subplots that share the same x-axis limit.

    Args:
        Alpha: Dirichlet concentration value used for every class.
        Local_Nodes: number of nodes (subplots) to simulate.

    Returns:
        An empty tuple.
    """
    # One probability vector per node
    concentration = [Alpha] * n_classes
    node_mix = np.array(dirichlet.rvs(concentration, size = Local_Nodes, random_state = random_state))

    # Distance = square root of the divergence (computed but not displayed here)
    JS_dist = np.sqrt(jsd(node_mix))

    # Wide table: percentages per node, with a 1-based 'Local Node' column
    shares = pd.DataFrame(node_mix).reset_index() * 100
    shares['index'] = (shares['index'] / 100 + 1).astype(int)
    shares.columns = ['Local Node'] + list(np.unique(labels))

    # Long table: one row per (node, class), sorted descending
    class_columns = list(shares.columns[shares.columns != 'Local Node'])
    long_shares = pd.melt(shares, id_vars = 'Local Node', value_vars = class_columns)
    long_shares.sort_values(by = ['variable','Local Node'], ascending = False, inplace = True)
    # Per-node data size (not used by this plot)
    data_size = (int(len(features)*0.9) // Local_Nodes) *0.4

    # One wide figure with a subplot per node
    f, axs = plt.subplots(1, Local_Nodes, figsize = (70,20))

    # Common x-axis limit so the panels are comparable
    x_upper = max(long_shares['value']) + 1

    for node_id in range(1, Local_Nodes + 1):
        node_rows = long_shares[long_shares['Local Node'] == node_id]

        plt.subplot(1, Local_Nodes, node_id)
        plt.barh(node_rows.variable, node_rows['value'], alpha = 1, color = colors[0])
        plt.xlabel('Participation (%)', fontsize = 60)
        # Only the left-most panel gets a y-axis label
        if node_id == 1:
            plt.ylabel('Diagnoses', fontsize = 60)
        plt.title("Local node " +str(node_id), fontsize = 60)
        plt.xticks(fontsize = 30)
        plt.yticks(fontsize = 30)
        plt.xlim([0, x_upper])
    return()
    
# Interactive widget: logarithmic slider for Alpha (min/max are given as exponents) and an integer slider for the number of nodes
interact(slider_divided_distro, Alpha = FloatLogSlider(min = -3, max=3, value = 1000, layout = Layout(width='1000px'), readout_format='.4')
                              , Local_Nodes = IntSlider(min = 1, max=10, step = 1, value  = 4, layout = Layout(width='1000px')))


interactive(children=(FloatLogSlider(value=1000.0, description='Alpha', layout=Layout(width='1000px'), max=3.0…

<function __main__.slider_divided_distro(Alpha, Local_Nodes)>