In [21]:
import os
import re

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical

In [22]:
def unpickle(file):
    """Load a pickled file and return the object stored in it.

    The file is opened in binary mode and unpickled with
    ``encoding='bytes'``, so 8-bit strings pickled by Python 2 are returned
    as ``bytes`` objects (the cells below index the result with keys such as
    ``b'data'`` and ``b'labels'``).

    Args:
        file: path of the pickle file to read.

    Returns:
        Whatever object was pickled into ``file``.

    Note:
        ``pickle.load`` can execute arbitrary code while loading; only call
        this on files from a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [23]:
# training dataset preparation

path = "/home/fedlearn/Downloads/cifar-10-python/cifar-10-batches-py/"
# The working directory is changed on purpose: a later cell opens 'res.txt'
# by relative path, which resolves against the current directory.
os.chdir(path)

trainX = []
trainy = []

# os.listdir() returns names in arbitrary order. Sorting makes the batches
# concatenate in file-name order, so the row order of the training set (and
# therefore the per-client split built from it) is the same on every run.
for file in sorted(os.listdir()):
    if "data_batch" in file:
        file_name = path + file
        data_batch = unpickle(file_name)

        # Each row is one flattened image: view it as (N, 3, 32, 32) and then
        # move the channel axis last to get (N, 32, 32, 3).
        data_X = data_batch[b'data']
        data_X = data_X.reshape(len(data_X), 3, 32, 32).transpose(0, 2, 3, 1)
        trainX.append(data_X)

        # Labels as an (N, 1) column; -1 infers N instead of hard-coding 10000.
        data_y = np.array(data_batch[b'labels'])
        trainy.append(data_y.reshape(-1, 1))

# Concatenate every batch that was found rather than five explicit indices.
train_images = np.concatenate(trainX, axis=0)
train_labels = np.concatenate(trainy, axis=0)

print(train_images.shape, train_labels.shape)
print(type(train_images), type(train_labels))

(50000, 32, 32, 3) (50000, 1)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [24]:
# testing dataset preparation

file_name = "/home/fedlearn/Downloads/cifar-10-python/cifar-10-batches-py/test_batch"
data_batch = unpickle(file_name)

# Flat pixel rows -> (N, 3, 32, 32), then channels moved to the last axis.
flat_pixels = data_batch[b'data']
data_X = np.transpose(
    flat_pixels.reshape(len(flat_pixels), 3, 32, 32),
    (0, 2, 3, 1),
)

# Labels as a (10000, 1) column vector.
data_y = np.reshape(np.array(data_batch[b'labels']), (10000, 1))

test_images = data_X
test_labels = data_y

print(test_images.shape, test_labels.shape)
print(type(test_images), type(test_labels))

(10000, 32, 32, 3) (10000, 1)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [25]:
# Required scaling: divide the pixel values by 255.
# The division is guarded on dtype so that re-running this cell does not scale
# the data a second time: once divided, the arrays are floating point and are
# left untouched.
# NOTE(review): assumes the arrays arrive here with an integer dtype (uint8
# pixels as stored in the CIFAR-10 python batches) -- confirm.
if np.issubdtype(train_images.dtype, np.integer):
    train_images = train_images / 255.0
if np.issubdtype(test_images.dtype, np.integer):
    test_images = test_images / 255.0

In [26]:
def getModel():
    """Build and return the (uncompiled) convolutional classifier.

    Architecture, in order:
      * Conv2D(32, 3x3, relu) on 32x32x3 inputs, then 2x2 max-pooling
      * Conv2D(64, 3x3, relu), then 2x2 max-pooling
      * Conv2D(64, 3x3, relu)
      * Flatten -> Dense(64, relu) -> Dense(10)

    The final Dense layer has no activation, so the model outputs raw
    scores rather than probabilities.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    return models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10),
    ])

In [27]:
# Instantiate the CNN built by getModel(); it is not compiled in this cell.
model = getModel()

In [28]:
# model.compile(optimizer='adam',loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])

# history = model.fit(train_images, train_labels, epochs=10, validation_data=(test_images, test_labels))

In [29]:
def get_iid_dataset(trainX, trainY, client_count=100, samples_per_client=500):
  """Split the training set into equal, consecutive per-client shards.

  Client ``k`` receives rows
  ``[k * samples_per_client, (k + 1) * samples_per_client)`` of both arrays.
  No shuffling is done here, so the shards are only as "iid" as the row
  order of the inputs.

  Args:
    trainX: samples, sliced along the first axis.
    trainY: labels aligned row-for-row with ``trainX``.
    client_count: number of shards to produce (default 100, the value that
      used to be hard-coded).
    samples_per_client: rows per shard (default 500, the value that used to
      be hard-coded).

  Returns:
    A list of ``client_count`` tuples ``(local_x, local_y)``.

  Note:
    Ordinary slice semantics apply: if the inputs hold fewer than
    ``client_count * samples_per_client`` rows, the trailing shards are
    short or empty rather than an error being raised.
  """
  local_xy_iid = []
  for client in range(client_count):
    start = client * samples_per_client
    stop = start + samples_per_client
    local_xy_iid.append((trainX[start:stop], trainY[start:stop]))

  print("iid distribution of data done")
  return local_xy_iid

In [30]:
# Split the training images/labels into per-client (x, y) shards.
local_xy_iid = get_iid_dataset(train_images, train_labels)

iid distribution of data done


In [31]:
# Sanity check: shapes of the first client's image and label shards.
sample_x, sample_y = local_xy_iid[0]
print(sample_x.shape, sample_y.shape)

(500, 32, 32, 3) (500, 1)


In [32]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_samples, silhouette_score

In [33]:
# Find the optimal number of clusters in each client's local data.
# The number of clusters in each local dataset is not known in advance (unsupervised).

def minSilhouetteValue(local_X, local_y, count):
  """Fit GMMs with 2..7 components and return the best one by silhouette.

  For each candidate component count a ``GaussianMixture`` is fitted on
  ``local_X`` and its hard cluster assignments are scored with
  ``silhouette_score``. A higher silhouette coefficient means better
  separated clusters, so the model with the HIGHEST score is returned.
  Earlier code returned the lowest-scoring model, i.e. the worst clustering
  among the candidates; the function name is kept unchanged so existing
  calls keep working.

  Args:
    local_X: 2-D array of one client's samples (one row per sample).
    local_y: unused; kept so the call signature stays the same.
    count: client index, used only in the progress message.

  Returns:
    The fitted ``GaussianMixture`` with the highest silhouette score; on a
    tie the smallest component count wins.
  """
  best_gmm = None
  best_score = None
  for n_comp in range(2, 8):  # n_comp in [2, 3, 4, 5, 6, 7]
    gmm = GaussianMixture(n_components=n_comp, random_state=10)
    cluster_labels = gmm.fit_predict(local_X)
    score = silhouette_score(local_X, cluster_labels)
    # Strict '>' keeps the first candidate when scores tie.
    if best_score is None or score > best_score:
      best_gmm, best_score = gmm, score

  print("Local Client ", count , " : " ,best_gmm)
  return best_gmm

In [34]:
# Per-client GMM component counts; populated from 'res.txt' in the following cell.
silhou = []

In [35]:
# Load the per-client component counts from 'res.txt'.
# Every non-blank line must contain "s=<digits>" (for example the tail of
# "n_components=3"); the digits after the first such marker are kept.
# NOTE(review): presumably the lines are the "Local Client ..." messages
# printed by minSilhouetteValue -- verify against the file.

# Rebuilt from scratch so that re-running this cell does not append duplicates.
silhou = []

with open('res.txt') as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue  # ignore blank lines (e.g. a trailing newline)
        match = re.search(r"s=(\d+)", line)
        if match is None:
            # Fail here with context instead of storing a stray character
            # that only breaks later at int(silhou[i]).
            raise ValueError(
                "res.txt line {}: no 's=<digits>' marker found: {!r}".format(line_no, line)
            )
        # Stored as text, as before; consumers convert with int().
        silhou.append(match.group(1))

# Alias (not a copy) of the list, kept from the original cell.
temp = silhou

print(len(silhou))

100


In [36]:
# Running all 100 clients in one shot is not working in this kernel,
# so whatever results come out are stored
# and the run is restarted from the index where it stopped.

# num = local_xy_iid[0][0].shape[0]
# d1 = local_xy_iid[0][0].shape[1]
# d2 = local_xy_iid[0][0].shape[2]
# d3 = local_xy_iid[0][0].shape[3]

# for i in range(0,100):
#     X = local_xy_iid[i][0].reshape(num,d1*d2*d3)
#     min_sil = minSilhouetteValue(X,local_xy_iid[i][1], i )
#     silhou.append(min_sil)

In [37]:
# num = local_xy_iid[0][0].shape[0]
# d1 = local_xy_iid[0][0].shape[1]
# d2 = local_xy_iid[0][0].shape[2]
# d3 = local_xy_iid[0][0].shape[3]

# for i in range(33,100):
#     X = local_xy_iid[i][0].reshape(num,d1*d2*d3)
#     min_sil = minSilhouetteValue(X,local_xy_iid[i][1], i )
# #     silhou.append(min_sil)

In [None]:
gmm_list = []

# Fit one GMM per client for clients 0..30 (31 models).
for i in range(0, 31):
    # Number of components for the i-th client, taken from `silhou`.
    n_comp = int(silhou[i])

    # Flatten every image to one feature row. The per-image size is inferred
    # (-1) instead of hard-coding 3072.
    local_images = local_xy_iid[i][0]
    local_flat = local_images.reshape(local_images.shape[0], -1)

    # Fixed seed (same value minSilhouetteValue uses) so that re-running the
    # cell produces the same fitted models.
    gmm_i = GaussianMixture(n_components=n_comp, random_state=10)
    gmm_i.fit(X=local_flat)
    print("GMM : ", i ," Build")
    gmm_list.append(gmm_i)

In [None]:
# for i in range(31,61):
    
# #     Finding best number of cluster using Silhouette Method
# #     X = local_xy_iid[i][0].reshape(num,d1*d2*d3)
# #     min_sil = minSilhouetteValue(X,local_xy_iid[i][1], i )
    
#     # number of component for ith client
#     n_comp = int(silhou[i])
#     gmm_i = GaussianMixture(n_components=n_comp)
#     gmm_i.fit(X = local_xy_iid[i][0].reshape(-1,3072))
#     print("GMM : ", i ," Build")
#     gmm_list.append(gmm_i)

In [None]:
# gmm_list_i = []

# for i in range(40,100):
    
# #     Finding best number of cluster using Silhouette Method
# #     X = local_xy_iid[i][0].reshape(num,d1*d2*d3)
# #     min_sil = minSilhouetteValue(X,local_xy_iid[i][1], i )
    
#     # number of component for ith client
#     n_comp = int(silhou[i])
#     gmm_i = GaussianMixture(n_components=n_comp)
#     gmm_i.fit(X = local_xy_iid[i][0].reshape(-1,3072))
#     print("GMM : ", i ," Build")
#     gmm_list.append(gmm_i)

In [None]:
# Send local GMMs to Server & Do sampling at Server

def syntheticDataGen(gmm_list, n_samp=500):
  """Draw synthetic samples from every fitted GMM and one-hot label them.

  Each model in ``gmm_list`` contributes ``n_samp`` samples. Earlier code
  always read exactly the first 30 models, which raised IndexError on a
  shorter list and silently ignored any model past index 29; all models
  that are passed in are now used (slice the list to use fewer).

  Args:
    gmm_list: non-empty sequence of fitted mixture models exposing
      ``sample(n)``, which returns ``(X, component_labels)``.
    n_samp: samples drawn per model (default 500, the value that used to be
      hard-coded).

  Returns:
    ``(syn_X, syn_y)``: the stacked samples and the one-hot encoded labels
    produced by ``to_categorical``.

  Raises:
    ValueError: if ``gmm_list`` is empty.

  NOTE(review): the label given to a sample is the position of its source
  GMM in ``gmm_list``, not a dataset class id -- confirm that this is the
  intended target for the model trained on these samples.
  """
  if len(gmm_list) == 0:
    raise ValueError("syntheticDataGen needs at least one fitted GMM")

  syn_X_list = []
  syn_y_list = []

  for i, gmm in enumerate(gmm_list):
    # The component labels returned by sample() are not used.
    syn_X1, _ = gmm.sample(n_samp)
    syn_y1 = np.full(n_samp, i)
    print (syn_X1.shape,syn_y1.shape)

    syn_X_list.append(syn_X1)
    syn_y_list.append(syn_y1)

  syn_X = np.concatenate(syn_X_list)
  syn_y = to_categorical(np.concatenate(syn_y_list))
  # Report shapes rather than printing the arrays themselves.
  print (syn_X.shape, syn_y.shape)
  return syn_X, syn_y

In [None]:
# synthetic/proxy data generation
syn_X1, syn_y1 = syntheticDataGen(gmm_list)
# NOTE(review): X_central_train is not defined in any visible cell of this
# notebook -- confirm it exists (or which array was meant) before running.
# The second, identical print of this line was removed.
print(syn_X1.shape, X_central_train.shape)

In [None]:
def updateCentralModelGMM(syn_X, syn_y):
  """Train a fresh central model on synthetic data and return its accuracy.

  A new model is obtained from ``define_model()``, compiled with SGD
  (learning rate 0.01, momentum 0.9) and categorical cross-entropy, fitted
  on ``(syn_X, syn_y)`` for 100 epochs with batch size 500 while validating
  on ``(testX, testY)``, and finally evaluated on ``(testX, testY)``.

  NOTE(review): ``define_model``, ``testX`` and ``testY`` are not defined in
  the visible cells of this notebook -- confirm where they come from.

  Args:
    syn_X: synthetic training samples.
    syn_y: training targets for ``syn_X``.

  Returns:
    The accuracy value reported by ``evaluate`` on ``(testX, testY)``.
  """
  central_model = define_model()
  sgd = SGD(learning_rate=0.01, momentum=0.9)
  central_model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

  print (syn_X.shape,syn_y.shape)

  holdout = (testX, testY)
  central_model.fit(syn_X, syn_y, epochs=100, batch_size=500, validation_data=holdout)

  # evaluate() returns loss and accuracy; only the accuracy is needed.
  _loss, accuracy = central_model.evaluate(testX, testY, verbose=0)
  return accuracy

In [None]:
# Train the central model on the synthetic data and report the accuracy it returns.
new_acc_gmm = updateCentralModelGMM(syn_X1, syn_y1)
print(new_acc_gmm)