**Loading Data sets and shuffling**

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture

In [None]:
# Load the digits data set as a feature matrix (X1) and a label vector (y1).
X1, y1 = load_digits(return_X_y=True)

In [None]:
# One index per sample in X1, in the original order: 0, 1, ..., n_samples - 1.
ind_arr = np.arange(len(X1))
print(ind_arr)

[   0    1    2 ... 1794 1795 1796]


In [None]:
# Shuffle the sample indices reproducibly.
# A seeded Generator gives the same order on every fresh run, and rebuilding the
# permutation from the index count (instead of shuffling in place) makes this
# cell idempotent: re-running it does not shuffle an already shuffled array.
ind_arr = np.random.default_rng(0).permutation(ind_arr.size)

In [None]:
# Show the index order after shuffling.
print(ind_arr)

[ 274  709  251 ... 1096  406  268]


In [None]:
# Reorder features and labels with the same shuffled indices so rows stay aligned.
X, y = X1[ind_arr], y1[ind_arr]

In [None]:
# Optional sanity check on the shapes of the shuffled arrays.
print(X.shape, y.shape)

(1797, 64) (1797,)


In [None]:
# Show the shuffled labels.
print(y)

[8 3 9 ... 9 0 2]


In [None]:
# ind_digits[d] holds the positions in y (and therefore the rows of X)
# whose label equals digit d.
ind_digits = [np.where(y == d)[0] for d in range(10)]

# Create a central data set

In [None]:
# Number of samples taken from each digit for the shared central data set;
# the remaining samples of each digit stay on that digit's local device.
n_pts = 5

In [None]:
# Flat list of row indices: the first n_pts occurrences of every digit 0-9.
dummy_list = [idx for d in range(10) for idx in ind_digits[d][:n_pts]]

# Central data set = those rows of the shuffled data.
X_central, y_central = X[dummy_list], y[dummy_list]

In [None]:
# Confirm the central set is an ndarray and show its shape.
print(type(X_central), X_central.shape, sep="\n")

<class 'numpy.ndarray'>
(50, 64)


# Create a centralized model

In [None]:
# Baseline model trained only on the small shared central data set.
# random_state is fixed so the forest (and every score derived from it)
# is reproducible across kernel restarts.
clf_central = RandomForestClassifier(random_state=0)
clf_central.fit(X_central,y_central)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Create digitwise local data sets

In [None]:
# local_Xy[d] is the (features, labels) pair held by device d:
# every sample of digit d that was not moved to the central set,
# followed by a copy of the whole central set.
local_Xy = []

for digit in range(10):
  remaining_idx = ind_digits[digit][n_pts:]

  X_cat = np.concatenate((X[remaining_idx], X_central))
  y_cat = np.concatenate((y[remaining_idx], y_central))

  local_Xy.append((X_cat, y_cat))

In [None]:
# practice: indexing an array with a list of row indices selects those rows
# X = np.array([[0, 1, 2], [3, 4, 5], [3, 4, 5]])
# y = [1,2]
# X[y]

# Train digitwise local models

In [None]:
# One classifier per device, each trained on that device's local data set.
# random_state is fixed so the models and their scores are reproducible.
clf_list = []
for local_X, local_y in local_Xy:
  clf = RandomForestClassifier(random_state=0)
  clf.fit(local_X, local_y)
  clf_list.append(clf)

# Centralized model performance over local data sets

In [None]:
# Mean accuracy of the central model on each device's data set.
for i, (local_X, local_y) in enumerate(local_Xy):
  score = clf_central.score(local_X, local_y)
  print(i, score)

0 0.9955156950672646
1 0.7004405286343612
2 0.8243243243243243
3 0.8464912280701754
4 0.915929203539823
5 0.9118942731277533
6 0.9778761061946902
7 0.8839285714285714
8 0.863013698630137
9 0.7422222222222222


# Performance of local models on each other's data sets

In [None]:
# score_mat[i][j] will hold the score of local model i on device j's data set.
score_mat = np.zeros(shape=(10, 10), dtype=float)

In [None]:
# Row i = local model i, column j = device j's data set; scores rounded to 1 decimal.
for i, model in enumerate(clf_list):
  for j, (local_X, local_y) in enumerate(local_Xy):
    score_mat[i, j] = np.round(model.score(local_X, local_y), 1)

In [None]:
# Show the cross-device score matrix.
print(score_mat)

[[1.  0.7 0.8 0.9 0.8 0.8 0.9 1.  0.8 0.7]
 [0.9 1.  0.6 0.7 0.6 0.6 0.8 0.8 0.3 0.6]
 [1.  0.5 1.  0.6 1.  0.8 0.9 0.9 0.3 0.6]
 [0.9 0.5 0.4 1.  0.9 0.7 0.9 0.5 0.3 0.4]
 [0.7 0.4 0.9 0.8 1.  0.8 0.9 0.6 0.5 0.7]
 [0.8 0.6 0.6 0.7 0.7 1.  0.9 0.7 0.5 0.5]
 [0.9 0.7 0.7 0.8 0.7 0.9 1.  0.9 0.7 0.7]
 [0.9 0.7 0.9 0.8 0.6 0.5 1.  1.  0.5 0.7]
 [1.  0.3 0.3 0.5 0.8 0.6 0.9 0.7 1.  0.5]
 [0.9 0.5 0.7 0.5 0.8 0.6 1.  0.5 0.4 1. ]]


# Create GMMs for each local device (as proxy for its private data)

In [None]:
# Number of mixture components used for every device's GMM.
n_comp = 6

# gmm_list[i] is a generative proxy for the digit-i data held by device i.
gmm_list = []

for i in range(10) :

  local_X, local_y = local_Xy[i]

  # Fit on the digit-i rows only. local_X also contains the shared central
  # samples of all ten digits; fitting on those would let the GMM generate
  # other digits, while every sample drawn from gmm_list[i] is later
  # labelled as digit i.
  digit_X = local_X[local_y == i]

  # Fixed random_state makes both the fit and later sample() calls reproducible.
  gmm_i = GaussianMixture(n_components=n_comp, random_state=0)
  gmm_i.fit(digit_X)

  gmm_list.append(gmm_i)
  

# Send local GMMs to Server & Do sampling at Server

In [None]:
# Number of synthetic samples drawn from each device's GMM.
n_samp = 20

syn_X_list = []
syn_y_list = []

# NOTE(review): every draw from gmm_list[d] is labelled d; that is only accurate
# if the GMM was fitted on digit-d samples alone — confirm against the fitting cell.
for digit, gmm in enumerate(gmm_list):
  syn_X_list.append(gmm.sample(n_samp)[0])   # sample() returns (X, component_ids)
  syn_y_list.append(np.full(n_samp, digit))

# Stack the per-device draws into one synthetic data set.
syn_X = np.concatenate(syn_X_list)
syn_y = np.concatenate(syn_y_list)

print(syn_X.shape, syn_y.shape)

(200, 64) (200,)


# Create updated centralized data set

In [None]:
# Updated central set = original central samples followed by the synthetic ones.
X_central_updated = np.concatenate([X_central, syn_X], axis=0)
y_central_updated = np.concatenate([y_central, syn_y], axis=0)
print(X_central_updated.shape, y_central_updated.shape)

(250, 64) (250,)


# Create updated centralized model

In [None]:
# Central model retrained on the central samples plus the synthetic samples.
# random_state is fixed so the comparison with the baseline is reproducible.
clf_central_updated = RandomForestClassifier(random_state=0)
clf_central_updated.fit(X_central_updated,y_central_updated)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Evaluate performance of updated centralized model on local data sets

In [None]:
# Accuracy (rounded to 2 decimals) of the updated central model on each
# device's data set. Only the new score is reported, so the old model is not
# re-scored here.
print ('digit','new score')
for i in range(10) :
  local_X, local_y = local_Xy[i]
  score_updated = np.round(clf_central_updated.score(local_X,local_y),2)
  print (i, "   " ,score_updated)

digit new score
0     0.99
1     0.97
2     0.92
3     0.94
4     0.94
5     0.94
6     0.98
7     0.97
8     0.81
9     0.85


# Build updated local models

In [None]:
# Retrain every local model on its own data plus the updated central data set.
clf_list_updated = []

for i in range(10) :
  local_X, local_y = local_Xy[i]
  # Use dedicated names for the augmented training set: reusing X1 / y1 here
  # would overwrite the raw digits data loaded at the top of the notebook and
  # break any re-run of the cells that read it.
  # NOTE(review): local_X already ends with X_central (appended when local_Xy
  # was built), so the original central rows appear twice in X_aug — confirm
  # whether that double weighting is intended.
  X_aug = np.concatenate((local_X,X_central_updated))
  y_aug = np.concatenate((local_y,y_central_updated))
  # Fixed random_state keeps the updated models reproducible.
  clf = RandomForestClassifier(random_state=0)
  clf.fit(X_aug,y_aug)
  clf_list_updated.append(clf)

# Evaluate performance of updated local models on each other's data sets

In [None]:
# Keep the pre-update scores for comparison; score_mat is overwritten below.
score_mat_old = score_mat.copy()

# Row i = updated local model i, column j = device j's data set (1-decimal scores).
for i, model in enumerate(clf_list_updated):
  for j, (local_X, local_y) in enumerate(local_Xy):
    score_mat[i, j] = np.round(model.score(local_X, local_y), 1)

print(score_mat_old)
print(score_mat)

[[1.  0.7 0.8 0.9 0.8 0.8 0.9 1.  0.8 0.7]
 [0.9 1.  0.6 0.7 0.6 0.6 0.8 0.8 0.3 0.6]
 [1.  0.5 1.  0.6 1.  0.8 0.9 0.9 0.3 0.6]
 [0.9 0.5 0.4 1.  0.9 0.7 0.9 0.5 0.3 0.4]
 [0.7 0.4 0.9 0.8 1.  0.8 0.9 0.6 0.5 0.7]
 [0.8 0.6 0.6 0.7 0.7 1.  0.9 0.7 0.5 0.5]
 [0.9 0.7 0.7 0.8 0.7 0.9 1.  0.9 0.7 0.7]
 [0.9 0.7 0.9 0.8 0.6 0.5 1.  1.  0.5 0.7]
 [1.  0.3 0.3 0.5 0.8 0.6 0.9 0.7 1.  0.5]
 [0.9 0.5 0.7 0.5 0.8 0.6 1.  0.5 0.4 1. ]]
[[1.  0.9 0.9 0.9 0.9 0.9 1.  1.  0.9 0.9]
 [1.  1.  0.8 0.9 0.8 0.9 0.9 0.9 0.6 0.8]
 [1.  0.7 1.  0.8 1.  0.9 1.  0.9 0.5 0.8]
 [1.  0.8 0.7 1.  1.  0.8 1.  0.9 0.6 0.6]
 [0.9 0.8 0.9 0.9 1.  0.9 0.9 0.9 0.7 0.8]
 [1.  0.9 0.9 0.9 0.9 1.  1.  0.9 0.7 0.7]
 [1.  0.8 0.9 0.9 0.8 0.9 1.  1.  0.8 0.9]
 [1.  0.9 0.9 0.9 0.8 0.8 1.  1.  0.7 0.8]
 [1.  0.5 0.6 0.7 0.9 0.9 1.  0.9 1.  0.7]
 [1.  0.8 0.9 0.6 0.9 0.8 1.  0.8 0.7 1. ]]


In [None]:
# (row, column) positions where an updated local model scored lower than the
# original one; two empty arrays mean no score got worse. The comparison is
# passed directly (not wrapped in a list) so no spurious leading axis is added.
np.where(score_mat_old > score_mat)

(array([], dtype=int64), array([], dtype=int64), array([], dtype=int64))

# Conclusion
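- With the addition of the updated centralized data (the original central samples plus the GMM-generated synthetic samples), the local models have been enriched: no local model scores lower on any device's data set than before (scores rounded to one decimal), and many cross-device scores improve.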