# Importing packages

In [1]:
from models import MulticlassLogReg
import tensorflow_federated
from utils_federated.datasets import stackoverflow_tag_prediction
import nest_asyncio
import numpy as np
from numpy.random import default_rng
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from prep_data import DATASET_PATH
import pickle

# Downloading data & helper functions

In [2]:
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the federated dataset helpers run
# their own event loop inside the notebook's loop — confirm.
nest_asyncio.apply()

In [3]:
# Load the federated train / test splits through the project helper, asking
# for training batches of 500 examples per client.
# NOTE(review): the exact batching semantics live in utils_federated — confirm.
train_fl, test_fl = stackoverflow_tag_prediction.get_federated_datasets(train_client_batch_size=500)

In [4]:
def retrieve_train_data_for_client(client_id, n=500, d_x = 10000, d_y =500):
    """Return one client's training features and labels as NumPy arrays.

    The client's dataset is read from the module-level ``train_fl``. When the
    dataset yields several batches, only the LAST batch is returned and the
    earlier ones are discarded, so the result can have fewer rows than the
    batch size. Callers that need a fixed row count must check
    ``X.shape[0]`` themselves.

    Parameters
    ----------
    client_id
        Identifier passed to ``train_fl.create_tf_dataset_for_client``.
    n : int, optional
        Unused; kept only so existing calls that pass it keep working.
    d_x : int, optional
        Column count of the empty feature array returned for a client
        whose dataset yields no batches.
    d_y : int, optional
        Column count of the empty label array returned in that same case.

    Returns
    -------
    tuple
        ``(X, y)``: the feature and label arrays of the last batch, or
        zero-row arrays of shape ``(0, d_x)`` and ``(0, d_y)`` when the
        client has no data.
    """
    one_client_data = train_fl.create_tf_dataset_for_client(client_id)

    # Walk the dataset keeping only a reference to the most recent batch, so
    # the tensor -> NumPy conversion happens once instead of once per batch.
    last_batch = None
    for last_batch in one_client_data:
        pass

    if last_batch is None:
        # No data at all: return zero-row arrays rather than an uninitialised
        # (n, d_x) buffer that would look like real samples to the caller.
        return np.empty(shape=(0, d_x)), np.empty(shape=(0, d_y))

    return last_batch[0].numpy(), last_batch[1].numpy()

# Clustered data

## One cluster

In [5]:
# Number of clients to pool for the one-cluster experiment; the selection
# loop below stops as soon as this many clients have been accepted.
client_num = 50

In [6]:
# Experiment tag embedded in the name of every file this section writes.
exp_keyword = 'SO_LR_one_cluster_{}'.format(client_num)

In [7]:
# Per-client split sizes; together they must account for all 500 rows of a
# client's batch.
n_train_samples, n_val_samples, n_test_samples = 90, 110, 300
assert sum((n_train_samples, n_test_samples, n_val_samples)) == 500

# Pooled training buffers: one slab of n_train_samples rows per client.
# They are uninitialised until the selection loop fills them.
X = np.empty((client_num * n_train_samples, 10000))
y = np.empty((client_num * n_train_samples, 500))

In [8]:
# State for the one-cluster selection loop.
participating_clients = []
# Fixed seed so the per-client train/val/test shuffles are reproducible
# across kernel restarts.
rng = default_rng(0)
cluster = 3
counter = 0
max_labels_used = 0

# SECURITY: pickle.load can execute arbitrary code. Only load cluster files
# that were produced by this project.
with open('datasets/clients_cluster_{}.data'.format(cluster), 'rb') as file:
    selected_clients = pickle.load(file)

# Placeholders for the per-client values the selection loop assigns.
X_ = None
y_ = None
train_indices = None
val_indices = None
test_indices = None

In [9]:
# Build the pooled training set for the cluster and write every accepted
# client's validation / test splits to disk.
samples_per_client = 500  # rows a client's batch must have to be accepted
num_labels = 500          # expected number of label columns

for client_id in selected_clients:
    if counter == client_num:
        break
    X_, y_ = retrieve_train_data_for_client(client_id)
    # Skip clients whose returned batch is not exactly full-sized.
    if X_.shape[0] != samples_per_client:
        continue

    # Per-label totals, computed once and reused below.
    y_sum = y_.sum(axis=0)
    assert y_sum.shape[0] == num_labels

    label_ids = np.flatnonzero(y_sum > 0)
    participating_labels = label_ids.tolist()
    max_labels_used = max(max_labels_used, len(participating_labels))
    participating_clients.append(client_id)

    # For each label the client uses, take the first row equal to 1 in that
    # column. argmax returns the first True; columns with no exact 1 are
    # masked out, since argmax would report row 0 for them.
    is_hit = y_[:, label_ids] == 1.
    first_hit = is_hit.argmax(axis=0)
    unique_datapts = set(first_hit[is_hit.any(axis=0)].tolist())

    # The representative rows go first so they land in the training slice;
    # the remaining rows follow in random order.
    # NOTE: if there are more representatives than n_train_samples, the
    # surplus spills into the validation slice.
    all_indices = set(range(samples_per_client))
    rest_inds = list(all_indices - unique_datapts)
    rng.shuffle(rest_inds)
    all_indices = list(unique_datapts) + rest_inds

    val_end = n_train_samples + n_val_samples
    train_indices = all_indices[:n_train_samples]
    val_indices = all_indices[n_train_samples:val_end]
    test_indices = all_indices[val_end:]

    # Copy this client's training rows into its slab of the pooled buffers.
    slab = slice(counter * n_train_samples, (counter + 1) * n_train_samples)
    X[slab, :] = X_[train_indices, :]
    y[slab, :] = y_[train_indices, :]

    # Validation and test splits are stored per client, one file per array.
    for split_name, split_indices in (('val', val_indices), ('test', test_indices)):
        for array_name, array in (('X', X_), ('y', y_)):
            split_path = 'datasets/{}_{}{}_{}.npy'.format(
                exp_keyword, split_name, client_id, array_name)
            with open(split_path, 'wb') as split_file:
                np.save(split_file, array[split_indices, :])
    counter += 1
print('# clients added {}, # clients required {}'.format(counter, client_num))

# clients added 50, # clients required 50


In [10]:
# Save only the rows the selection loop filled. X and y were allocated with
# np.empty for client_num clients, so if fewer were accepted the tail rows
# hold uninitialised memory and must not be written out as data.
n_filled = counter * n_train_samples
with open('datasets/{}.npz'.format(exp_keyword), 'wb') as file:
    np.savez(file, X=X[:n_filled], y=y[:n_filled])

In [11]:
# Record which clients were pooled, pickled next to the other dataset files.
clients_list_path = DATASET_PATH + 'list_clients_{}.data'.format(exp_keyword)
with open(clients_list_path, 'wb') as clients_file:
    pickle.dump(participating_clients, clients_file)

## Two client datasets

In [12]:
# Client IDs that appear in both the train and the test split, sorted.
common_clients = sorted(set(train_fl.client_ids) & set(test_fl.client_ids))

In [13]:
# Start a fresh list for the two-client experiment. This rebinds the name
# used by the one-cluster section above.
participating_clients = []

In [14]:
# Scan at most the first 1000 common clients and keep those that have a
# 500-row training batch and a test batch of more than 400 rows. Slicing
# (rather than indexing over range(1000)) keeps this safe when fewer than
# 1000 common clients exist.
# NOTE: a client qualifies if ANY training batch has 500 rows, whereas
# retrieve_train_data_for_client returns the LAST batch, which may be smaller.
for client_id in common_clients[:1000]:
    train_d = train_fl.create_tf_dataset_for_client(client_id)
    test_d = test_fl.create_tf_dataset_for_client(client_id)
    # any() stops at the first matching batch, as the explicit break did.
    tr_flag = any(d[0].shape[0] == 500 for d in train_d)
    te_flag = tr_flag and any(d[0].shape[0] > 400 for d in test_d)
    if tr_flag and te_flag:
        participating_clients.append(client_id)
    # Stop once more than 10 (i.e. 11) candidates have been collected.
    if len(participating_clients) > 10:
        break

In [15]:
# Display the candidate client IDs collected by the scan.
participating_clients

['00000267',
 '00000459',
 '00000476',
 '00001288',
 '00001337',
 '00001338',
 '00001831',
 '00001968',
 '00002988',
 '00003043',
 '00003501']

### First client

In [9]:
# Training batch returned for the first candidate client.
X_, y_ = retrieve_train_data_for_client(participating_clients[0])

In [10]:
# Column sums of the label matrix: one total per label.
# NOTE(review): presumably y_ is multi-hot, making these per-label counts — confirm.
label_dist = y_.sum(axis=0)

In [11]:
# Indices of the ten labels with the largest totals, largest first. The sort
# is stable, so ties keep ascending index order.
ranked_labels = sorted(enumerate(label_dist), key=lambda pair: pair[1], reverse=True)
[label for label, _ in ranked_labels[:10]]

[9, 35, 48, 19, 157, 193, 83, 253, 26, 303]

### Second client

In [12]:
# Training batch returned for the second candidate client.
X_, y_ = retrieve_train_data_for_client(participating_clients[1])

In [13]:
# Column sums of the label matrix: one total per label.
# NOTE(review): presumably y_ is multi-hot, making these per-label counts — confirm.
label_dist = y_.sum(axis=0)

In [14]:
# Indices of the ten labels with the largest totals, largest first. The sort
# is stable, so ties keep ascending index order.
ranked_labels = sorted(enumerate(label_dist), key=lambda pair: pair[1], reverse=True)
[label for label, _ in ranked_labels[:10]]

[230, 2, 13, 12, 63, 0, 81, 434, 472, 10]

### Merging datasets

In [15]:
# Pooled buffers for the two-client experiment: 2 clients x 500 rows each.
# They are uninitialised until the merge loop fills them.
X, y = np.empty((1000, 10000)), np.empty((1000, 500))

In [16]:
# Stack the first two candidates' training batches into the pooled buffers,
# one 500-row slab per client.
for i in range(2):
    curr_client_id = participating_clients[i]
    rows = slice(500 * i, 500 * (i + 1))
    X_, y_ = retrieve_train_data_for_client(curr_client_id)
    X[rows, :] = X_
    y[rows, :] = y_

In [17]:
# Write the pooled two-client training set as a single .npz archive.
with open('datasets/SO_LR_two_workers_100.npz', 'wb') as npz_file:
    np.savez(npz_file, X=X, y=y)

In [18]:
# Pickle the IDs of the two clients whose data was merged.
merged_clients = participating_clients[:2]
with open(DATASET_PATH + 'list_clients_SO_LR_two_workers_100.data', 'wb') as clients_file:
    pickle.dump(merged_clients, clients_file)

In [19]:
# For every candidate client, carve a test split and a validation split out
# of one sufficiently large batch of its test data, and save both to disk.
TEST_DATA_SIZE = 300
VAL_DATA_SIZE = 100
for client_id in participating_clients:
    test_data = test_fl.create_tf_dataset_for_client(client_id)
    rows_needed = TEST_DATA_SIZE + VAL_DATA_SIZE

    # Keep the LAST batch that has enough rows; smaller batches are ignored.
    chosen_batch = None
    for element in test_data:
        if element[0].shape[0] >= rows_needed:
            chosen_batch = element

    if chosen_batch is None:
        raise RuntimeError(
            'client {} has no test batch with at least {} rows'.format(client_id, rows_needed))

    # Convert each tensor once, then slice: the first TEST_DATA_SIZE rows are
    # the test split and the next VAL_DATA_SIZE rows are the validation split.
    features = chosen_batch[0].numpy()
    labels = chosen_batch[1].numpy()
    X_test = features[:TEST_DATA_SIZE, :]
    y_test = labels[:TEST_DATA_SIZE, :]
    X_val = features[TEST_DATA_SIZE:rows_needed, :]
    y_val = labels[TEST_DATA_SIZE:rows_needed, :]

    # One .npy file per (split, array) pair.
    outputs = (
        ('test', 'X', X_test),
        ('test', 'y', y_test),
        ('val', 'X', X_val),
        ('val', 'y', y_val),
    )
    for split_name, array_name, array in outputs:
        out_path = 'datasets/SO_LR_two_workers_100_{}{}_{}.npy'.format(
            split_name, client_id, array_name)
        with open(out_path, 'wb') as out_file:
            np.save(out_file, array)