In [44]:
import random

In [36]:
import pandas as pd
import tensorflow as tf
import tensorflow_federated as tff
import numpy as np

# Load encoded_dataset.csv into the notebook-wide frame and preview the first rows.
# NOTE(review): the preview lists "Unnamed: 0.1" / "Unnamed: 0" columns, which
# look like row indices written out by earlier saves - confirm whether they
# should be dropped before the frame is used as model input.
df = pd.read_csv(filepath_or_buffer="encoded_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,education,default,balance,housing,loan,contact,campaign,pdays,...,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,58,2,0,0.032119,1,0,0,1,999,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1,44,1,0,0.000435,1,0,0,1,999,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2,33,1,0,3e-05,1,1,0,1,999,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,3,47,1,0,0.022572,1,0,0,1,999,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,4,33,1,0,1.5e-05,0,0,0,1,999,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [41]:
from sklearn.model_selection import train_test_split

# Separate the target column "y" (kept as a one-column frame) from the features.
y = df["y"].to_frame()
X = df.drop("y", axis=1)

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [37]:
# Tag every row with its own integer id (0 .. n_rows - 1) in a new "ClientID"
# column. This modifies df in place, so later cells see the extra column.
ids = np.arange(len(df))
df["ClientID"] = ids
df.head()

Unnamed: 0.1,Unnamed: 0,age,education,default,balance,housing,loan,contact,campaign,pdays,...,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,ClientID
0,0,58,2,0,0.032119,1,0,0,1,999,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
1,1,44,1,0,0.000435,1,0,0,1,999,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
2,2,33,1,0,3e-05,1,1,0,1,999,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2
3,3,47,1,0,0.022572,1,0,0,1,999,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3
4,4,33,1,0,1.5e-05,0,0,0,1,999,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4


In [82]:
def create_clients(image_list, label_list, num_clients=100, initial='clients'):
    ''' Split a labelled dataset into equally sized, contiguous client shards.

        return: a dictionary with keys clients' names and value as
                data shards - a DataFrame slice holding the feature columns
                plus a "y" column with the labels.
        args:
            image_list: pandas DataFrame of features, one row per sample
            label_list: labels for those rows (Series, single-column DataFrame
                        or equal-length sequence); pandas objects are aligned
                        with image_list on their index
            num_clients: number of federated members (clients)
            initial: the clients' name prefix, e.g. clients_1
        raises:
            ValueError: if num_clients is smaller than 1 or larger than the
                        number of rows (every client needs at least one row)

        Rows are sharded in the order given (no shuffling is done here), and
        the len(data) % num_clients trailing rows are left out so that all
        shards have the same size.
    '''
    if num_clients < 1:
        raise ValueError("num_clients must be at least 1, got {}".format(num_clients))

    # Build the combined frame from the arguments (not from notebook globals),
    # and work on a copy so the caller's frame does not grow a "y" column.
    data = image_list.copy()
    data["y"] = label_list

    # shard data and place at each client
    size = len(data) // num_clients
    if size == 0:
        raise ValueError(
            "cannot split {} rows across {} clients".format(len(data), num_clients)
        )

    # create a list of client names
    client_names = ['{}_{}'.format(initial, i + 1) for i in range(num_clients)]

    # .iloc makes the positional row slicing explicit
    shards = [data.iloc[start:start + size]
              for start in range(0, size * num_clients, size)]

    # number of clients must equal number of shards
    assert len(shards) == len(client_names)

    return dict(zip(client_names, shards))

In [86]:
# Shard the training split across 10 clients named client_1 ... client_10.
clients = create_clients(
    image_list=X_train,
    label_list=y_train,
    num_clients=10,
    initial='client',
)

45211


In [4]:
# Settings shared by the dataset-building code below.
client_id_colname = 'ClientID'
SHUFFLE_BUFFER = 1000
NUM_EPOCHS = 1

# split client id into train and test clients
client_ids = df[client_id_colname].unique()

# Half of the ids go to training. The result is a list of one-element lists
# ([[id], [id], ...]); create_tf_dataset_for_client_fn relies on that shape
# when it reads client_id[0]. The fixed seed keeps the split repeatable.
train_client_ids = (
    pd.DataFrame(client_ids)
    .sample(frac=0.5, random_state=42)
    .values
    .tolist()
)

# Membership is checked against a set of the bare ids: scanning the nested
# list for every id is quadratic and only gives the right answer because a
# NumPy scalar compared with a one-element list broadcasts to a single bool.
_train_id_lookup = {row[0] for row in train_client_ids}
test_client_ids = [x for x in client_ids if x not in _train_id_lookup]

def create_tf_dataset_for_client_fn(client_id):
  """Build the tf.data.Dataset holding one client's rows of the global df.

  Args:
    client_id: indexable whose first element is the id to match against the
      client_id_colname column of df.

  Returns:
    A dataset of per-column feature dicts, shuffled with SHUFFLE_BUFFER,
    batched one row at a time and repeated NUM_EPOCHS times.
  """
  belongs_to_client = df[client_id_colname] == client_id[0]
  # Missing values are replaced by '' before the frame is turned into a
  # column-name -> list-of-values mapping.
  columns = df[belongs_to_client].fillna('').to_dict("list")
  rows = tf.data.Dataset.from_tensor_slices(columns)
  shuffled = rows.shuffle(SHUFFLE_BUFFER)
  batched = shuffled.batch(1)
  return batched.repeat(NUM_EPOCHS)

# Wrap the per-client dataset builder in a TFF ClientData over the training ids.
train_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
        client_ids=train_client_ids,
        serializable_dataset_fn=create_tf_dataset_for_client_fn
    )

# Materialise the dataset of the first training client as a sanity check.
example_dataset = train_data.create_tf_dataset_for_client(
        train_data.client_ids[0]
    )

print(type(example_dataset))
# Use the built-in next() rather than the iterator's Python-2 style .next()
# method, which is not part of the Python 3 iterator protocol.
example_element = next(iter(example_dataset))
print(example_element)

<class 'tensorflow.python.data.ops.dataset_ops.RepeatDataset'>
{'age': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([59], dtype=int32)>, 'job': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'retired'], dtype=object)>, 'marital': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'married'], dtype=object)>, 'education': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'tertiary'], dtype=object)>, 'default': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'no'], dtype=object)>, 'balance': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([135], dtype=int32)>, 'housing': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'yes'], dtype=object)>, 'loan': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'no'], dtype=object)>, 'contact': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'unknown'], dtype=object)>, 'day': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([17], dtype=int32)>, 'month': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'nov'], dtype=object

2023-03-15 11:19:31.766689: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-15 11:19:31.766954: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-15 11:19:31.766973: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dhiraj-Inspiron-5593): /proc/driver/nvidia/version does not exist
2023-03-15 11:19:31.768399: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
# Show only the tensors of the sampled element, without their feature names.
example_element.values()

dict_values([<tf.Tensor: shape=(1,), dtype=int32, numpy=array([59], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'retired'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'married'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'tertiary'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'no'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([135], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'yes'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'no'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'unknown'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([17], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'nov'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([94], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, <tf.Tensor: sh

In [26]:
# Inspect the column -> list-of-values mapping built for the rows whose
# ClientID is 1 (the same conversion the dataset builder applies per client).
client_data = df.loc[df[client_id_colname] == 1]
client_data.fillna(value='').to_dict(orient="list")

{'age': [44],
 'job': ['technician'],
 'marital': ['single'],
 'education': ['secondary'],
 'default': ['no'],
 'balance': [29],
 'housing': ['yes'],
 'loan': ['no'],
 'contact': ['unknown'],
 'day': [5],
 'month': ['may'],
 'duration': [151],
 'campaign': [1],
 'pdays': [-1],
 'previous': [0],
 'poutcome': ['unknown'],
 'y': ['no'],
 'ClientID': [1]}