In [1]:
import tensorflow as tf
import numpy as np
import json
import os
import random
from scipy.stats import dirichlet

2025-05-19 15:15:14.481771: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-19 15:15:15.062010: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-19 15:15:16.301724: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-19 15:15:16.301825: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-19 15:15:16.303355: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
def preprocess_image(image):
    # 保持numpy数组格式，只进行归一化
    return image / 255.0

def generate_data(x, y, num_clients, alpha):
    num_classes = 10
    # 使用字典存储每个客户端的数据，但保持numpy数组格式
    client_data = {str(i): {'x': [], 'y': []} for i in range(num_clients)}
    
    # 为每个类别生成分布
    for k in range(num_classes):
        # 获取当前类别的所有样本
        idx_k = [i for i, label in enumerate(y) if label == k]
        random.shuffle(idx_k)
        
        # 使用Dirichlet分布生成每个客户端的样本比例
        proportions = dirichlet.rvs(alpha * np.ones(num_clients))[0]
        
        # 计算每个客户端应该获得的样本数量
        proportions = [int(p * len(idx_k)) for p in proportions]
        
        # 确保所有样本都被分配
        # proportions[-1] = len(idx_k) - sum(proportions[:-1])
        
        # 分配样本给各个客户端
        start_idx = 0
        for i in range(num_clients):
            end_idx = start_idx + proportions[i]
            if end_idx > start_idx:  # 确保有样本可分配
                # 保持numpy数组格式
                client_data[str(i)]['x'].append(x[idx_k[start_idx:end_idx]])
                client_data[str(i)]['y'].append(y[idx_k[start_idx:end_idx]])
            start_idx = end_idx
    
    # 将每个客户端的数据转换为numpy数组
    for client_id in client_data:
        if client_data[client_id]['x']:  # 确保有数据
            client_data[client_id]['x'] = np.concatenate(client_data[client_id]['x'], axis=0)
            client_data[client_id]['y'] = np.concatenate(client_data[client_id]['y'], axis=0)
    
    return client_data

def get_cluster_id(labels):
    counts = np.bincount(labels, minlength=10)
    return int(np.argmax(counts))

# 修改输出格式，确保numpy数组被正确序列化
def numpy_to_list(data):
    if isinstance(data, np.ndarray):
        return data.tolist()
    elif isinstance(data, dict):
        return {k: numpy_to_list(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [numpy_to_list(item) for item in data]
    return data

In [3]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)

# 使用全部数据
train_size = int(len(x_train) * 1.0)
test_size = int(len(x_test) * 1.0)

# train_size = int(len(x_train) * 0.2)
# test_size = int(len(x_test) * 0.2)

print(train_size)
print(test_size)

50000
10000


In [4]:
# 随机选择数据
train_indices = np.random.choice(len(x_train), train_size, replace=False)
test_indices = np.random.choice(len(x_test), test_size, replace=False)

x_train = x_train[train_indices]
y_train = y_train[train_indices]
x_test = x_test[test_indices]
y_test = y_test[test_indices]

# 数据预处理并保持numpy数组格式
x_train = np.array([preprocess_image(img) for img in x_train])
x_test = np.array([preprocess_image(img) for img in x_test])
y_train = np.array(y_train)
y_test = np.array(y_test)

# 生成训练和测试数据
num_clients = 100
alpha = 100  # Dirichlet分布的参数，越小越non-IID

train_data = generate_data(x_train, y_train, num_clients, alpha)
test_data = generate_data(x_test, y_test, num_clients, alpha)

In [5]:
# 为每个客户端生成cluster_id
cluster_ids = {client_id: get_cluster_id(data['y']) 
                for client_id, data in train_data.items()}

# 保存数据前转换numpy数组为列表
train_output = {
    'user_data': numpy_to_list(train_data),
    'cluster_ids': list(cluster_ids.values()),
    'users': list(train_data.keys())
}

test_output = {
    'user_data': numpy_to_list(test_data),
    'cluster_ids': list(cluster_ids.values()),
    'users': list(train_data.keys())
}

In [6]:
# 打印统计信息
print(f"Number of clients: {num_clients}")
print(f"Average training samples per client: {np.mean([len(data['x']) for data in train_data.values()])}")
print(f"Average test samples per client: {np.mean([len(data['x']) for data in test_data.values()])}")
print(f"Cluster distribution: {np.bincount(list(cluster_ids.values()), minlength=10)}")
    
# 打印每个客户端的数据量
print("\n每个客户端的数据量:")
for client_id in sorted(train_data.keys()):
    train_samples = len(train_data[client_id]['x'])
    test_samples = len(test_data[client_id]['x'])
    print(f"客户端 {client_id}: 训练集 {train_samples} 样本, 测试集 {test_samples} 样本")

Number of clients: 100
Average training samples per client: 495.02
Average test samples per client: 95.06
Cluster distribution: [12 11  9 11  8 11 13 10  7  8]

每个客户端的数据量:
客户端 0: 训练集 480 样本, 测试集 92 样本
客户端 1: 训练集 492 样本, 测试集 92 样本
客户端 10: 训练集 535 样本, 测试集 89 样本
客户端 11: 训练集 496 样本, 测试集 91 样本
客户端 12: 训练集 472 样本, 测试集 97 样本
客户端 13: 训练集 462 样本, 测试集 91 样本
客户端 14: 训练集 476 样本, 测试集 96 样本
客户端 15: 训练集 494 样本, 测试集 94 样本
客户端 16: 训练集 482 样本, 测试集 93 样本
客户端 17: 训练集 485 样本, 测试集 102 样本
客户端 18: 训练集 499 样本, 测试集 92 样本
客户端 19: 训练集 484 样本, 测试集 98 样本
客户端 2: 训练集 485 样本, 测试集 96 样本
客户端 20: 训练集 488 样本, 测试集 95 样本
客户端 21: 训练集 509 样本, 测试集 94 样本
客户端 22: 训练集 518 样本, 测试集 95 样本
客户端 23: 训练集 512 样本, 测试集 97 样本
客户端 24: 训练集 467 样本, 测试集 93 样本
客户端 25: 训练集 491 样本, 测试集 94 样本
客户端 26: 训练集 473 样本, 测试集 99 样本
客户端 27: 训练集 499 样本, 测试集 94 样本
客户端 28: 训练集 465 样本, 测试集 95 样本
客户端 29: 训练集 516 样本, 测试集 95 样本
客户端 3: 训练集 488 样本, 测试集 98 样本
客户端 30: 训练集 513 样本, 测试集 91 样本
客户端 31: 训练集 515 样本, 测试集 91 样本
客户端 32: 训练集 485 样本, 测试集 88 样本
客户端 33: 训练集 494 样本, 测

In [7]:
# 创建输出目录
os.makedirs('/root/learning-tangle/leaf/data/cifar10/data/train', exist_ok=True)
os.makedirs('/root/learning-tangle/leaf/data/cifar10/data/test', exist_ok=True)

# 保存数据
with open('/root/learning-tangle/leaf/data/cifar10/data/train/data.json', 'w') as file:
    json.dump(train_output, file)
with open('/root/learning-tangle/leaf/data/cifar10/data/test/data.json', 'w') as file:
    json.dump(test_output, file)