# 计算训练集图像均值&方差

$$
\text{Average Mean} = \frac{{\mu_1 + \mu_2 + \ldots + \mu_N}}{N}
$$

$$
\text{Average Variance} = \frac{{\sigma_1^2 + \sigma_2^2 + \ldots + \sigma_N^2}}{N}
$$

$$
\text{Overall Standard Deviation} = \sqrt{\text{Average Variance}}
$$


In [1]:
from PIL import Image
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


# Root locations are read from environment variables; each value is None
# when its variable is not set.
ROOT_DATA = os.getenv('LARD_DATA_ROOT_PATH')
ROOT_PROJECT = os.getenv('LARD_YOLO_ROOT_PATH')

# Echo both values, one per line, so the run shows which roots were picked up.
print(ROOT_DATA, ROOT_PROJECT, sep='\n')


def compute_mean_std(image_path):
    """Compute the per-channel mean and standard deviation of one image.

    The image is decoded and converted to 3-channel RGB before the
    statistics are taken, so the result always has one entry per colour
    channel regardless of the file's native mode (grayscale, CMYK, ...).

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(mean, std)`` tuple of NumPy arrays of shape ``(3,)`` holding the
        per-channel mean and standard deviation on the 8-bit (0-255) pixel
        scale.
    """
    # The context manager releases the file handle as soon as the pixels
    # have been copied out, instead of leaving that to garbage collection.
    with Image.open(image_path) as img:
        # Force RGB so callers accumulating into length-3 arrays neither fail
        # (4-channel input) nor silently broadcast a scalar (grayscale input).
        pixels = np.asarray(img.convert('RGB'))
    # Reduce over height and width, keeping the channel axis.
    channel_mean = pixels.mean(axis=(0, 1))
    channel_std = pixels.std(axis=(0, 1))
    return channel_mean, channel_std


# Fail early with a clear message rather than silently building a path that
# starts with the literal text "None".
if ROOT_DATA is None:
    raise RuntimeError("Environment variable LARD_DATA_ROOT_PATH is not set")

# Per-channel running sums over all images.
total_mean = np.zeros(3)
total_std = np.zeros(3)  # holds the SUM OF VARIANCES until the final sqrt
total_images = 0

# Image directory: the "train" part produced by train_val_split.
path_train_images = f'{ROOT_DATA}/YoloFormat/train/images'
# List the directory once and reuse the result for both the count and the
# JPEG filter below.
dir_entries = os.listdir(path_train_images)
print(f"路径({path_train_images})下包括文件数量: {len(dir_entries)}")

# Collect the paths of all JPEG files.
jpeg_files = [os.path.join(path_train_images, f)
              for f in dir_entries if f.endswith('.jpeg')]
if not jpeg_files:
    # Without this guard the division by total_images == 0 below would
    # produce NaN statistics instead of an actionable error.
    raise ValueError(f"No .jpeg files found in {path_train_images}")

print("开始计算")
# Use a thread pool to speed up the per-image computation.
with ThreadPoolExecutor() as executor:
    for mean, std in executor.map(compute_mean_std, jpeg_files):
        total_mean += mean
        total_std += std ** 2  # accumulate the per-image variance
        total_images += 1

# Average the per-image statistics.
# NOTE: the result is sqrt(mean of per-image variances), as in the formulas
# at the top of the notebook. It is not the standard deviation of all pixels
# pooled together, because the spread between per-image means is not included.
total_mean /= total_images
total_std = np.sqrt(total_std / total_images)  # variance sum -> std

print(f"{total_mean=}")
print(f"{total_std=}")

路径(/home/yeli/yeli/data/lard/YoloFormat/train/images)下包括文件数量: 11546
开始计算
total_mean=array([122.00711516, 141.11828193, 164.56574534])
total_std=array([46.91310377, 54.8164231 , 70.38650678])


In [2]:
# train_all: computation took 14m 5.8s
# total_mean=array([122.02546603, 141.12094067, 164.56260058])
# total_std=array([46.92374909, 54.82440862, 70.39951964])

# train: computation took 9m 34.2s
# total_mean=array([122.00711516, 141.11828193, 164.56574534])
# total_std=array([46.91310377, 54.8164231 , 70.38650678])
